-rw-r--r--  .gitignore                           |    6
-rw-r--r--  build/make/Android.mk                |   28
-rw-r--r--  build/make/Makefile                  |   10
-rw-r--r--  build/make/configure.sh              |    6
-rwxr-xr-x  build/make/gen_msvs_vcxproj.sh       |    2
-rwxr-xr-x  configure                            |    6
-rw-r--r--  examples/vp9_spatial_svc_encoder.c   |   34
-rw-r--r--  libs.mk                              |    4
-rw-r--r--  test/invalid_file_test.cc            |    1
-rw-r--r--  test/partial_idct_test.cc            |  148
-rw-r--r--  test/pp_filter_test.cc               |   31
-rw-r--r--  test/test-data.mk                    |    2
-rw-r--r--  test/test-data.sha1                  |    2
-rw-r--r--  test/test_intra_pred_speed.cc        |  520
-rw-r--r--  test/vp9_ethread_test.cc             |    3
-rw-r--r--  test/vp9_intrapred_test.cc           |  405
-rw-r--r--  tools.mk                             |  110
-rw-r--r--  tools/tiny_ssim.c                    |  200
-rw-r--r--  vp9/decoder/vp9_decodeframe.c        |    3
-rw-r--r--  vp9/encoder/vp9_encoder.c            |    6
-rw-r--r--  vp9/encoder/vp9_firstpass.c          |  142
-rw-r--r--  vpx/src/svc_encodeframe.c            |   59
-rw-r--r--  vpx/svc_context.h                    |    2
-rw-r--r--  vpx_dsp/arm/idct32x32_1_add_neon.c   |    4
-rw-r--r--  vpx_dsp/arm/idct32x32_34_add_neon.c  |  519
-rw-r--r--  vpx_dsp/arm/idct4x4_add_neon.asm     |    4
-rw-r--r--  vpx_dsp/arm/idct4x4_add_neon.c       |    5
-rw-r--r--  vpx_dsp/arm/idct8x8_add_neon.asm     |   18
-rw-r--r--  vpx_dsp/arm/idct8x8_add_neon.c       |   33
-rw-r--r--  vpx_dsp/arm/idct_neon.asm            |   29
-rw-r--r--  vpx_dsp/arm/idct_neon.h              |  172
-rw-r--r--  vpx_dsp/arm/intrapred_neon.c         | 1051
-rw-r--r--  vpx_dsp/vpx_dsp.mk                   |   11
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl         |    8
-rw-r--r--  vpxdec.c                             |    6
35 files changed, 2530 insertions(+), 1060 deletions(-)
diff --git a/.gitignore b/.gitignore
index 4337a2c32..bf5ffc713 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,8 +39,7 @@
/examples/vp8cx_set_ref
/examples/vp9cx_set_ref
/examples/vp9_lossless_encoder
-/examples/vp9_spatial_scalable_encoder
-/examples/vpx_temporal_scalable_patterns
+/examples/vp9_spatial_svc_encoder
/examples/vpx_temporal_svc_encoder
/ivfdec
/ivfdec.dox
@@ -51,6 +50,9 @@
/samples.dox
/test_intra_pred_speed
/test_libvpx
+/tools.dox
+/tools/*.dox
+/tools/tiny_ssim
/vp8_api1_migration.dox
/vp[89x]_rtcd.h
/vpx.pc
diff --git a/build/make/Android.mk b/build/make/Android.mk
index 36120170e..09bdc5d2f 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -71,7 +71,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
include $(CONFIG_DIR)libs-armv7-android-gcc.mk
LOCAL_ARM_MODE := arm
else ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
- include $(CONFIG_DIR)libs-armv8-android-gcc.mk
+ include $(CONFIG_DIR)libs-arm64-android-gcc.mk
LOCAL_ARM_MODE := arm
else ifeq ($(TARGET_ARCH_ABI),x86)
include $(CONFIG_DIR)libs-x86-android-gcc.mk
@@ -101,8 +101,8 @@ LOCAL_CFLAGS := -O3
# like x86inc.asm and x86_abi_support.asm
LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)
-.PRECIOUS: %.asm.s
-$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm
+.PRECIOUS: %.asm.S
+$(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm
@mkdir -p $(dir $@)
@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
@@ -132,7 +132,7 @@ endif
# Pull out assembly files, splitting NEON from the rest. This is
# done to specify that the NEON assembly files use NEON assembler flags.
-# x86 assembly matches %.asm, arm matches %.asm.s
+# x86 assembly matches %.asm, arm matches %.asm.S
# x86:
@@ -140,12 +140,12 @@ CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE))
LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file))
# arm:
-CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
+CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.S, $(CODEC_SRCS_UNIQUE))
CODEC_SRCS_ASM_ARM = $(foreach v, \
$(CODEC_SRCS_ASM_ARM_ALL), \
$(if $(findstring neon,$(v)),,$(v)))
-CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
- $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \
+ $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
$(CODEC_SRCS_ASM_ARM))
LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)
@@ -153,18 +153,19 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
CODEC_SRCS_ASM_NEON = $(foreach v, \
$(CODEC_SRCS_ASM_ARM_ALL),\
$(if $(findstring neon,$(v)),$(v),))
- CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
- $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+ CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \
+ $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
$(CODEC_SRCS_ASM_NEON))
- LOCAL_SRC_FILES += $(patsubst %.s, \
- %.s.neon, \
+ LOCAL_SRC_FILES += $(patsubst %.S, \
+ %.S.neon, \
$(CODEC_SRCS_ASM_NEON_ADS2GAS))
endif
LOCAL_CFLAGS += \
-DHAVE_CONFIG_H=vpx_config.h \
-I$(LIBVPX_PATH) \
- -I$(ASM_CNV_PATH)
+ -I$(ASM_CNV_PATH) \
+ -I$(ASM_CNV_PATH)/libvpx
LOCAL_MODULE := libvpx
@@ -185,7 +186,8 @@ endif
$$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
$$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h
-ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),)
+rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a
+ifneq ($(findstring $(TARGET_ARCH_ABI),$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
$$(rtcd_dep_template_SRCS): vpx_config.asm
endif
endef
diff --git a/build/make/Makefile b/build/make/Makefile
index 469eb74c3..cba605786 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -90,7 +90,7 @@ all:
.PHONY: clean
clean::
- rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.s.o=.asm.s)
+ rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.S.o=.asm.S)
rm -f $(CLEAN-OBJS)
.PHONY: clean
@@ -180,13 +180,13 @@ $(BUILD_PFX)%.asm.o: %.asm
$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(AS) $(ASFLAGS) -o $@ $<
-$(BUILD_PFX)%.s.d: %.s
+$(BUILD_PFX)%.S.d: %.S
$(if $(quiet),@echo " [DEP] $@")
$(qexec)mkdir -p $(dir $@)
$(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \
--build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@
-$(BUILD_PFX)%.s.o: %.s
+$(BUILD_PFX)%.S.o: %.S
$(if $(quiet),@echo " [AS] $@")
$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(AS) $(ASFLAGS) -o $@ $<
@@ -198,8 +198,8 @@ $(BUILD_PFX)%.c.S: %.c
$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
-.PRECIOUS: %.asm.s
-$(BUILD_PFX)%.asm.s: %.asm
+.PRECIOUS: %.asm.S
+$(BUILD_PFX)%.asm.S: %.asm
$(if $(quiet),@echo " [ASM CONVERSION] $@")
$(qexec)mkdir -p $(dir $@)
$(qexec)$(ASM_CONVERSION) <$< >$@
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 35609e89a..f050fa06a 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -635,7 +635,7 @@ setup_gnu_toolchain() {
AS=${AS:-${CROSS}as}
STRIP=${STRIP:-${CROSS}strip}
NM=${NM:-${CROSS}nm}
- AS_SFX=.s
+ AS_SFX=.S
EXE_SFX=
}
@@ -926,7 +926,7 @@ EOF
;;
vs*)
asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
- AS_SFX=.s
+ AS_SFX=.S
msvs_arch_dir=arm-msvs
disable_feature multithread
disable_feature unit_tests
@@ -1034,7 +1034,7 @@ EOF
STRIP="$(${XCRUN_FIND} strip)"
NM="$(${XCRUN_FIND} nm)"
RANLIB="$(${XCRUN_FIND} ranlib)"
- AS_SFX=.s
+ AS_SFX=.S
LD="${CXX:-$(${XCRUN_FIND} ld)}"
# ASFLAGS is written here instead of using check_add_asflags
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index e98611d10..e3395afa2 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -203,7 +203,7 @@ for opt in "$@"; do
# The paths in file_list are fixed outside of the loop.
file_list[${#file_list[@]}]="$opt"
case "$opt" in
- *.asm|*.s) uses_asm=true
+ *.asm|*.[Ss]) uses_asm=true
;;
esac
;;
diff --git a/configure b/configure
index 7065dfef5..fb732acf3 100755
--- a/configure
+++ b/configure
@@ -22,6 +22,7 @@ show_help(){
Advanced options:
${toggle_libs} libraries
${toggle_examples} examples
+ ${toggle_tools} tools
${toggle_docs} documentation
${toggle_unit_tests} unit tests
${toggle_decode_perf_tests} build decoder perf tests with unit tests
@@ -155,7 +156,7 @@ all_platforms="${all_platforms} generic-gnu"
# all_targets is a list of all targets that can be configured
# note that these should be in dependency order for now.
-all_targets="libs examples docs"
+all_targets="libs examples tools docs"
# all targets available are enabled, by default.
for t in ${all_targets}; do
@@ -331,6 +332,7 @@ CMDLINE_SELECT="
libs
examples
+ tools
docs
libc
as
@@ -476,7 +478,7 @@ EOF
#
# Write makefiles for all enabled targets
#
- for tgt in libs examples docs solution; do
+ for tgt in libs examples tools docs solution; do
tgt_fn="$tgt-$toolchain.mk"
if enabled $tgt; then
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index cecdce080..fa2df7271 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -84,6 +84,8 @@ static const arg_def_t speed_arg =
ARG_DEF("sp", "speed", 1, "speed configuration");
static const arg_def_t aqmode_arg =
ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
+static const arg_def_t bitrates_arg =
+ ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]");
#if CONFIG_VP9_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
@@ -124,6 +126,7 @@ static const arg_def_t *svc_args[] = { &frames_arg,
#endif
&speed_arg,
&rc_end_usage_arg,
+ &bitrates_arg,
NULL };
static const uint32_t default_frames_to_skip = 0;
@@ -250,6 +253,9 @@ static void parse_command_line(int argc, const char **argv_,
} else if (arg_match(&arg, &scale_factors_arg, argi)) {
snprintf(string_options, sizeof(string_options), "%s scale-factors=%s",
string_options, arg.val);
+ } else if (arg_match(&arg, &bitrates_arg, argi)) {
+ snprintf(string_options, sizeof(string_options), "%s bitrates=%s",
+ string_options, arg.val);
} else if (arg_match(&arg, &passes_arg, argi)) {
passes = arg_parse_uint(&arg);
if (passes < 1 || passes > 2) {
@@ -417,7 +423,6 @@ static void set_rate_control_stats(struct RateControlStats *rc,
for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
const int layer = sl * cfg->ts_number_layers + tl;
- const int tlayer0 = sl * cfg->ts_number_layers;
if (cfg->ts_number_layers == 1)
rc->layer_framerate[layer] = framerate;
else
@@ -428,8 +433,8 @@ static void set_rate_control_stats(struct RateControlStats *rc,
cfg->layer_target_bitrate[layer - 1]) /
(rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
} else {
- rc->layer_pfb[tlayer0] = 1000.0 * cfg->layer_target_bitrate[tlayer0] /
- rc->layer_framerate[tlayer0];
+ rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
+ rc->layer_framerate[layer];
}
rc->layer_input_frames[layer] = 0;
rc->layer_enc_frames[layer] = 0;
@@ -449,12 +454,13 @@ static void printout_rate_control_summary(struct RateControlStats *rc,
vpx_codec_enc_cfg_t *cfg,
int frame_cnt) {
unsigned int sl, tl;
- int tot_num_frames = 0;
double perc_fluctuation = 0.0;
+ int tot_num_frames = 0;
printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
printf("Rate control layer stats for sl%d tl%d layer(s):\n\n",
cfg->ss_number_layers, cfg->ts_number_layers);
for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+ tot_num_frames = 0;
for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
const int layer = sl * cfg->ts_number_layers + tl;
const int num_dropped =
@@ -462,7 +468,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc,
? (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer])
: (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] -
1);
- if (!sl) tot_num_frames += rc->layer_input_frames[layer];
+ tot_num_frames += rc->layer_input_frames[layer];
rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] *
rc->layer_encoding_bitrate[layer] /
tot_num_frames;
@@ -620,7 +626,7 @@ int main(int argc, const char **argv) {
struct RateControlStats rc;
vpx_svc_layer_id_t layer_id;
vpx_svc_ref_frame_config_t ref_frame_config;
- int sl, tl;
+ unsigned int sl, tl;
double sum_bitrate = 0.0;
double sum_bitrate2 = 0.0;
double framerate = 30.0;
@@ -695,6 +701,8 @@ int main(int argc, const char **argv) {
vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+ if (svc_ctx.speed >= 5)
+ vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
// Encode frames
while (!end_of_stream) {
@@ -730,7 +738,7 @@ int main(int argc, const char **argv) {
&ref_frame_config);
// Keep track of input frames, to account for frame drops in rate control
// stats/metrics.
- for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+ for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) {
++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
layer_id.temporal_layer_id];
}
@@ -793,7 +801,7 @@ int main(int argc, const char **argv) {
rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
// Keep count of rate control stats per layer, for non-key
// frames.
- if (tl == layer_id.temporal_layer_id &&
+ if (tl == (unsigned int)layer_id.temporal_layer_id &&
!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
rc.layer_avg_rate_mismatch[layer] +=
@@ -807,7 +815,7 @@ int main(int argc, const char **argv) {
// Update for short-time encoding bitrate states, for moving
// window of size rc->window, shifted by rc->window / 2.
// Ignore first window segment, due to key frame.
- if (frame_cnt > rc.window_size) {
+ if (frame_cnt > (unsigned int)rc.window_size) {
tl = layer_id.temporal_layer_id;
for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
@@ -823,13 +831,14 @@ int main(int argc, const char **argv) {
}
// Second shifted window.
- if (frame_cnt > rc.window_size + rc.window_size / 2) {
+ if (frame_cnt >
+ (unsigned int)(rc.window_size + rc.window_size / 2)) {
tl = layer_id.temporal_layer_id;
for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
}
- if (frame_cnt > 2 * rc.window_size &&
+ if (frame_cnt > (unsigned int)(2 * rc.window_size) &&
frame_cnt % rc.window_size == 0) {
rc.window_count += 1;
rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
@@ -842,10 +851,11 @@ int main(int argc, const char **argv) {
}
#endif
}
-
+ /*
printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
!!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
(int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+ */
if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
++frames_received;
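Note on the set_rate_control_stats() fix above: indexing layer_pfb with tlayer0 (= sl * ts_number_layers) made every temporal layer of a spatial layer write the same slot, leaving the slots for tl > 0 uninitialized. A minimal sketch of the corrected per-frame-bandwidth bookkeeping, assuming the layer layout used by the example (sl * num_tl + tl); the surrounding rate-control state is omitted:

    /* layer_pfb[layer]: target bits per frame for each (spatial, temporal)
     * layer pair. Each (sl, tl) pair gets its own slot. */
    for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
      for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
        const int layer = sl * cfg->ts_number_layers + tl;
        if (tl > 0) {
          /* Higher temporal layers carry the bitrate/framerate delta. */
          rc->layer_pfb[layer] =
              1000.0 * (cfg->layer_target_bitrate[layer] -
                        cfg->layer_target_bitrate[layer - 1]) /
              (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
        } else {
          /* Base temporal layer: kilobits/s -> bits per frame. */
          rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
                                 rc->layer_framerate[layer];
        }
      }
    }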
diff --git a/libs.mk b/libs.mk
index 6e12b5404..f4f48cc16 100644
--- a/libs.mk
+++ b/libs.mk
@@ -12,7 +12,7 @@
# ARM assembly files are written in RVCT-style. We use some make magic to
# filter those files to allow GCC compilation
ifeq ($(ARCH_ARM),yes)
- ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.s,.asm)
+ ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm)
else
ASM:=.asm
endif
@@ -366,7 +366,7 @@ endif
#
# Add assembler dependencies for configuration.
#
-$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+$(filter %.S.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 12eaa80e7..2921e5ddf 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -188,6 +188,7 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
"invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf" },
{ 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" },
{ 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" },
+ { 2, "invalid-crbug-629481.webm" },
};
INSTANTIATE_TEST_CASE_P(
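The new vector exercises the decoder's error paths: the harness expects decode to fail gracefully and checks the result against the .res file added in test-data.mk below. A stand-alone sketch of the underlying check using only the public decoder API; the helper name is illustrative, file I/O and IVF parsing are omitted, and buf/len are assumed to hold one corrupt frame:

    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    /* Sketch: a corrupt frame should yield a decode error, never a crash. */
    static int decodes_cleanly(const uint8_t *buf, size_t len) {
      vpx_codec_ctx_t codec;
      vpx_codec_err_t res;
      if (vpx_codec_dec_init(&codec, vpx_codec_vp9_dx(), NULL, 0)) return 0;
      res = vpx_codec_decode(&codec, buf, (unsigned int)len, NULL, 0);
      vpx_codec_destroy(&codec);
      return res == VPX_CODEC_OK;
    }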
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 9eb4d9dbb..0c704c5c8 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -41,13 +41,42 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
partial_itxfm_ = GET_PARAM(2);
tx_size_ = GET_PARAM(3);
last_nonzero_ = GET_PARAM(4);
+
+ switch (tx_size_) {
+ case TX_4X4: size_ = 4; break;
+ case TX_8X8: size_ = 8; break;
+ case TX_16X16: size_ = 16; break;
+ case TX_32X32: size_ = 32; break;
+ default: FAIL() << "Wrong Size!"; break;
+ }
+ block_size_ = size_ * size_;
+
+ input_block_ = reinterpret_cast<tran_low_t *>(
+ vpx_memalign(16, sizeof(*input_block_) * block_size_));
+ output_block_ = reinterpret_cast<uint8_t *>(
+ vpx_memalign(16, sizeof(*output_block_) * block_size_));
+ output_block_ref_ = reinterpret_cast<uint8_t *>(
+ vpx_memalign(16, sizeof(*output_block_ref_) * block_size_));
}
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
+ virtual void TearDown() {
+ vpx_free(input_block_);
+ input_block_ = NULL;
+ vpx_free(output_block_);
+ output_block_ = NULL;
+ vpx_free(output_block_ref_);
+ output_block_ref_ = NULL;
+ libvpx_test::ClearSystemState();
+ }
protected:
int last_nonzero_;
TX_SIZE tx_size_;
+ tran_low_t *input_block_;
+ uint8_t *output_block_;
+ uint8_t *output_block_ref_;
+ int size_;
+ int block_size_;
FwdTxfmFunc ftxfm_;
InvTxfmFunc full_itxfm_;
InvTxfmFunc partial_itxfm_;
@@ -55,95 +84,62 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
TEST_P(PartialIDctTest, RunQuantCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- int size;
- switch (tx_size_) {
- case TX_4X4: size = 4; break;
- case TX_8X8: size = 8; break;
- case TX_16X16: size = 16; break;
- case TX_32X32: size = 32; break;
- default: FAIL() << "Wrong Size!"; break;
- }
- DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
const int count_test_block = 1000;
- const int block_size = size * size;
DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
- int max_error = 0;
for (int i = 0; i < count_test_block; ++i) {
// clear out destination buffer
- memset(dst1, 0, sizeof(*dst1) * block_size);
- memset(dst2, 0, sizeof(*dst2) * block_size);
- memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
- memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+ memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+ memset(output_block_, 0, sizeof(*output_block_) * block_size_);
+ memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
if (i == 0) {
- for (int j = 0; j < block_size; ++j) input_extreme_block[j] = 255;
+ for (int j = 0; j < block_size_; ++j) input_extreme_block[j] = 255;
} else if (i == 1) {
- for (int j = 0; j < block_size; ++j) input_extreme_block[j] = -255;
+ for (int j = 0; j < block_size_; ++j) input_extreme_block[j] = -255;
} else {
- for (int j = 0; j < block_size; ++j) {
+ for (int j = 0; j < block_size_; ++j) {
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
}
- ftxfm_(input_extreme_block, output_ref_block, size);
+ ftxfm_(input_extreme_block, output_ref_block, size_);
// quantization with maximum allowed step sizes
- test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+ input_block_[0] = (output_ref_block[0] / 1336) * 1336;
for (int j = 1; j < last_nonzero_; ++j) {
- test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] =
+ input_block_[vp9_default_scan_orders[tx_size_].scan[j]] =
(output_ref_block[j] / 1828) * 1828;
}
}
- ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
- ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+ ASM_REGISTER_STATE_CHECK(
+ full_itxfm_(input_block_, output_block_ref_, size_));
+ ASM_REGISTER_STATE_CHECK(
+ partial_itxfm_(input_block_, output_block_, size_));
- for (int j = 0; j < block_size; ++j) {
- const int diff = dst1[j] - dst2[j];
- const int error = diff * diff;
- if (max_error < error) max_error = error;
- }
+ ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+ sizeof(*output_block_) * block_size_))
+ << "Error: partial inverse transform produces different results";
}
-
- EXPECT_EQ(0, max_error)
- << "Error: partial inverse transform produces different results";
}
TEST_P(PartialIDctTest, ResultsMatch) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- int size;
- switch (tx_size_) {
- case TX_4X4: size = 4; break;
- case TX_8X8: size = 8; break;
- case TX_16X16: size = 16; break;
- case TX_32X32: size = 32; break;
- default: FAIL() << "Wrong Size!"; break;
- }
- DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
const int count_test_block = 1000;
const int max_coeff = 32766 / 4;
- const int block_size = size * size;
- int max_error = 0;
for (int i = 0; i < count_test_block; ++i) {
// clear out destination buffer
- memset(dst1, 0, sizeof(*dst1) * block_size);
- memset(dst2, 0, sizeof(*dst2) * block_size);
- memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
- memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+ memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+ memset(output_block_, 0, sizeof(*output_block_) * block_size_);
+ memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);
int max_energy_leftover = max_coeff * max_coeff;
for (int j = 0; j < last_nonzero_; ++j) {
int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
@@ -153,24 +149,42 @@ TEST_P(PartialIDctTest, ResultsMatch) {
max_energy_leftover = 0;
coef = 0;
}
- test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] = coef;
+ input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coef;
}
- memcpy(test_coef_block2, test_coef_block1,
- sizeof(*test_coef_block2) * block_size);
+ ASM_REGISTER_STATE_CHECK(
+ full_itxfm_(input_block_, output_block_ref_, size_));
+ ASM_REGISTER_STATE_CHECK(
+ partial_itxfm_(input_block_, output_block_, size_));
- ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
- ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+ ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+ sizeof(*output_block_) * block_size_))
+ << "Error: partial inverse transform produces different results";
+ }
+}
- for (int j = 0; j < block_size; ++j) {
- const int diff = dst1[j] - dst2[j];
- const int error = diff * diff;
- if (max_error < error) max_error = error;
+TEST_P(PartialIDctTest, AddOutputBlock) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 10;
+ for (int i = 0; i < count_test_block; ++i) {
+ memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+ for (int j = 0; j < last_nonzero_; ++j) {
+ input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = 10;
}
- }
- EXPECT_EQ(0, max_error)
- << "Error: partial inverse transform produces different results";
+ for (int j = 0; j < block_size_; ++j) {
+ output_block_[j] = output_block_ref_[j] = rnd.Rand8();
+ }
+
+ ASM_REGISTER_STATE_CHECK(
+ full_itxfm_(input_block_, output_block_ref_, size_));
+ ASM_REGISTER_STATE_CHECK(
+ partial_itxfm_(input_block_, output_block_, size_));
+
+ ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+ sizeof(*output_block_) * block_size_))
+ << "Error: Transform results are not correctly added to output.";
+ }
}
using std::tr1::make_tuple;
@@ -214,7 +228,7 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
&vpx_idct4x4_1_add_neon, TX_4X4, 1)));
#else // !CONFIG_VP9_HIGHBITDEPTH
-// 32x32_34_ 32x32_135_ are implemented using the 1024 version.
+// 32x32_135_ is implemented using the 1024 version.
INSTANTIATE_TEST_CASE_P(
NEON, PartialIDctTest,
::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
@@ -222,7 +236,7 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
&vpx_idct32x32_1024_add_neon, TX_32X32, 135),
make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
- &vpx_idct32x32_1024_add_neon, TX_32X32, 34),
+ &vpx_idct32x32_34_add_neon, TX_32X32, 34),
make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
&vpx_idct32x32_1_add_neon, TX_32X32, 1),
make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c,
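The restructured test above moves the buffers into the fixture (allocated with vpx_memalign() in SetUp(), released in TearDown()) and tightens the check from "small max squared error" to bit-exactness via memcmp(). A hedged stand-alone sketch of that comparison pattern, assuming 8-bit output, contiguous blocks (stride == size), and the (coeffs, dst, stride) signature the idct functions use; the helper name is illustrative:

    #include <string.h>
    #include "vpx/vpx_integer.h"
    #include "vpx_dsp/vpx_dsp_common.h"  /* tran_low_t */
    #include "vpx_mem/vpx_mem.h"

    typedef void (*InvTxfm)(const tran_low_t *input, uint8_t *dest,
                            int stride);

    /* Sketch: both transforms must produce identical reconstructions. */
    static int transforms_match(const tran_low_t *coeffs, int size,
                                InvTxfm full, InvTxfm partial) {
      const int block_size = size * size;
      uint8_t *ref = (uint8_t *)vpx_memalign(16, block_size);
      uint8_t *out = (uint8_t *)vpx_memalign(16, block_size);
      int match;
      memset(ref, 0, block_size);
      memset(out, 0, block_size);
      full(coeffs, ref, size);
      partial(coeffs, out, size);
      match = (memcmp(ref, out, block_size) == 0);
      vpx_free(ref);
      vpx_free(out);
      return match;
    }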
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 2e34fed06..4f6592647 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -7,22 +7,23 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
-typedef void (*PostProcFunc)(unsigned char *src_ptr, unsigned char *dst_ptr,
- int src_pixels_per_line, int dst_pixels_per_line,
- int cols, unsigned char *flimit, int size);
+typedef void (*VpxPostProcDownAndAcrossMbRowFunc)(
+ unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line,
+ int dst_pixels_per_line, int cols, unsigned char *flimit, int size);
namespace {
-class VPxPostProcessingFilterTest
- : public ::testing::TestWithParam<PostProcFunc> {
+class VpxPostProcDownAndAcrossMbRowTest
+ : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
public:
virtual void TearDown() { libvpx_test::ClearSystemState(); }
};
@@ -30,7 +31,7 @@ class VPxPostProcessingFilterTest
// Test routine for the VPx post-processing function
// vpx_post_proc_down_and_across_mb_row_c.
-TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
// Size of the underlying data block that will be filtered.
const int block_width = 16;
const int block_height = 16;
@@ -78,14 +79,14 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
input_stride, output_stride, block_width,
flimits, 16));
- static const uint8_t expected_data[block_height] = { 4, 3, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 3, 4 };
+ static const uint8_t kExpectedOutput[block_height] = {
+ 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
+ };
pixel_ptr = dst_image_ptr;
for (int i = 0; i < block_height; ++i) {
for (int j = 0; j < block_width; ++j) {
- EXPECT_EQ(expected_data[i], pixel_ptr[j])
- << "VPxPostProcessingFilterTest failed with invalid filter output";
+ ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]);
}
pixel_ptr += output_stride;
}
@@ -96,18 +97,18 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
};
INSTANTIATE_TEST_CASE_P(
- C, VPxPostProcessingFilterTest,
+ C, VpxPostProcDownAndAcrossMbRowTest,
::testing::Values(vpx_post_proc_down_and_across_mb_row_c));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
- SSE2, VPxPostProcessingFilterTest,
+ SSE2, VpxPostProcDownAndAcrossMbRowTest,
::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
#endif
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(
- MSA, VPxPostProcessingFilterTest,
+ MSA, VpxPostProcDownAndAcrossMbRowTest,
::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
#endif
diff --git a/test/test-data.mk b/test/test-data.mk
index 80b802e0a..e528c9182 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -775,6 +775,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.iv
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res
ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
# Encode / Decode test
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index b97ae967e..eda46c918 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -840,3 +840,5 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_
787f04f0483320d536894282f3358a4f8cac1cf9 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
91d3cefd0deb98f3b0caf3a2d900ec7a7605e53a *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf
1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res
+70057835bf29d14e66699ce5f022df2551fb6b37 *invalid-crbug-629481.webm
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-crbug-629481.webm.res
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index a89630753..d8eb82426 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -29,6 +29,8 @@ namespace {
typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left);
+const int kBPS = 32;
+const int kTotalPixels = 32 * kBPS;
const int kNumVp9IntraPredFuncs = 13;
const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = {
"DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED",
@@ -36,107 +38,121 @@ const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = {
"D207_PRED", "D63_PRED", "TM_PRED"
};
+template <typename Pixel>
+struct IntraPredTestMem {
+ void Init(int block_size, int bd) {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ Pixel *const above = above_mem + 16;
+ const int mask = (1 << bd) - 1;
+ for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand16() & mask;
+ for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask;
+ for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask;
+
+ // some code assumes the top row has been extended:
+ // d45/d63 C-code, for instance, but not the assembly.
+ // TODO(jzern): this style of extension isn't strictly necessary.
+ ASSERT_LE(block_size, kBPS);
+ for (int i = block_size; i < 2 * kBPS; ++i) {
+ above[i] = above[block_size - 1];
+ }
+ }
+
+ DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]);
+ DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]);
+ DECLARE_ALIGNED(16, Pixel, left[kBPS]);
+ DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
+};
+
+typedef IntraPredTestMem<uint8_t> Vp9IntraPredTestMem;
+
+void CheckMd5Signature(const char name[], const char *const signatures[],
+ const void *data, size_t data_size, int elapsed_time,
+ int idx) {
+ libvpx_test::MD5 md5;
+ md5.Add(reinterpret_cast<const uint8_t *>(data), data_size);
+ printf("Mode %s[%12s]: %5d ms MD5: %s\n", name, kVp9IntraPredNames[idx],
+ elapsed_time, md5.Get());
+ EXPECT_STREQ(signatures[idx], md5.Get());
+}
+
void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs,
- const char *const pred_func_names[], int num_funcs,
- const char *const signatures[], int block_size,
- int num_pixels_per_test) {
- libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
- const int kBPS = 32;
- const int kTotalPixels = 32 * kBPS;
- DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]);
- DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]);
- DECLARE_ALIGNED(16, uint8_t, left[kBPS]);
- DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]);
- uint8_t *const above = above_mem + 16;
- for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8();
- for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8();
- for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8();
- const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
-
- // some code assumes the top row has been extended:
- // d45/d63 C-code, for instance, but not the assembly.
- // TODO(jzern): this style of extension isn't strictly necessary.
- ASSERT_LE(block_size, kBPS);
- memset(above + block_size, above[block_size - 1], 2 * kBPS - block_size);
-
- for (int k = 0; k < num_funcs; ++k) {
+ const char *const signatures[], int block_size) {
+ const int kNumTests = static_cast<int>(
+ 2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs));
+ Vp9IntraPredTestMem intra_pred_test_mem;
+ const uint8_t *const above = intra_pred_test_mem.above_mem + 16;
+
+ intra_pred_test_mem.Init(block_size, 8);
+
+ for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
if (pred_funcs[k] == NULL) continue;
- memcpy(src, ref_src, sizeof(src));
+ memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+ sizeof(intra_pred_test_mem.src));
vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
- pred_funcs[k](src, kBPS, above, left);
+ pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
+ intra_pred_test_mem.left);
}
libvpx_test::ClearSystemState();
vpx_usec_timer_mark(&timer);
const int elapsed_time =
static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
- libvpx_test::MD5 md5;
- md5.Add(src, sizeof(src));
- printf("Mode %s[%12s]: %5d ms MD5: %s\n", name, pred_func_names[k],
- elapsed_time, md5.Get());
- EXPECT_STREQ(signatures[k], md5.Get());
+ CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
+ sizeof(intra_pred_test_mem.src), elapsed_time, k);
}
}
void TestIntraPred4(VpxPredFunc const *pred_funcs) {
- static const int kNumVp9IntraFuncs = 13;
- static const char *const kSignatures[kNumVp9IntraFuncs] = {
- "4334156168b34ab599d9b5b30f522fe9", "bc4649d5ba47c7ff178d92e475960fb0",
- "8d316e5933326dcac24e1064794b5d12", "a27270fed024eafd762c95de85f4da51",
- "c33dff000d4256c2b8f3bf9e9bab14d2", "44d8cddc2ad8f79b8ed3306051722b4f",
- "eb54839b2bad6699d8946f01ec041cd0", "ecb0d56ae5f677ea45127ce9d5c058e4",
- "0b7936841f6813da818275944895b574", "9117972ef64f91a58ff73e1731c81db2",
- "c56d5e8c729e46825f46dd5d3b5d508a", "c0889e2039bcf7bcb5d2f33cdca69adc",
- "309a618577b27c648f9c5ee45252bc8f",
+ static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+ "e7ed7353c3383fff942e500e9bfe82fe", "2a4a26fcc6ce005eadc08354d196c8a9",
+ "269d92eff86f315d9c38fe7640d85b15", "ae2960eea9f71ee3dabe08b282ec1773",
+ "6c1abcc44e90148998b51acd11144e9c", "f7bb3186e1ef8a2b326037ff898cad8e",
+ "364c1f3fb2f445f935aec2a70a67eaa4", "141624072a4a56773f68fadbdd07c4a7",
+ "7be49b08687a5f24df3a2c612fca3876", "459bb5d9fd5b238348179c9a22108cd6",
+ "73edb8831bf1bdfce21ae8eaa43b1234", "2e2457f2009c701a355a8b25eb74fcda",
+ "52ae4e8bdbe41494c1f43051d4dd7f0b"
};
- TestIntraPred("Intra4", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
- kSignatures, 4, 4 * 4 * kNumVp9IntraFuncs);
+ TestIntraPred("Intra4", pred_funcs, kSignatures, 4);
}
void TestIntraPred8(VpxPredFunc const *pred_funcs) {
- static const int kNumVp9IntraFuncs = 13;
- static const char *const kSignatures[kNumVp9IntraFuncs] = {
- "7694ddeeefed887faf9d339d18850928", "7d726b1213591b99f736be6dec65065b",
- "19c5711281357a485591aaf9c96c0a67", "ba6b66877a089e71cd938e3b8c40caac",
- "802440c93317e0f8ba93fab02ef74265", "9e09a47a15deb0b9d8372824f9805080",
- "b7c2d8c662268c0c427da412d7b0311d", "78339c1c60bb1d67d248ab8c4da08b7f",
- "5c97d70f7d47de1882a6cd86c165c8a9", "8182bf60688b42205acd95e59e967157",
- "08323400005a297f16d7e57e7fe1eaac", "95f7bfc262329a5849eda66d8f7c68ce",
- "815b75c8e0d91cc1ae766dc5d3e445a3",
+ static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+ "d8bbae5d6547cfc17e4f5f44c8730e88", "373bab6d931868d41a601d9d88ce9ac3",
+ "6fdd5ff4ff79656c14747598ca9e3706", "d9661c2811d6a73674f40ffb2b841847",
+ "7c722d10b19ccff0b8c171868e747385", "f81dd986eb2b50f750d3a7da716b7e27",
+ "d500f2c8fc78f46a4c74e4dcf51f14fb", "0e3523f9cab2142dd37fd07ec0760bce",
+ "79ac4efe907f0a0f1885d43066cfedee", "19ecf2432ac305057de3b6578474eec6",
+ "4f985b61acc6dd5d2d2585fa89ea2e2d", "f1bb25a9060dd262f405f15a38f5f674",
+ "209ea00801584829e9a0f7be7d4a74ba"
};
- TestIntraPred("Intra8", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
- kSignatures, 8, 8 * 8 * kNumVp9IntraFuncs);
+ TestIntraPred("Intra8", pred_funcs, kSignatures, 8);
}
void TestIntraPred16(VpxPredFunc const *pred_funcs) {
- static const int kNumVp9IntraFuncs = 13;
- static const char *const kSignatures[kNumVp9IntraFuncs] = {
- "b40dbb555d5d16a043dc361e6694fe53", "fb08118cee3b6405d64c1fd68be878c6",
- "6c190f341475c837cc38c2e566b64875", "db5c34ccbe2c7f595d9b08b0dc2c698c",
- "a62cbfd153a1f0b9fed13e62b8408a7a", "143df5b4c89335e281103f610f5052e4",
- "d87feb124107cdf2cfb147655aa0bb3c", "7841fae7d4d47b519322e6a03eeed9dc",
- "f6ebed3f71cbcf8d6d0516ce87e11093", "3cc480297dbfeed01a1c2d78dd03d0c5",
- "b9f69fa6532b372c545397dcb78ef311", "a8fe1c70432f09d0c20c67bdb6432c4d",
- "b8a41aa968ec108af447af4217cba91b",
+ static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+ "50971c07ce26977d30298538fffec619", "527a6b9e0dc5b21b98cf276305432bef",
+ "7eff2868f80ebc2c43a4f367281d80f7", "67cd60512b54964ef6aff1bd4816d922",
+ "48371c87dc95c08a33b2048f89cf6468", "b0acf2872ee411d7530af6d2625a7084",
+ "f32aafed4d8d3776ed58bcb6188756d5", "dae208f3dca583529cff49b73f7c4183",
+ "7af66a2f4c8e0b4908e40f047e60c47c", "125e3ab6ab9bc961f183ec366a7afa88",
+ "6b90f25b23983c35386b9fd704427622", "f8d6b11d710edc136a7c62c917435f93",
+ "ed308f18614a362917f411c218aee532"
};
- TestIntraPred("Intra16", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
- kSignatures, 16, 16 * 16 * kNumVp9IntraFuncs);
+ TestIntraPred("Intra16", pred_funcs, kSignatures, 16);
}
void TestIntraPred32(VpxPredFunc const *pred_funcs) {
- static const int kNumVp9IntraFuncs = 13;
- static const char *const kSignatures[kNumVp9IntraFuncs] = {
- "558541656d84f9ae7896db655826febe", "b3587a1f9a01495fa38c8cd3c8e2a1bf",
- "4c6501e64f25aacc55a2a16c7e8f0255", "b3b01379ba08916ef6b1b35f7d9ad51c",
- "0f1eb38b6cbddb3d496199ef9f329071", "911c06efb9ed1c3b4c104b232b55812f",
- "9225beb0ddfa7a1d24eaa1be430a6654", "0a6d584a44f8db9aa7ade2e2fdb9fc9e",
- "b01c9076525216925f3456f034fb6eee", "d267e20ad9e5cd2915d1a47254d3d149",
- "ed012a4a5da71f36c2393023184a0e59", "f162b51ed618d28b936974cff4391da5",
- "9e1370c6d42e08d357d9612c93a71cfc",
+ static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+ "a0a618c900e65ae521ccc8af789729f2", "985aaa7c72b4a6c2fb431d32100cf13a",
+ "10662d09febc3ca13ee4e700120daeb5", "b3b01379ba08916ef6b1b35f7d9ad51c",
+ "9f4261755795af97e34679c333ec7004", "bc2c9da91ad97ef0d1610fb0a9041657",
+ "75c79b1362ad18abfcdb1aa0aacfc21d", "4039bb7da0f6860090d3c57b5c85468f",
+ "b29fff7b61804e68383e3a609b33da58", "e1aa5e49067fd8dba66c2eb8d07b7a89",
+ "4e042822909c1c06d3b10a88281df1eb", "72eb9d9e0e67c93f4c66b70348e9fef7",
+ "a22d102bcb51ca798aac12ca4ae8f2e8"
};
- TestIntraPred("Intra32", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
- kSignatures, 32, 32 * 32 * kNumVp9IntraFuncs);
+ TestIntraPred("Intra32", pred_funcs, kSignatures, 32);
}
} // namespace
@@ -153,7 +169,6 @@ void TestIntraPred32(VpxPredFunc const *pred_funcs) {
}
// -----------------------------------------------------------------------------
-// 4x4
INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
vpx_dc_left_predictor_4x4_c, vpx_dc_top_predictor_4x4_c,
@@ -163,47 +178,6 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
- vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
- vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
- vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
- NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
- vpx_tm_predictor_4x4_sse2)
-#endif // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL,
- vpx_d63_predictor_4x4_ssse3, NULL)
-#endif // HAVE_SSSE3
-
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
- NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
- NULL, NULL, vpx_tm_predictor_4x4_dspr2)
-#endif // HAVE_DSPR2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
- vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
- vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
- vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
- vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
- vpx_tm_predictor_4x4_neon)
-#endif // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
- vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
- vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
- vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
- vpx_tm_predictor_4x4_msa)
-#endif // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 8x8
-
INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
vpx_dc_left_predictor_8x8_c, vpx_dc_top_predictor_8x8_c,
vpx_dc_128_predictor_8x8_c, vpx_v_predictor_8x8_c,
@@ -212,46 +186,6 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)
-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
- vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
- vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
- vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
- NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
-#endif // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, vpx_d153_predictor_8x8_ssse3,
- vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
-#endif // HAVE_SSSE3
-
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
- NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
- NULL, NULL, vpx_tm_predictor_8x8_c)
-#endif // HAVE_DSPR2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
- vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
- vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
- vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL,
- NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon)
-
-#endif // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
- vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
- vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
- vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
- vpx_tm_predictor_8x8_msa)
-#endif // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 16x16
-
INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
vpx_dc_left_predictor_16x16_c, vpx_dc_top_predictor_16x16_c,
vpx_dc_128_predictor_16x16_c, vpx_v_predictor_16x16_c,
@@ -260,87 +194,287 @@ INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c,
vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c)
+INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
+ vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c,
+ vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c,
+ vpx_h_predictor_32x32_c, vpx_d45_predictor_32x32_c,
+ vpx_d135_predictor_32x32_c, vpx_d117_predictor_32x32_c,
+ vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
+ vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
+
#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
+ vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
+ vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
+ vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
+ NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
+ vpx_tm_predictor_4x4_sse2)
+
+INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
+ vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
+ vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
+ vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
+ NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
+
INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
vpx_dc_left_predictor_16x16_sse2,
vpx_dc_top_predictor_16x16_sse2,
vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
vpx_tm_predictor_16x16_sse2)
+
+INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
+ vpx_dc_left_predictor_32x32_sse2,
+ vpx_dc_top_predictor_32x32_sse2,
+ vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
+ vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+ vpx_tm_predictor_32x32_sse2)
#endif // HAVE_SSE2
#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL,
+ vpx_d63_predictor_4x4_ssse3, NULL)
+INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, vpx_d153_predictor_8x8_ssse3,
+ vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL,
vpx_d45_predictor_16x16_ssse3, NULL, NULL,
vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3,
vpx_d63_predictor_16x16_ssse3, NULL)
+INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
+ vpx_d45_predictor_32x32_ssse3, NULL, NULL,
+ vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
+ vpx_d63_predictor_32x32_ssse3, NULL)
#endif // HAVE_SSSE3
#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
+ NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
+ NULL, NULL, vpx_tm_predictor_4x4_dspr2)
+INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
+ NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
+ NULL, NULL, vpx_tm_predictor_8x8_c)
INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL,
NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL,
NULL, NULL, NULL, NULL)
#endif // HAVE_DSPR2
#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
+ vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
+ vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
+ vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
+ vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+ vpx_tm_predictor_4x4_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
+ vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
+ vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
+ vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL,
+ NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon)
INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
vpx_dc_left_predictor_16x16_neon,
vpx_dc_top_predictor_16x16_neon,
vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, NULL,
NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
+ vpx_dc_left_predictor_32x32_neon,
+ vpx_dc_top_predictor_32x32_neon,
+ vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
+ vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+ vpx_tm_predictor_32x32_neon)
#endif // HAVE_NEON
#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
+ vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
+ vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
+ vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
+ vpx_tm_predictor_4x4_msa)
+INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
+ vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
+ vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
+ vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
+ vpx_tm_predictor_8x8_msa)
INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa,
vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa,
vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa,
vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL,
vpx_tm_predictor_16x16_msa)
+INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
+ vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
+ vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
+ vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
+ vpx_tm_predictor_32x32_msa)
#endif // HAVE_MSA
// -----------------------------------------------------------------------------
-// 32x32
-INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
- vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c,
- vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c,
- vpx_h_predictor_32x32_c, vpx_d45_predictor_32x32_c,
- vpx_d135_predictor_32x32_c, vpx_d117_predictor_32x32_c,
- vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
- vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
+#if CONFIG_VP9_HIGHBITDEPTH
+namespace {
-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
- vpx_dc_left_predictor_32x32_sse2,
- vpx_dc_top_predictor_32x32_sse2,
- vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
- vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
- vpx_tm_predictor_32x32_sse2)
-#endif // HAVE_SSE2
+typedef void (*VpxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd);
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
- vpx_d45_predictor_32x32_ssse3, NULL, NULL,
- vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
- vpx_d63_predictor_32x32_ssse3, NULL)
-#endif // HAVE_SSSE3
+typedef IntraPredTestMem<uint16_t> Vp9HighbdIntraPredTestMem;
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
- vpx_dc_left_predictor_32x32_neon,
- vpx_dc_top_predictor_32x32_neon,
- vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
- vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
- vpx_tm_predictor_32x32_neon)
-#endif // HAVE_NEON
+void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs,
+ const char *const signatures[], int block_size) {
+ const int kNumTests = static_cast<int>(
+ 2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs));
+ Vp9HighbdIntraPredTestMem intra_pred_test_mem;
+ const uint16_t *const above = intra_pred_test_mem.above_mem + 16;
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
- vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
- vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
- vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
- vpx_tm_predictor_32x32_msa)
-#endif // HAVE_MSA
+ intra_pred_test_mem.Init(block_size, 12);
+
+ for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
+ if (pred_funcs[k] == NULL) continue;
+ memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+ sizeof(intra_pred_test_mem.src));
+ vpx_usec_timer timer;
+ vpx_usec_timer_start(&timer);
+ for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+ pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
+ intra_pred_test_mem.left, 12);
+ }
+ libvpx_test::ClearSystemState();
+ vpx_usec_timer_mark(&timer);
+ const int elapsed_time =
+ static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+ CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
+ sizeof(intra_pred_test_mem.src), elapsed_time, k);
+ }
+}
+
+void TestHighbdIntraPred4(VpxHighbdPredFunc const *pred_funcs) {
+ static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+ "11f74af6c5737df472f3275cbde062fa", "51bea056b6447c93f6eb8f6b7e8f6f71",
+ "27e97f946766331795886f4de04c5594", "53ab15974b049111fb596c5168ec7e3f",
+ "f0b640bb176fbe4584cf3d32a9b0320a", "729783ca909e03afd4b47111c80d967b",
+ "fbf1c30793d9f32812e4d9f905d53530", "293fc903254a33754133314c6cdba81f",
+ "f8074d704233e73dfd35b458c6092374", "aa6363d08544a1ec4da33d7a0be5640d",
+ "462abcfdfa3d087bb33c9a88f2aec491", "863eab65d22550dd44a2397277c1ec71",
+ "23d61df1574d0fa308f9731811047c4b"
+ };
+ TestHighbdIntraPred("Intra4", pred_funcs, kSignatures, 4);
+}
+
+void TestHighbdIntraPred8(VpxHighbdPredFunc const *pred_funcs) {
+ static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+ "03da8829fe94663047fd108c5fcaa71d", "ecdb37b8120a2d3a4c706b016bd1bfd7",
+ "1d4543ed8d2b9368cb96898095fe8a75", "f791c9a67b913cbd82d9da8ecede30e2",
+ "065c70646f4dbaff913282f55a45a441", "51f87123616662ef7c35691497dfd0ba",
+ "2a5b0131ef4716f098ee65e6df01e3dd", "9ffe186a6bc7db95275f1bbddd6f7aba",
+ "a3258a2eae2e2bd55cb8f71351b22998", "8d909f0a2066e39b3216092c6289ece4",
+ "d183abb30b9f24c886a0517e991b22c7", "702a42fe4c7d665dc561b2aeeb60f311",
+ "7b5dbbbe7ae3a4ac2948731600bde5d6"
+ };
+ TestHighbdIntraPred("Intra8", pred_funcs, kSignatures, 8);
+}
+
+void TestHighbdIntraPred16(VpxHighbdPredFunc const *pred_funcs) {
+ static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+ "e33cb3f56a878e2fddb1b2fc51cdd275", "c7bff6f04b6052c8ab335d726dbbd52d",
+ "d0b0b47b654a9bcc5c6008110a44589b", "78f5da7b10b2b9ab39f114a33b6254e9",
+ "c78e31d23831abb40d6271a318fdd6f3", "90d1347f4ec9198a0320daecb6ff90b8",
+ "d2c623746cbb64a0c9e29c10f2c57041", "cf28bd387b81ad3e5f1a1c779a4b70a0",
+ "24c304330431ddeaf630f6ce94af2eac", "91a329798036bf64e8e00a87b131b8b1",
+ "d39111f22885307f920796a42084c872", "e2e702f7250ece98dd8f3f2854c31eeb",
+ "e2fb05b01eb8b88549e85641d8ce5b59"
+ };
+ TestHighbdIntraPred("Intra16", pred_funcs, kSignatures, 16);
+}
+
+void TestHighbdIntraPred32(VpxHighbdPredFunc const *pred_funcs) {
+ static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+ "a3e8056ba7e36628cce4917cd956fedd", "cc7d3024fe8748b512407edee045377e",
+ "2aab0a0f330a1d3e19b8ecb8f06387a3", "a547bc3fb7b06910bf3973122a426661",
+ "26f712514da95042f93d6e8dc8e431dc", "bb08c6e16177081daa3d936538dbc2e3",
+ "8f031af3e2650e89620d8d2c3a843d8b", "42867c8553285e94ee8e4df7abafbda8",
+ "6496bdee96100667833f546e1be3d640", "2ebfa25bf981377e682e580208504300",
+ "3e8ae52fd1f607f348aa4cb436c71ab7", "3d4efe797ca82193613696753ea624c4",
+ "cb8aab6d372278f3131e8d99efde02d9"
+ };
+ TestHighbdIntraPred("Intra32", pred_funcs, kSignatures, 32);
+}
+
+} // namespace
+
+// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
+// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4.
+#define HIGHBD_INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128, \
+ v, h, d45, d135, d117, d153, d207, d63, tm) \
+ TEST(arch, test_func) { \
+ static const VpxHighbdPredFunc vpx_intra_pred[] = { \
+ dc, dc_left, dc_top, dc_128, v, h, d45, d135, d117, d153, d207, d63, tm \
+ }; \
+ test_func(vpx_intra_pred); \
+ }
+
+// -----------------------------------------------------------------------------
+
+HIGHBD_INTRA_PRED_TEST(
+ C, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_c,
+ vpx_highbd_dc_left_predictor_4x4_c, vpx_highbd_dc_top_predictor_4x4_c,
+ vpx_highbd_dc_128_predictor_4x4_c, vpx_highbd_v_predictor_4x4_c,
+ vpx_highbd_h_predictor_4x4_c, vpx_highbd_d45_predictor_4x4_c,
+ vpx_highbd_d135_predictor_4x4_c, vpx_highbd_d117_predictor_4x4_c,
+ vpx_highbd_d153_predictor_4x4_c, vpx_highbd_d207_predictor_4x4_c,
+ vpx_highbd_d63_predictor_4x4_c, vpx_highbd_tm_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_c,
+ vpx_highbd_dc_left_predictor_8x8_c, vpx_highbd_dc_top_predictor_8x8_c,
+ vpx_highbd_dc_128_predictor_8x8_c, vpx_highbd_v_predictor_8x8_c,
+ vpx_highbd_h_predictor_8x8_c, vpx_highbd_d45_predictor_8x8_c,
+ vpx_highbd_d135_predictor_8x8_c, vpx_highbd_d117_predictor_8x8_c,
+ vpx_highbd_d153_predictor_8x8_c, vpx_highbd_d207_predictor_8x8_c,
+ vpx_highbd_d63_predictor_8x8_c, vpx_highbd_tm_predictor_8x8_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_c,
+ vpx_highbd_dc_left_predictor_16x16_c, vpx_highbd_dc_top_predictor_16x16_c,
+ vpx_highbd_dc_128_predictor_16x16_c, vpx_highbd_v_predictor_16x16_c,
+ vpx_highbd_h_predictor_16x16_c, vpx_highbd_d45_predictor_16x16_c,
+ vpx_highbd_d135_predictor_16x16_c, vpx_highbd_d117_predictor_16x16_c,
+ vpx_highbd_d153_predictor_16x16_c, vpx_highbd_d207_predictor_16x16_c,
+ vpx_highbd_d63_predictor_16x16_c, vpx_highbd_tm_predictor_16x16_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_c,
+ vpx_highbd_dc_left_predictor_32x32_c, vpx_highbd_dc_top_predictor_32x32_c,
+ vpx_highbd_dc_128_predictor_32x32_c, vpx_highbd_v_predictor_32x32_c,
+ vpx_highbd_h_predictor_32x32_c, vpx_highbd_d45_predictor_32x32_c,
+ vpx_highbd_d135_predictor_32x32_c, vpx_highbd_d117_predictor_32x32_c,
+ vpx_highbd_d153_predictor_32x32_c, vpx_highbd_d207_predictor_32x32_c,
+ vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4,
+ vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL,
+ vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
+ vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL,
+ vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
+ vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL,
+ vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ vpx_highbd_tm_predictor_16x16_sse2)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
+ vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL,
+ vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ vpx_highbd_tm_predictor_32x32_sse2)
+#endif // HAVE_SSE2
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "test/test_libvpx.cc"
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 804dc8956..f89f852b6 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -65,6 +65,7 @@ class VPxEncoderThreadTest
encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+ encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0);
} else {
encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
encoder->Control(VP9E_SET_AQ_MODE, 3);
@@ -127,7 +128,7 @@ VP9_INSTANTIATE_TEST_CASE(VPxEncoderThreadTest,
::testing::Values(::libvpx_test::kTwoPassGood,
::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime),
- ::testing::Range(1, 9), // cpu_used
+ ::testing::Range(0, 9), // cpu_used
::testing::Range(0, 3), // tile_columns
::testing::Range(2, 5)); // threads
} // namespace
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 1c4a3392e..7cd32b990 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -28,25 +28,25 @@ using libvpx_test::ACMRandom;
const int count_test_block = 100000;
-typedef void (*IntraPred)(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left, int bps);
+typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left);
-struct IntraPredFunc {
- IntraPredFunc(IntraPred pred = NULL, IntraPred ref = NULL,
- int block_size_value = 0, int bit_depth_value = 0)
+struct IntraPredParam {
+ IntraPredParam(IntraPredFunc pred = NULL, IntraPredFunc ref = NULL,
+ int block_size_value = 0, int bit_depth_value = 0)
: pred_fn(pred), ref_fn(ref), block_size(block_size_value),
bit_depth(bit_depth_value) {}
- IntraPred pred_fn;
- IntraPred ref_fn;
+ IntraPredFunc pred_fn;
+ IntraPredFunc ref_fn;
int block_size;
int bit_depth;
};
-class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
+template <typename Pixel, typename PredParam>
+class IntraPredTest : public ::testing::TestWithParam<PredParam> {
public:
- void RunTest(uint16_t *left_col, uint16_t *above_data, uint16_t *dst,
- uint16_t *ref_dst) {
+ void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int block_size = params_.block_size;
above_row_ = above_data + 16;
@@ -56,13 +56,16 @@ class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
int error_count = 0;
for (int i = 0; i < count_test_block; ++i) {
// Fill edges with random data, try first with saturated values.
- for (int x = -1; x <= block_size * 2; x++) {
+ for (int x = -1; x < block_size; x++) {
if (i == 0) {
above_row_[x] = mask_;
} else {
above_row_[x] = rnd.Rand16() & mask_;
}
}
+ for (int x = block_size; x < 2 * block_size; x++) {
+ above_row_[x] = above_row_[block_size - 1];
+ }
for (int y = 0; y < block_size; y++) {
if (i == 0) {
left_col_[y] = mask_;
@@ -78,17 +81,12 @@ class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
protected:
virtual void SetUp() {
- params_ = GetParam();
+ params_ = this->GetParam();
stride_ = params_.block_size * 3;
mask_ = (1 << params_.bit_depth) - 1;
}
- void Predict() {
- const int bit_depth = params_.bit_depth;
- params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
- ASM_REGISTER_STATE_CHECK(
- params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
- }
+ void Predict();
void CheckPrediction(int test_case_number, int *error_count) const {
// For each pixel ensure that the calculated value is the same as reference.
@@ -104,18 +102,223 @@ class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
}
}
- uint16_t *above_row_;
- uint16_t *left_col_;
- uint16_t *dst_;
- uint16_t *ref_dst_;
+ Pixel *above_row_;
+ Pixel *left_col_;
+ Pixel *dst_;
+ Pixel *ref_dst_;
ptrdiff_t stride_;
int mask_;
- IntraPredFunc params_;
+ PredParam params_;
};
+template <>
+void IntraPredTest<uint8_t, IntraPredParam>::Predict() {
+ params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+ ASM_REGISTER_STATE_CHECK(
+ params_.pred_fn(dst_, stride_, above_row_, left_col_));
+}
+
+typedef IntraPredTest<uint8_t, IntraPredParam> VP9IntraPredTest;
+
TEST_P(VP9IntraPredTest, IntraPredTests) {
// max block size is 32
+ DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
+ DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+ RunTest(left_col, above_data, dst, ref_dst);
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, VP9IntraPredTest,
+ ::testing::Values(
+ IntraPredParam(&vpx_d45_predictor_4x4_sse2, &vpx_d45_predictor_4x4_c, 4,
+ 8),
+ IntraPredParam(&vpx_d45_predictor_8x8_sse2, &vpx_d45_predictor_8x8_c, 8,
+ 8),
+ IntraPredParam(&vpx_d207_predictor_4x4_sse2, &vpx_d207_predictor_4x4_c,
+ 4, 8),
+ IntraPredParam(&vpx_dc_128_predictor_4x4_sse2,
+ &vpx_dc_128_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_dc_128_predictor_8x8_sse2,
+ &vpx_dc_128_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_dc_128_predictor_16x16_sse2,
+ &vpx_dc_128_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_dc_128_predictor_32x32_sse2,
+ &vpx_dc_128_predictor_32x32_c, 32, 8),
+ IntraPredParam(&vpx_dc_left_predictor_4x4_sse2,
+ &vpx_dc_left_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_dc_left_predictor_8x8_sse2,
+ &vpx_dc_left_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_dc_left_predictor_16x16_sse2,
+ &vpx_dc_left_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_dc_left_predictor_32x32_sse2,
+ &vpx_dc_left_predictor_32x32_c, 32, 8),
+ IntraPredParam(&vpx_dc_predictor_4x4_sse2, &vpx_dc_predictor_4x4_c, 4,
+ 8),
+ IntraPredParam(&vpx_dc_predictor_8x8_sse2, &vpx_dc_predictor_8x8_c, 8,
+ 8),
+ IntraPredParam(&vpx_dc_predictor_16x16_sse2, &vpx_dc_predictor_16x16_c,
+ 16, 8),
+ IntraPredParam(&vpx_dc_predictor_32x32_sse2, &vpx_dc_predictor_32x32_c,
+ 32, 8),
+ IntraPredParam(&vpx_dc_top_predictor_4x4_sse2,
+ &vpx_dc_top_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_dc_top_predictor_8x8_sse2,
+ &vpx_dc_top_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_dc_top_predictor_16x16_sse2,
+ &vpx_dc_top_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_dc_top_predictor_32x32_sse2,
+ &vpx_dc_top_predictor_32x32_c, 32, 8),
+ IntraPredParam(&vpx_h_predictor_4x4_sse2, &vpx_h_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_h_predictor_8x8_sse2, &vpx_h_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_h_predictor_16x16_sse2, &vpx_h_predictor_16x16_c,
+ 16, 8),
+ IntraPredParam(&vpx_h_predictor_32x32_sse2, &vpx_h_predictor_32x32_c,
+ 32, 8),
+ IntraPredParam(&vpx_tm_predictor_4x4_sse2, &vpx_tm_predictor_4x4_c, 4,
+ 8),
+ IntraPredParam(&vpx_tm_predictor_8x8_sse2, &vpx_tm_predictor_8x8_c, 8,
+ 8),
+ IntraPredParam(&vpx_tm_predictor_16x16_sse2, &vpx_tm_predictor_16x16_c,
+ 16, 8),
+ IntraPredParam(&vpx_tm_predictor_32x32_sse2, &vpx_tm_predictor_32x32_c,
+ 32, 8),
+ IntraPredParam(&vpx_v_predictor_4x4_sse2, &vpx_v_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_v_predictor_8x8_sse2, &vpx_v_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_v_predictor_16x16_sse2, &vpx_v_predictor_16x16_c,
+ 16, 8),
+ IntraPredParam(&vpx_v_predictor_32x32_sse2, &vpx_v_predictor_32x32_c,
+ 32, 8)));
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, VP9IntraPredTest,
+ ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_ssse3,
+ &vpx_d45_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_d45_predictor_32x32_ssse3,
+ &vpx_d45_predictor_32x32_c, 32, 8),
+ IntraPredParam(&vpx_d63_predictor_4x4_ssse3,
+ &vpx_d63_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_d63_predictor_8x8_ssse3,
+ &vpx_d63_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_d63_predictor_16x16_ssse3,
+ &vpx_d63_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_d63_predictor_32x32_ssse3,
+ &vpx_d63_predictor_32x32_c, 32, 8),
+ IntraPredParam(&vpx_d153_predictor_4x4_ssse3,
+ &vpx_d153_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_d153_predictor_8x8_ssse3,
+ &vpx_d153_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_d153_predictor_16x16_ssse3,
+ &vpx_d153_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_d153_predictor_32x32_ssse3,
+ &vpx_d153_predictor_32x32_c, 32, 8),
+ IntraPredParam(&vpx_d207_predictor_8x8_ssse3,
+ &vpx_d207_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_d207_predictor_16x16_ssse3,
+ &vpx_d207_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_d207_predictor_32x32_ssse3,
+ &vpx_d207_predictor_32x32_c, 32, 8)));
+#endif // HAVE_SSSE3
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, VP9IntraPredTest,
+ ::testing::Values(
+ IntraPredParam(&vpx_d45_predictor_4x4_neon, &vpx_d45_predictor_4x4_c, 4,
+ 8),
+ IntraPredParam(&vpx_d45_predictor_8x8_neon, &vpx_d45_predictor_8x8_c, 8,
+ 8),
+ IntraPredParam(&vpx_d45_predictor_16x16_neon,
+ &vpx_d45_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c,
+ 4, 8),
+ IntraPredParam(&vpx_dc_128_predictor_4x4_neon,
+ &vpx_dc_128_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_dc_128_predictor_8x8_neon,
+ &vpx_dc_128_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_dc_128_predictor_16x16_neon,
+ &vpx_dc_128_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_dc_128_predictor_32x32_neon,
+ &vpx_dc_128_predictor_32x32_c, 32, 8),
+ IntraPredParam(&vpx_dc_left_predictor_4x4_neon,
+ &vpx_dc_left_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_dc_left_predictor_8x8_neon,
+ &vpx_dc_left_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_dc_left_predictor_16x16_neon,
+ &vpx_dc_left_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_dc_left_predictor_32x32_neon,
+ &vpx_dc_left_predictor_32x32_c, 32, 8),
+ IntraPredParam(&vpx_dc_predictor_4x4_neon, &vpx_dc_predictor_4x4_c, 4,
+ 8),
+ IntraPredParam(&vpx_dc_predictor_8x8_neon, &vpx_dc_predictor_8x8_c, 8,
+ 8),
+ IntraPredParam(&vpx_dc_predictor_16x16_neon, &vpx_dc_predictor_16x16_c,
+ 16, 8),
+ IntraPredParam(&vpx_dc_predictor_32x32_neon, &vpx_dc_predictor_32x32_c,
+ 32, 8),
+ IntraPredParam(&vpx_dc_top_predictor_4x4_neon,
+ &vpx_dc_top_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_dc_top_predictor_8x8_neon,
+ &vpx_dc_top_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_dc_top_predictor_16x16_neon,
+ &vpx_dc_top_predictor_16x16_c, 16, 8),
+ IntraPredParam(&vpx_dc_top_predictor_32x32_neon,
+ &vpx_dc_top_predictor_32x32_c, 32, 8),
+ IntraPredParam(&vpx_h_predictor_4x4_neon, &vpx_h_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_h_predictor_8x8_neon, &vpx_h_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_h_predictor_16x16_neon, &vpx_h_predictor_16x16_c,
+ 16, 8),
+ IntraPredParam(&vpx_h_predictor_32x32_neon, &vpx_h_predictor_32x32_c,
+ 32, 8),
+ IntraPredParam(&vpx_tm_predictor_4x4_neon, &vpx_tm_predictor_4x4_c, 4,
+ 8),
+ IntraPredParam(&vpx_tm_predictor_8x8_neon, &vpx_tm_predictor_8x8_c, 8,
+ 8),
+ IntraPredParam(&vpx_tm_predictor_16x16_neon, &vpx_tm_predictor_16x16_c,
+ 16, 8),
+ IntraPredParam(&vpx_tm_predictor_32x32_neon, &vpx_tm_predictor_32x32_c,
+ 32, 8),
+ IntraPredParam(&vpx_v_predictor_4x4_neon, &vpx_v_predictor_4x4_c, 4, 8),
+ IntraPredParam(&vpx_v_predictor_8x8_neon, &vpx_v_predictor_8x8_c, 8, 8),
+ IntraPredParam(&vpx_v_predictor_16x16_neon, &vpx_v_predictor_16x16_c,
+ 16, 8),
+ IntraPredParam(&vpx_v_predictor_32x32_neon, &vpx_v_predictor_32x32_c,
+ 32, 8)));
+#endif // HAVE_NEON
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bps);
+struct HighbdIntraPredParam {
+ HighbdIntraPredParam(HighbdIntraPred pred = NULL, HighbdIntraPred ref = NULL,
+ int block_size_value = 0, int bit_depth_value = 0)
+ : pred_fn(pred), ref_fn(ref), block_size(block_size_value),
+ bit_depth(bit_depth_value) {}
+
+ HighbdIntraPred pred_fn;
+ HighbdIntraPred ref_fn;
+ int block_size;
+ int bit_depth;
+};
+
+template <>
+void IntraPredTest<uint16_t, HighbdIntraPredParam>::Predict() {
+ const int bit_depth = params_.bit_depth;
+ params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+ ASM_REGISTER_STATE_CHECK(
+ params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
+}
+
+typedef IntraPredTest<uint16_t, HighbdIntraPredParam> VP9HighbdIntraPredTest;
+
+TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
+ // max block size is 32
DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]);
DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]);
DECLARE_ALIGNED(16, uint16_t, dst[3 * 32 * 32]);
@@ -124,88 +327,90 @@ TEST_P(VP9IntraPredTest, IntraPredTests) {
}
#if HAVE_SSE2
-#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
- SSE2_TO_C_8, VP9IntraPredTest,
- ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2,
- &vpx_highbd_dc_predictor_32x32_c, 32, 8),
- IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2,
- &vpx_highbd_tm_predictor_16x16_c, 16, 8),
- IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2,
- &vpx_highbd_tm_predictor_32x32_c, 32, 8),
- IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2,
- &vpx_highbd_dc_predictor_4x4_c, 4, 8),
- IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2,
- &vpx_highbd_dc_predictor_8x8_c, 8, 8),
- IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2,
- &vpx_highbd_dc_predictor_16x16_c, 16, 8),
- IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2,
- &vpx_highbd_v_predictor_4x4_c, 4, 8),
- IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2,
- &vpx_highbd_v_predictor_8x8_c, 8, 8),
- IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2,
- &vpx_highbd_v_predictor_16x16_c, 16, 8),
- IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2,
- &vpx_highbd_v_predictor_32x32_c, 32, 8),
- IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2,
- &vpx_highbd_tm_predictor_4x4_c, 4, 8),
- IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2,
- &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
+ SSE2_TO_C_8, VP9HighbdIntraPredTest,
+ ::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
+ &vpx_highbd_dc_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
+ &vpx_highbd_dc_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2,
+ &vpx_highbd_dc_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
+ &vpx_highbd_dc_predictor_32x32_c, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
+ &vpx_highbd_tm_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
+ &vpx_highbd_tm_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2,
+ &vpx_highbd_tm_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
+ &vpx_highbd_tm_predictor_32x32_c, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
+ &vpx_highbd_v_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
+ &vpx_highbd_v_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2,
+ &vpx_highbd_v_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2,
+ &vpx_highbd_v_predictor_32x32_c, 32, 8)));
INSTANTIATE_TEST_CASE_P(
- SSE2_TO_C_10, VP9IntraPredTest,
- ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2,
- &vpx_highbd_dc_predictor_32x32_c, 32, 10),
- IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2,
- &vpx_highbd_tm_predictor_16x16_c, 16, 10),
- IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2,
- &vpx_highbd_tm_predictor_32x32_c, 32, 10),
- IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2,
- &vpx_highbd_dc_predictor_4x4_c, 4, 10),
- IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2,
- &vpx_highbd_dc_predictor_8x8_c, 8, 10),
- IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2,
- &vpx_highbd_dc_predictor_16x16_c, 16, 10),
- IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2,
- &vpx_highbd_v_predictor_4x4_c, 4, 10),
- IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2,
- &vpx_highbd_v_predictor_8x8_c, 8, 10),
- IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2,
- &vpx_highbd_v_predictor_16x16_c, 16, 10),
- IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2,
- &vpx_highbd_v_predictor_32x32_c, 32, 10),
- IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2,
- &vpx_highbd_tm_predictor_4x4_c, 4, 10),
- IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2,
- &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
+ SSE2_TO_C_10, VP9HighbdIntraPredTest,
+ ::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
+ &vpx_highbd_dc_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
+ &vpx_highbd_dc_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2,
+ &vpx_highbd_dc_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
+ &vpx_highbd_dc_predictor_32x32_c, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
+ &vpx_highbd_tm_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
+ &vpx_highbd_tm_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2,
+ &vpx_highbd_tm_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
+ &vpx_highbd_tm_predictor_32x32_c, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
+ &vpx_highbd_v_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
+ &vpx_highbd_v_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2,
+ &vpx_highbd_v_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2,
+ &vpx_highbd_v_predictor_32x32_c, 32, 10)));
INSTANTIATE_TEST_CASE_P(
- SSE2_TO_C_12, VP9IntraPredTest,
- ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2,
- &vpx_highbd_dc_predictor_32x32_c, 32, 12),
- IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2,
- &vpx_highbd_tm_predictor_16x16_c, 16, 12),
- IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2,
- &vpx_highbd_tm_predictor_32x32_c, 32, 12),
- IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2,
- &vpx_highbd_dc_predictor_4x4_c, 4, 12),
- IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2,
- &vpx_highbd_dc_predictor_8x8_c, 8, 12),
- IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2,
- &vpx_highbd_dc_predictor_16x16_c, 16, 12),
- IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2,
- &vpx_highbd_v_predictor_4x4_c, 4, 12),
- IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2,
- &vpx_highbd_v_predictor_8x8_c, 8, 12),
- IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2,
- &vpx_highbd_v_predictor_16x16_c, 16, 12),
- IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2,
- &vpx_highbd_v_predictor_32x32_c, 32, 12),
- IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2,
- &vpx_highbd_tm_predictor_4x4_c, 4, 12),
- IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2,
- &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
+ SSE2_TO_C_12, VP9HighbdIntraPredTest,
+ ::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
+ &vpx_highbd_dc_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
+ &vpx_highbd_dc_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2,
+ &vpx_highbd_dc_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
+ &vpx_highbd_dc_predictor_32x32_c, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
+ &vpx_highbd_tm_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
+ &vpx_highbd_tm_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2,
+ &vpx_highbd_tm_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
+ &vpx_highbd_tm_predictor_32x32_c, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
+ &vpx_highbd_v_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
+ &vpx_highbd_v_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2,
+ &vpx_highbd_v_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2,
+ &vpx_highbd_v_predictor_32x32_c, 32, 12)));
+#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
-#endif // HAVE_SSE2
} // namespace
diff --git a/tools.mk b/tools.mk
new file mode 100644
index 000000000..3c660b1df
--- /dev/null
+++ b/tools.mk
@@ -0,0 +1,110 @@
+##
+## Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+# List of tools to build.
+TOOLS-yes += tiny_ssim.c
+tiny_ssim.SRCS += vpx/vpx_integer.h
+tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4
+tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files
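+#
+# A hypothetical second tool (illustration only, names are made up) would be
+# registered the same way:
+#   TOOLS-yes           += foo.c
+#   foo.SRCS            += vpx/vpx_integer.h
+#   foo.GUID             = <unique GUID>
+#   foo.DESCRIPTION      = One-line summary picked up by the doc rules below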
+
+#
+# End of specified files. The rest of the build rules should happen
+# automagically from here.
+#
+
+
+# Expand list of selected tools to build (as specified above)
+TOOLS = $(addprefix tools/,$(call enabled,TOOLS))
+ALL_SRCS = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS))
+
+
+# Expand all tools sources into a variable containing all sources
+# for that tool (not just the main one specified in TOOLS)
+# and add this file to the list (for MSVS workspace generation)
+$(foreach ex,$(TOOLS),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) tools.mk))
+
+
+# Create build/install dependencies for all tools. The common case
+# is handled here. The MSVS case is handled below.
+NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
+DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(TOOLS:.c=$(EXE_SFX)))
+DIST-SRCS-yes += $(ALL_SRCS)
+OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS))
+BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX)))
+
+
+# Instantiate linker template for all tools.
+$(foreach bin,$(BINS-yes),\
+ $(eval $(bin):)\
+ $(eval $(call linker_template,$(bin),\
+ $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
+ -lm\
+ )))
+
+
+# The following pairs define a mapping of locations in the distribution
+# tree to locations in the source/build trees.
+INSTALL_MAPS += src/%.c %.c
+INSTALL_MAPS += src/% $(SRC_PATH_BARE)/%
+INSTALL_MAPS += bin/% %
+INSTALL_MAPS += % %
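+# e.g. the dist entry bin/tools/tiny_ssim$(EXE_SFX) resolves, via the bin/%
+# mapping, to tools/tiny_ssim$(EXE_SFX) in the build tree.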
+
+
+# Build Visual Studio Projects. We use a template here to instantiate
+# explicit rules rather than using an implicit rule because we want to
+# leverage make's VPATH searching rather than specifying the paths on
+# each file in TOOLS. This has the unfortunate side effect that
+# touching the source files triggers a rebuild of the project files
+# even though there is no real dependency there (the dependency is on
+# the makefiles). We may want to revisit this.
+define vcproj_template
+$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
+ $(if $(quiet),@echo " [vcproj] $$@")
+ $(qexec)$$(GEN_VCPROJ)\
+ --exe\
+ --target=$$(TOOLCHAIN)\
+ --name=$$(@:.$(VCPROJ_SFX)=)\
+ --ver=$$(CONFIG_VS_VERSION)\
+ --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
+ --src-path-bare="$(SRC_PATH_BARE)" \
+ $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
+ --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
+ $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^
+endef
+TOOLS_BASENAME := $(notdir $(TOOLS))
+PROJECTS-$(CONFIG_MSVS) += $(TOOLS_BASENAME:.c=.$(VCPROJ_SFX))
+INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\
+ $(addprefix bin/$(p)/,$(TOOLS_BASENAME:.c=.exe)))
+$(foreach proj,$(call enabled,PROJECTS),\
+ $(eval $(call vcproj_template,$(proj))))
+
+#
+# Documentation Rules
+#
+%.dox: %.c
+ @echo " [DOXY] $@"
+ @mkdir -p $(dir $@)
+ @echo "/*!\page tools_$(@F:.dox=) $(@F:.dox=)" > $@
+ @echo " \includelineno $(<F)" >> $@
+ @echo "*/" >> $@
+
+tools.dox: tools.mk
+ @echo " [DOXY] $@"
+ @echo "/*!\page tools Tools" > $@
+ @echo " This SDK includes a number of tools/utilities."\
+ "The following tools are included: ">>$@
+ @$(foreach ex,$(sort $(notdir $(TOOLS:.c=))),\
+ echo " - \subpage tools_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+ @echo "*/" >> $@
+
+CLEAN-OBJS += tools.doxy tools.dox $(TOOLS:.c=.dox)
+DOCS-yes += tools.doxy tools.dox
+tools.doxy: tools.dox $(TOOLS:.c=.dox)
+ @echo "INPUT += $^" > $@
diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c
new file mode 100644
index 000000000..28052e0a8
--- /dev/null
+++ b/tools/tiny_ssim.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vpx/vpx_integer.h"
+
+void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+static const int64_t cc1 = 26634;   // (64^2*(.01*255)^2)
+static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2)
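+// These are the usual SSIM stabilizers C1 = (K1*L)^2 and C2 = (K2*L)^2 with
+// K1 = 0.01, K2 = 0.03 and L = 255, pre-scaled by 64^2 because similarity()
+// below works on 8x8 pixel sums rather than means:
+//   (0.01*255)^2 * 4096 = 26634.24 and (0.03*255)^2 * 4096 = 239708.16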
+
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+ uint32_t sum_sq_r, uint32_t sum_sxr, int count) {
+ int64_t ssim_n, ssim_d;
+ int64_t c1, c2;
+
+ // scale the constants by number of pixels
+ c1 = (cc1 * count * count) >> 12;
+ c2 = (cc2 * count * count) >> 12;
+
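+  // SSIM written directly in terms of the window sums (N = count):
+  //   n = (2*sum_s*sum_r + c1) * (2*N*sum_sxr - 2*sum_s*sum_r + c2)
+  //   d = (sum_s^2 + sum_r^2 + c1) *
+  //       (N*sum_sq_s - sum_s^2 + N*sum_sq_r - sum_r^2 + c2)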
+ ssim_n = (2 * sum_s * sum_r + c1) *
+ ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+
+ ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+ ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+ (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+
+ return ssim_n * 1.0 / ssim_d;
+}
+
+static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// We are using an 8x8 moving window, with the starting location of each 8x8
+// window on the 4x4 pixel grid. Such an arrangement allows the windows to
+// overlap block boundaries to penalize blocking artifacts.
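+// For example, a 64x64 plane is covered by ((64 - 8) / 4 + 1)^2 = 225
+// overlapping windows.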
+double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
+ int stride_img2, int width, int height) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon,
+ int recon_stride, unsigned int cols,
+ unsigned int rows) {
+ unsigned int row, col;
+ uint64_t total_sse = 0;
+ int diff;
+
+ for (row = 0; row < rows; row++) {
+ for (col = 0; col < cols; col++) {
+ diff = orig[col] - recon[col];
+ total_sse += diff * diff;
+ }
+
+ orig += orig_stride;
+ recon += recon_stride;
+ }
+
+ return total_sse;
+}
+
+#define MAX_PSNR 100
+
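+// Note: the |mse| argument is the accumulated SSE over |samples| values, so
+// psnr = 10 * log10(peak^2 * samples / sse). For one 8-bit 640x360 frame
+// (samples = w * h) with sse = 1e6 this gives roughly 41.8 dB.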
+double vp9_mse2psnr(double samples, double peak, double mse) {
+ double psnr;
+
+ if (mse > 0.0)
+ psnr = 10.0 * log10(peak * peak * samples / mse);
+ else
+    psnr = MAX_PSNR;  // Limit to prevent division by zero
+
+ if (psnr > MAX_PSNR) psnr = MAX_PSNR;
+
+ return psnr;
+}
+
+int main(int argc, char *argv[]) {
+ FILE *f[2];
+ uint8_t *buf[2];
+ int w, h, n_frames, tl_skip = 0, tl_skips_remaining = 0;
+ double ssim = 0, psnravg = 0, psnrglb = 0;
+ double ssimy, ssimu, ssimv;
+ uint64_t psnry, psnru, psnrv;
+
+ if (argc < 4) {
+ fprintf(stderr, "Usage: %s file1.yuv file2.yuv WxH [tl_skip={0,1,3}]\n",
+ argv[0]);
+ return 1;
+ }
+ f[0] = strcmp(argv[1], "-") ? fopen(argv[1], "rb") : stdin;
+ f[1] = strcmp(argv[2], "-") ? fopen(argv[2], "rb") : stdin;
+ sscanf(argv[3], "%dx%d", &w, &h);
+ // Number of frames to skip from file1.yuv for every frame used. Normal values
+ // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding
+ // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding.
+ if (argc > 4) {
+ sscanf(argv[4], "%d", &tl_skip);
+ }
+ if (!f[0] || !f[1]) {
+ fprintf(stderr, "Could not open input files: %s\n", strerror(errno));
+ return 1;
+ }
+ if (w <= 0 || h <= 0 || w & 1 || h & 1) {
+ fprintf(stderr, "Invalid size %dx%d\n", w, h);
+ return 1;
+ }
+ buf[0] = malloc(w * h * 3 / 2);
+ buf[1] = malloc(w * h * 3 / 2);
+ n_frames = 0;
+ while (1) {
+ size_t r1, r2;
+ r1 = fread(buf[0], w * h * 3 / 2, 1, f[0]);
+ if (r1) {
+      // Skip frames of file1.yuv that were not used in the temporal layer.
+ if (tl_skips_remaining > 0) {
+ --tl_skips_remaining;
+ continue;
+ }
+ // Use frame, but skip |tl_skip| after it.
+ tl_skips_remaining = tl_skip;
+ }
+ r2 = fread(buf[1], w * h * 3 / 2, 1, f[1]);
+ if (r1 && r2 && r1 != r2) {
+ fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno),
+ (int)r1, (int)r2);
+ return 1;
+ } else if (r1 == 0 || r2 == 0) {
+ break;
+ }
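+// The buffers hold I420 frames: w*h bytes of Y, then the w/2 x h/2 U plane
+// at offset w*h and the V plane at offset w*h*5/4.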
+#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
+ ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \
+ psnr = calc_plane_error(buf0, w, buf1, w, w, h);
+ psnr_and_ssim(ssimy, psnry, buf[0], buf[1], w, h);
+ psnr_and_ssim(ssimu, psnru, buf[0] + w * h, buf[1] + w * h, w / 2, h / 2);
+ psnr_and_ssim(ssimv, psnrv, buf[0] + w * h * 5 / 4, buf[1] + w * h * 5 / 4,
+ w / 2, h / 2);
+ ssim += 0.8 * ssimy + 0.1 * (ssimu + ssimv);
+ psnravg +=
+ vp9_mse2psnr(w * h * 6 / 4, 255.0, (double)psnry + psnru + psnrv);
+ psnrglb += psnry + psnru + psnrv;
+ n_frames++;
+ }
+ free(buf[0]);
+ free(buf[1]);
+ ssim /= n_frames;
+ psnravg /= n_frames;
+ psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb);
+
+ printf("AvgPSNR: %lf\n", psnravg);
+ printf("GlbPSNR: %lf\n", psnrglb);
+ printf("SSIM: %lf\n", 100 * pow(ssim, 8.0));
+ printf("Nframes: %d\n", n_frames);
+
+ if (strcmp(argv[1], "-")) fclose(f[0]);
+ if (strcmp(argv[2], "-")) fclose(f[1]);
+
+ return 0;
+}
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index fde0b7e31..628d1c8d2 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1517,7 +1517,6 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
return 0;
}
- tile_data->xd.error_info = &tile_data->error_info;
tile_data->xd.corrupted = 0;
do {
@@ -1529,6 +1528,8 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
&tile_data->error_info, &tile_data->bit_reader,
pbi->decrypt_cb, pbi->decrypt_state);
vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff);
+    // vp9_init_macroblockd() resets xd.error_info, so reattach it here.
+ tile_data->xd.error_info = &tile_data->error_info;
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 3bd6026d4..2a5800382 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2441,6 +2441,8 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q,
cpi->resize_pending = 1;
return 1;
}
+  // Force a recode if projected_frame_size >= max_frame_bandwidth.
+ if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1;
// TODO(agrange) high_limit could be greater than the scale-down threshold.
if ((rc->projected_frame_size > high_limit && q < maxq) ||
@@ -2799,13 +2801,13 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
dc_quant_devisor = 4.0;
#endif
- fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
+ fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
"%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
"%10"PRId64" %10"PRId64" %10d "
"%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
"%6d %6d %5d %5d %5d "
"%10"PRId64" %10.3lf"
- "%10lf %8u %10"PRId64" %10d %10d %10d\n",
+ "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n",
cpi->common.current_video_frame,
cm->width, cm->height,
cpi->rc.source_alt_ref_pending,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 2f1fe360d..788952d34 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -48,10 +48,8 @@
#define FIRST_PASS_Q 10.0
#define GF_MAX_BOOST 96.0
#define INTRA_MODE_PENALTY 1024
-#define KF_MAX_BOOST 128.0
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
-#define MIN_KF_BOOST 300
#define NEW_MV_MODE_PENALTY 32
#define SVC_FACTOR_PT_LOW 0.45
#define DARK_THRESH 64
@@ -1578,7 +1576,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi,
sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part -
(INTRA_PART * modified_pcnt_intra);
}
- return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+ return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT);
}
// This function gives an estimate of how badly we believe the prediction
@@ -1681,6 +1679,7 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
#define BASELINE_ERR_PER_MB 1000.0
static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
+ double *sr_accumulator,
double this_frame_mv_in_out, double max_boost) {
double frame_boost;
const double lq = vp9_convert_qindex_to_q(
@@ -1694,17 +1693,56 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
// Underlying boost factor is based on inter error ratio.
frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
+
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+ *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
+
+ // Small adjustment for cases where there is a zoom out
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+
+  // Q correction and scaling
frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
- // Increase boost for frames where new data coming into frame (e.g. zoom out).
- // Slightly reduce boost if there is a net balance of motion out of the frame
- // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0.
+ return VPXMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+#define KF_BOOST_FACTOR 12.5
+static double calc_kf_frame_boost(VP9_COMP *cpi,
+ const FIRSTPASS_STATS *this_frame,
+ double *sr_accumulator,
+ double this_frame_mv_in_out,
+ double max_boost) {
+ double frame_boost;
+ const double lq = vp9_convert_qindex_to_q(
+ cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
+ const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00);
+ int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+
+ // Correct for any inactive region in the image
+ num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
+
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+ *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
+
+ // Small adjustment for cases where there is a zoom out
if (this_frame_mv_in_out > 0.0)
frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
- // In the extreme case the boost is halved.
- else
- frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+  // Q correction and scaling
+ frame_boost = frame_boost * KF_BOOST_FACTOR * boost_q_correction;
return VPXMIN(frame_boost, max_boost * boost_q_correction);
}
@@ -1719,6 +1757,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
double this_frame_mv_in_out = 0.0;
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
+ double sr_accumulator = 0.0;
int arf_boost;
int flash_detected = 0;
@@ -1745,9 +1784,10 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
: decay_accumulator;
}
- boost_score +=
- decay_accumulator *
- calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ sr_accumulator = 0.0;
+ boost_score += decay_accumulator *
+ calc_frame_boost(cpi, this_frame, &sr_accumulator,
+ this_frame_mv_in_out, GF_MAX_BOOST);
}
*f_boost = (int)boost_score;
@@ -1759,6 +1799,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
this_frame_mv_in_out = 0.0;
mv_in_out_accumulator = 0.0;
abs_mv_in_out_accumulator = 0.0;
+ sr_accumulator = 0.0;
// Search backward towards last gf position.
for (i = -1; i >= -b_frames; --i) {
@@ -1783,9 +1824,10 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
: decay_accumulator;
}
- boost_score +=
- decay_accumulator *
- calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ sr_accumulator = 0.0;
+ boost_score += decay_accumulator *
+ calc_frame_boost(cpi, this_frame, &sr_accumulator,
+ this_frame_mv_in_out, GF_MAX_BOOST);
}
*b_boost = (int)boost_score;
@@ -2085,7 +2127,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double mv_ratio_accumulator = 0.0;
double decay_accumulator = 1.0;
double zero_motion_accumulator = 1.0;
-
double loop_decay_rate = 1.00;
double last_loop_decay_rate = 1.00;
@@ -2095,6 +2136,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double mv_ratio_accumulator_thresh;
double mv_in_out_thresh;
double abs_mv_in_out_thresh;
+ double sr_accumulator = 0.0;
unsigned int allow_alt_ref = is_altref_enabled(cpi);
int f_boost = 0;
@@ -2221,9 +2263,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
// Calculate a boost number for this frame.
- boost_score +=
- decay_accumulator *
- calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ sr_accumulator = 0.0;
+ boost_score += decay_accumulator *
+ calc_frame_boost(cpi, &next_frame, &sr_accumulator,
+ this_frame_mv_in_out, GF_MAX_BOOST);
// Break out conditions.
if (
@@ -2473,6 +2516,10 @@ static int test_candidate_kf(TWO_PASS *twopass,
}
#define FRAMES_TO_CHECK_DECAY 8
+#define KF_MAX_FRAME_BOOST 96.0
+#define MIN_KF_TOT_BOOST 300
+#define MAX_KF_TOT_BOOST 5400
+#define KF_BOOST_SCAN_MAX_FRAMES 32
static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int i, j;
@@ -2485,14 +2532,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
FIRSTPASS_STATS next_frame;
FIRSTPASS_STATS last_frame;
int kf_bits = 0;
- int loop_decay_counter = 0;
double decay_accumulator = 1.0;
- double av_decay_accumulator = 0.0;
double zero_motion_accumulator = 1.0;
double boost_score = 0.0;
double kf_mod_err = 0.0;
double kf_group_err = 0.0;
double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+ double sr_accumulator = 0.0;
vp9_zero(next_frame);
@@ -2642,34 +2688,36 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Scan through the kf group collating various stats used to determine
// how many bits to spend on it.
- decay_accumulator = 1.0;
boost_score = 0.0;
+
for (i = 0; i < (rc->frames_to_key - 1); ++i) {
if (EOF == input_stats(twopass, &next_frame)) break;
- // Monitor for static sections.
- zero_motion_accumulator = VPXMIN(zero_motion_accumulator,
- get_zero_motion_factor(cpi, &next_frame));
-
- // Not all frames in the group are necessarily used in calculating boost.
- if ((i <= rc->max_gf_interval) ||
- ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
- const double frame_boost =
- calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST);
-
- // How fast is prediction quality decaying.
- if (!detect_flash(twopass, 0)) {
- const double loop_decay_rate =
- get_prediction_decay_rate(cpi, &next_frame);
- decay_accumulator *= loop_decay_rate;
- decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR);
- av_decay_accumulator += decay_accumulator;
- ++loop_decay_counter;
- }
- boost_score += (decay_accumulator * frame_boost);
+ if (i <= KF_BOOST_SCAN_MAX_FRAMES) {
+ double frame_boost;
+ double zm_factor;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = VPXMIN(
+ zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+ // Factor 0.75-1.25 based on how much of frame is static.
+ zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
+
+      // The second (lagging) ref error is not valid immediately after
+      // a key frame, because either the lag has not built up (in the case
+      // of the first key frame) or it points to a reference before the new
+      // key frame.
+ if (i < 2) sr_accumulator = 0.0;
+ frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0,
+ KF_MAX_FRAME_BOOST * zm_factor);
+
+ boost_score += frame_boost;
+ if (frame_boost < 25.00) break;
+ } else {
+ break;
}
}
- av_decay_accumulator /= (double)loop_decay_counter;
reset_fpf_position(twopass, start_position);
@@ -2681,9 +2729,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
start_position, twopass->stats_in_end, rc->frames_to_key);
// Apply various clamps for min and max boost
- rc->kf_boost = (int)(av_decay_accumulator * boost_score);
- rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3));
- rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST);
+ rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
+ rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
+ rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
// Work out how many bits to allocate for the key frame itself.
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 5aa0b8ddb..88b1531d8 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -53,6 +53,10 @@ static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = { 4, 5, 7, 11,
static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = { 16, 16, 16,
16, 16 };
+static const int DEFAULT_SCALE_FACTORS_NUM_2x[VPX_SS_MAX_LAYERS] = { 1, 2, 4 };
+
+static const int DEFAULT_SCALE_FACTORS_DEN_2x[VPX_SS_MAX_LAYERS] = { 4, 4, 4 };
+
typedef enum {
QUANTIZER = 0,
BITRATE,
@@ -156,6 +160,9 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
char *token;
const char *delim = ",";
char *save_ptr;
+ int num_layers = svc_ctx->spatial_layers;
+ if (type == BITRATE)
+ num_layers = svc_ctx->spatial_layers * svc_ctx->temporal_layers;
if (input == NULL || option0 == NULL ||
(option1 == NULL && type == SCALE_FACTOR))
@@ -163,7 +170,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
input_string = strdup(input);
token = strtok_r(input_string, delim, &save_ptr);
- for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+ for (i = 0; i < num_layers; ++i) {
if (token != NULL) {
res = extract_option(type, token, option0 + i, option1 + i);
if (res != VPX_CODEC_OK) break;
@@ -172,11 +179,11 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
break;
}
}
- if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) {
+ if (res == VPX_CODEC_OK && i != num_layers) {
svc_log(svc_ctx, SVC_LOG_ERROR,
"svc: layer params type: %d %d values required, "
"but only %d specified\n",
- type, svc_ctx->spatial_layers, i);
+ type, num_layers, i);
res = VPX_CODEC_INVALID_PARAM;
}
free(input_string);
@@ -287,24 +294,30 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
return VPX_CODEC_OK;
}
-void assign_layer_bitrates(const SvcContext *svc_ctx,
- vpx_codec_enc_cfg_t *const enc_cfg) {
+vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx,
+ vpx_codec_enc_cfg_t *const enc_cfg) {
int i;
const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
int sl, tl, spatial_layer_target;
if (svc_ctx->temporal_layering_mode != 0) {
if (si->bitrates[0] != 0) {
- enc_cfg->rc_target_bitrate = 0;
+ unsigned int total_bitrate = 0;
for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
- enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] = 0;
+ total_bitrate += si->bitrates[sl * svc_ctx->temporal_layers +
+ svc_ctx->temporal_layers - 1];
for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] +=
(unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl];
enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + tl] =
si->bitrates[sl * svc_ctx->temporal_layers + tl];
+ if (tl > 0 && (si->bitrates[sl * svc_ctx->temporal_layers + tl] <=
+ si->bitrates[sl * svc_ctx->temporal_layers + tl - 1]))
+ return VPX_CODEC_INVALID_PARAM;
}
}
+ if (total_bitrate != enc_cfg->rc_target_bitrate)
+ return VPX_CODEC_INVALID_PARAM;
} else {
float total = 0;
float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
@@ -341,11 +354,14 @@ void assign_layer_bitrates(const SvcContext *svc_ctx,
}
} else {
if (si->bitrates[0] != 0) {
- enc_cfg->rc_target_bitrate = 0;
+ unsigned int total_bitrate = 0;
for (i = 0; i < svc_ctx->spatial_layers; ++i) {
enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i];
- enc_cfg->rc_target_bitrate += si->bitrates[i];
+ enc_cfg->layer_target_bitrate[i] = (unsigned int)si->bitrates[i];
+ total_bitrate += si->bitrates[i];
}
+ if (total_bitrate != enc_cfg->rc_target_bitrate)
+ return VPX_CODEC_INVALID_PARAM;
} else {
float total = 0;
float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
@@ -368,6 +384,7 @@ void assign_layer_bitrates(const SvcContext *svc_ctx,
}
}
}
+ return VPX_CODEC_OK;
}
vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
@@ -412,12 +429,24 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl];
si->svc_params.speed_per_layer[sl] = svc_ctx->speed;
}
-
+ if (enc_cfg->rc_end_usage == VPX_CBR && enc_cfg->g_pass == VPX_RC_ONE_PASS &&
+ svc_ctx->spatial_layers <= 3) {
+ for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+ int sl2 = (svc_ctx->spatial_layers == 2) ? sl + 1 : sl;
+ si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2];
+ si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2];
+ }
+ }
for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
i = sl * svc_ctx->temporal_layers + tl;
si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
si->svc_params.min_quantizers[i] = 0;
+ if (enc_cfg->rc_end_usage == VPX_CBR &&
+ enc_cfg->g_pass == VPX_RC_ONE_PASS) {
+ si->svc_params.max_quantizers[i] = 56;
+ si->svc_params.min_quantizers[i] = 2;
+ }
}
}
@@ -442,7 +471,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
(int)VPX_MAX_LAYERS);
return VPX_CODEC_INVALID_PARAM;
}
- assign_layer_bitrates(svc_ctx, enc_cfg);
+ res = assign_layer_bitrates(svc_ctx, enc_cfg);
+ if (res != VPX_CODEC_OK) {
+ svc_log(svc_ctx, SVC_LOG_ERROR,
+ "layer bitrates incorrect: \n"
+ "1) spatial layer bitrates should sum up to target \n"
+ "2) temporal layer bitrates should be increasing within \n"
+ "a spatial layer \n");
+ return VPX_CODEC_INVALID_PARAM;
+ }
#if CONFIG_SPATIAL_SVC
for (i = 0; i < svc_ctx->spatial_layers; ++i)
diff --git a/vpx/svc_context.h b/vpx/svc_context.h
index c8bde5832..462785075 100644
--- a/vpx/svc_context.h
+++ b/vpx/svc_context.h
@@ -54,7 +54,7 @@ typedef struct SvcInternal {
// values extracted from option, quantizers
vpx_svc_extra_cfg_t svc_params;
int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
- int bitrates[VPX_SS_MAX_LAYERS];
+ int bitrates[VPX_MAX_LAYERS];
// accumulated statistics
double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V
diff --git a/vpx_dsp/arm/idct32x32_1_add_neon.c b/vpx_dsp/arm/idct32x32_1_add_neon.c
index 8aad4c579..6be4b0122 100644
--- a/vpx_dsp/arm/idct32x32_1_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -103,7 +103,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
dest_stride8 = dest_stride * 8;
if (a1 >= 0) { // diff_positive_32_32
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8(a1);
+ q0u8 = vdupq_n_u8((uint8_t)a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
@@ -119,7 +119,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
} else { // diff_negative_32_32
a1 = -a1;
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8(a1);
+ q0u8 = vdupq_n_u8((uint8_t)a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c
new file mode 100644
index 000000000..ebec9df54
--- /dev/null
+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Only for the first pass of the _34_ variant. Since it only uses values from
+// the top-left 8x8 block, it can safely assume all the remaining values are 0
+// and skip an awful lot of calculations. In fact, only the first 6 columns
+// make the cut. None of the elements in the 7th or 8th columns are used, so
+// any computation involving in6/in7 is skipped as well.
+// The C version processes a single row of 32 per call. Here the top-left 8x8
+// is transposed to allow the use of SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7
+// 0 0 2 5 10 17 25
+// 1 1 4 8 15 22 30
+// 2 3 7 12 18 28
+// 3 6 11 16 23 31
+// 4 9 14 19 29
+// 5 13 20 26
+// 6 21 27 33
+// 7 24 32
+static void idct32_6_neon(const int16_t *input, int16_t *output) {
+ int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
+ int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
+ s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
+ s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30,
+ s1_31;
+ int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10,
+ s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20,
+ s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30,
+ s2_31;
+ int16x8_t s3_24, s3_25, s3_26, s3_27;
+
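+  // The *_shift_and_narrow_s16 helpers (idct_neon.h) multiply by a cospi
+  // constant and apply dct_const_round_shift (a rounding right shift by
+  // DCT_CONST_BITS, i.e. 14), keeping a 16-bit result.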
+ load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5,
+ &in6, &in7);
+
+ // stage 1
+ // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
+ s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+ // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
+ s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+ s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+ s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+ s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+ s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+ // stage 2
+ s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+ s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+ // stage 3
+ s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+ s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+ s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
+ cospi_28_64);
+ s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
+ cospi_4_64);
+
+ s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
+ cospi_12_64);
+ s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
+ cospi_20_64);
+
+ s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
+ -cospi_20_64);
+ s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
+ cospi_12_64);
+
+ // stage 4
+ s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+
+ s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
+ cospi_24_64);
+ s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
+ cospi_8_64);
+
+ s2_20 = vsubq_s16(s1_23, s1_20);
+ s2_21 = vsubq_s16(s1_22, s1_21);
+ s2_22 = vaddq_s16(s1_21, s1_22);
+ s2_23 = vaddq_s16(s1_20, s1_23);
+ s2_24 = vaddq_s16(s1_24, s1_27);
+ s2_25 = vaddq_s16(s1_25, s1_26);
+ s2_26 = vsubq_s16(s1_25, s1_26);
+ s2_27 = vsubq_s16(s1_24, s1_27);
+
+ // stage 5
+ s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
+ s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);
+
+ s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_17, -cospi_8_64, s1_30,
+ cospi_24_64);
+ s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_17, cospi_24_64, s1_30,
+ cospi_8_64);
+
+ s1_19 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_8_64, s1_31,
+ cospi_24_64);
+ s1_28 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_24_64, s1_31,
+ cospi_8_64);
+
+ s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
+ -cospi_8_64);
+ s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
+ cospi_24_64);
+
+ s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
+ -cospi_8_64);
+ s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
+ cospi_24_64);
+
+ // stage 6
+ s2_0 = vaddq_s16(s1_0, s1_7);
+ s2_1 = vaddq_s16(s1_0, s1_6);
+ s2_2 = vaddq_s16(s1_0, s1_5);
+ s2_3 = vaddq_s16(s1_0, s1_4);
+ s2_4 = vsubq_s16(s1_0, s1_4);
+ s2_5 = vsubq_s16(s1_0, s1_5);
+ s2_6 = vsubq_s16(s1_0, s1_6);
+ s2_7 = vsubq_s16(s1_0, s1_7);
+
+ s2_10 = sub_multiply_shift_and_narrow_s16(s2_14, s2_9, cospi_16_64);
+ s2_13 = add_multiply_shift_and_narrow_s16(s2_9, s2_14, cospi_16_64);
+
+ s2_11 = sub_multiply_shift_and_narrow_s16(s2_15, s2_8, cospi_16_64);
+ s2_12 = add_multiply_shift_and_narrow_s16(s2_8, s2_15, cospi_16_64);
+
+ s2_16 = vaddq_s16(s1_16, s2_23);
+ s2_17 = vaddq_s16(s1_17, s2_22);
+ s2_18 = vaddq_s16(s1_18, s1_21);
+ s2_19 = vaddq_s16(s1_19, s1_20);
+ s2_20 = vsubq_s16(s1_19, s1_20);
+ s2_21 = vsubq_s16(s1_18, s1_21);
+ s2_22 = vsubq_s16(s1_17, s2_22);
+ s2_23 = vsubq_s16(s1_16, s2_23);
+
+ s3_24 = vsubq_s16(s1_31, s2_24);
+ s3_25 = vsubq_s16(s1_30, s2_25);
+ s3_26 = vsubq_s16(s1_29, s1_26);
+ s3_27 = vsubq_s16(s1_28, s1_27);
+ s2_28 = vaddq_s16(s1_27, s1_28);
+ s2_29 = vaddq_s16(s1_26, s1_29);
+ s2_30 = vaddq_s16(s2_25, s1_30);
+ s2_31 = vaddq_s16(s2_24, s1_31);
+
+ // stage 7
+ s1_0 = vaddq_s16(s2_0, s2_15);
+ s1_1 = vaddq_s16(s2_1, s2_14);
+ s1_2 = vaddq_s16(s2_2, s2_13);
+ s1_3 = vaddq_s16(s2_3, s2_12);
+ s1_4 = vaddq_s16(s2_4, s2_11);
+ s1_5 = vaddq_s16(s2_5, s2_10);
+ s1_6 = vaddq_s16(s2_6, s2_9);
+ s1_7 = vaddq_s16(s2_7, s2_8);
+ s1_8 = vsubq_s16(s2_7, s2_8);
+ s1_9 = vsubq_s16(s2_6, s2_9);
+ s1_10 = vsubq_s16(s2_5, s2_10);
+ s1_11 = vsubq_s16(s2_4, s2_11);
+ s1_12 = vsubq_s16(s2_3, s2_12);
+ s1_13 = vsubq_s16(s2_2, s2_13);
+ s1_14 = vsubq_s16(s2_1, s2_14);
+ s1_15 = vsubq_s16(s2_0, s2_15);
+
+ s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
+ s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);
+
+ s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
+ s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);
+
+ s1_22 = sub_multiply_shift_and_narrow_s16(s3_25, s2_22, cospi_16_64);
+ s1_25 = add_multiply_shift_and_narrow_s16(s2_22, s3_25, cospi_16_64);
+
+ s1_23 = sub_multiply_shift_and_narrow_s16(s3_24, s2_23, cospi_16_64);
+ s1_24 = add_multiply_shift_and_narrow_s16(s2_23, s3_24, cospi_16_64);
+
+ // final stage
+ vst1q_s16(output, vaddq_s16(s1_0, s2_31));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_1, s2_30));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_2, s2_29));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_3, s2_28));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_4, s1_27));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_5, s1_26));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_6, s1_25));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_7, s1_24));
+ output += 8;
+
+ vst1q_s16(output, vaddq_s16(s1_8, s1_23));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_9, s1_22));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_10, s1_21));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_11, s1_20));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_12, s2_19));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_13, s2_18));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_14, s2_17));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_15, s2_16));
+ output += 8;
+
+ vst1q_s16(output, vsubq_s16(s1_15, s2_16));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_14, s2_17));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_13, s2_18));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_12, s2_19));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_11, s1_20));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_10, s1_21));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_9, s1_22));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_8, s1_23));
+ output += 8;
+
+ vst1q_s16(output, vsubq_s16(s1_7, s1_24));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_6, s1_25));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_5, s1_26));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_4, s1_27));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_3, s2_28));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_2, s2_29));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_1, s2_30));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_0, s2_31));
+}
+
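+// Second pass of the 34-coefficient 32x32 idct: transform 8 columns of the
+// 32x8 intermediate buffer per call and add the result into an 8-wide,
+// 32-row block of the destination.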
+static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
+ int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
+ int16x8_t out0, out1, out2, out3, out4, out5, out6, out7;
+ int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
+ s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
+ s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30,
+ s1_31;
+ int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10,
+ s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20,
+ s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30,
+ s2_31;
+ int16x8_t s3_24, s3_25, s3_26, s3_27;
+
+ load_and_transpose_s16_8x8(input, 8, &in0, &in1, &in2, &in3, &in4, &in5, &in6,
+ &in7);
+
+ // stage 1
+ s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+ s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+ // Different for _8_
+ s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
+ s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+
+ s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+ s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+ s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+ s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+ // stage 2
+ s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+ s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+ s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
+ s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+
+ // stage 3
+ s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+ s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+ s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
+ cospi_28_64);
+ s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
+ cospi_4_64);
+
+ // Different for _8_
+ s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_28_64, s1_28,
+ -cospi_4_64);
+ s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_4_64, s1_28,
+ cospi_28_64);
+
+ s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
+ cospi_12_64);
+ s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
+ cospi_20_64);
+
+ s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
+ -cospi_20_64);
+ s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
+ cospi_12_64);
+
+ // stage 4
+ s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+
+ s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
+ cospi_24_64);
+ s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
+ cospi_8_64);
+
+ s2_10 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_24_64, s2_12,
+ -cospi_8_64);
+ s2_13 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_8_64, s2_12,
+ cospi_24_64);
+
+ s2_16 = vaddq_s16(s1_16, s1_19);
+
+ s2_17 = vaddq_s16(s1_17, s1_18);
+ s2_18 = vsubq_s16(s1_17, s1_18);
+
+ s2_19 = vsubq_s16(s1_16, s1_19);
+
+ s2_20 = vsubq_s16(s1_23, s1_20);
+ s2_21 = vsubq_s16(s1_22, s1_21);
+
+ s2_22 = vaddq_s16(s1_21, s1_22);
+ s2_23 = vaddq_s16(s1_20, s1_23);
+
+ s2_24 = vaddq_s16(s1_24, s1_27);
+ s2_25 = vaddq_s16(s1_25, s1_26);
+ s2_26 = vsubq_s16(s1_25, s1_26);
+ s2_27 = vsubq_s16(s1_24, s1_27);
+
+ s2_28 = vsubq_s16(s1_31, s1_28);
+ s2_29 = vsubq_s16(s1_30, s1_29);
+ s2_30 = vaddq_s16(s1_29, s1_30);
+ s2_31 = vaddq_s16(s1_28, s1_31);
+
+ // stage 5
+ s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
+ s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);
+
+ s1_8 = vaddq_s16(s2_8, s2_11);
+ s1_9 = vaddq_s16(s2_9, s2_10);
+ s1_10 = vsubq_s16(s2_9, s2_10);
+ s1_11 = vsubq_s16(s2_8, s2_11);
+ s1_12 = vsubq_s16(s2_15, s2_12);
+ s1_13 = vsubq_s16(s2_14, s2_13);
+ s1_14 = vaddq_s16(s2_13, s2_14);
+ s1_15 = vaddq_s16(s2_12, s2_15);
+
+ s1_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_8_64, s2_29,
+ cospi_24_64);
+ s1_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, cospi_24_64, s2_29,
+ cospi_8_64);
+
+ s1_19 = multiply_accumulate_shift_and_narrow_s16(s2_19, -cospi_8_64, s2_28,
+ cospi_24_64);
+ s1_28 = multiply_accumulate_shift_and_narrow_s16(s2_19, cospi_24_64, s2_28,
+ cospi_8_64);
+
+ s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
+ -cospi_8_64);
+ s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
+ cospi_24_64);
+
+ s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
+ -cospi_8_64);
+ s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
+ cospi_24_64);
+
+ // stage 6
+ s2_0 = vaddq_s16(s1_0, s1_7);
+ s2_1 = vaddq_s16(s1_0, s1_6);
+ s2_2 = vaddq_s16(s1_0, s1_5);
+ s2_3 = vaddq_s16(s1_0, s1_4);
+ s2_4 = vsubq_s16(s1_0, s1_4);
+ s2_5 = vsubq_s16(s1_0, s1_5);
+ s2_6 = vsubq_s16(s1_0, s1_6);
+ s2_7 = vsubq_s16(s1_0, s1_7);
+
+ s2_10 = sub_multiply_shift_and_narrow_s16(s1_13, s1_10, cospi_16_64);
+ s2_13 = add_multiply_shift_and_narrow_s16(s1_10, s1_13, cospi_16_64);
+
+ s2_11 = sub_multiply_shift_and_narrow_s16(s1_12, s1_11, cospi_16_64);
+ s2_12 = add_multiply_shift_and_narrow_s16(s1_11, s1_12, cospi_16_64);
+
+ s1_16 = vaddq_s16(s2_16, s2_23);
+ s1_17 = vaddq_s16(s2_17, s2_22);
+ s2_18 = vaddq_s16(s1_18, s1_21);
+ s2_19 = vaddq_s16(s1_19, s1_20);
+ s2_20 = vsubq_s16(s1_19, s1_20);
+ s2_21 = vsubq_s16(s1_18, s1_21);
+ s1_22 = vsubq_s16(s2_17, s2_22);
+ s1_23 = vsubq_s16(s2_16, s2_23);
+
+ s3_24 = vsubq_s16(s2_31, s2_24);
+ s3_25 = vsubq_s16(s2_30, s2_25);
+ s3_26 = vsubq_s16(s1_29, s1_26);
+ s3_27 = vsubq_s16(s1_28, s1_27);
+ s2_28 = vaddq_s16(s1_27, s1_28);
+ s2_29 = vaddq_s16(s1_26, s1_29);
+ s2_30 = vaddq_s16(s2_25, s2_30);
+ s2_31 = vaddq_s16(s2_24, s2_31);
+
+ // stage 7
+ s1_0 = vaddq_s16(s2_0, s1_15);
+ s1_1 = vaddq_s16(s2_1, s1_14);
+ s1_2 = vaddq_s16(s2_2, s2_13);
+ s1_3 = vaddq_s16(s2_3, s2_12);
+ s1_4 = vaddq_s16(s2_4, s2_11);
+ s1_5 = vaddq_s16(s2_5, s2_10);
+ s1_6 = vaddq_s16(s2_6, s1_9);
+ s1_7 = vaddq_s16(s2_7, s1_8);
+ s1_8 = vsubq_s16(s2_7, s1_8);
+ s1_9 = vsubq_s16(s2_6, s1_9);
+ s1_10 = vsubq_s16(s2_5, s2_10);
+ s1_11 = vsubq_s16(s2_4, s2_11);
+ s1_12 = vsubq_s16(s2_3, s2_12);
+ s1_13 = vsubq_s16(s2_2, s2_13);
+ s1_14 = vsubq_s16(s2_1, s1_14);
+ s1_15 = vsubq_s16(s2_0, s1_15);
+
+ s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
+ s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);
+
+ s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
+ s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);
+
+ s2_22 = sub_multiply_shift_and_narrow_s16(s3_25, s1_22, cospi_16_64);
+ s1_25 = add_multiply_shift_and_narrow_s16(s1_22, s3_25, cospi_16_64);
+
+ s2_23 = sub_multiply_shift_and_narrow_s16(s3_24, s1_23, cospi_16_64);
+ s1_24 = add_multiply_shift_and_narrow_s16(s1_23, s3_24, cospi_16_64);
+
+ // final stage
+ out0 = vaddq_s16(s1_0, s2_31);
+ out1 = vaddq_s16(s1_1, s2_30);
+ out2 = vaddq_s16(s1_2, s2_29);
+ out3 = vaddq_s16(s1_3, s2_28);
+ out4 = vaddq_s16(s1_4, s1_27);
+ out5 = vaddq_s16(s1_5, s1_26);
+ out6 = vaddq_s16(s1_6, s1_25);
+ out7 = vaddq_s16(s1_7, s1_24);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
+ stride);
+
+ out0 = vaddq_s16(s1_8, s2_23);
+ out1 = vaddq_s16(s1_9, s2_22);
+ out2 = vaddq_s16(s1_10, s1_21);
+ out3 = vaddq_s16(s1_11, s1_20);
+ out4 = vaddq_s16(s1_12, s2_19);
+ out5 = vaddq_s16(s1_13, s2_18);
+ out6 = vaddq_s16(s1_14, s1_17);
+ out7 = vaddq_s16(s1_15, s1_16);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (8 * stride), stride);
+
+ out0 = vsubq_s16(s1_15, s1_16);
+ out1 = vsubq_s16(s1_14, s1_17);
+ out2 = vsubq_s16(s1_13, s2_18);
+ out3 = vsubq_s16(s1_12, s2_19);
+ out4 = vsubq_s16(s1_11, s1_20);
+ out5 = vsubq_s16(s1_10, s1_21);
+ out6 = vsubq_s16(s1_9, s2_22);
+ out7 = vsubq_s16(s1_8, s2_23);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (16 * stride), stride);
+
+ out0 = vsubq_s16(s1_7, s1_24);
+ out1 = vsubq_s16(s1_6, s1_25);
+ out2 = vsubq_s16(s1_5, s1_26);
+ out3 = vsubq_s16(s1_4, s1_27);
+ out4 = vsubq_s16(s1_3, s2_28);
+ out5 = vsubq_s16(s1_2, s2_29);
+ out6 = vsubq_s16(s1_1, s2_30);
+ out7 = vsubq_s16(s1_0, s2_31);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (24 * stride), stride);
+}
+
+void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ int16_t temp[32 * 8];
+ int16_t *t = temp;
+
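+  // idct32_6_neon() computes the first (row) pass into the 32x8 temp buffer;
+  // idct32_8_neon() then runs the column pass, 8 columns per iteration,
+  // adding the final result into dest.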
+ idct32_6_neon(input, t);
+
+ for (i = 0; i < 32; i += 8) {
+ idct32_8_neon(t, dest, stride);
+ t += (8 * 8);
+ dest += 8;
+ }
+}
diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm
index a4ccba993..bd4e86ded 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -15,6 +15,8 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
+ INCLUDE vpx_dsp/arm/idct_neon.asm.S
+
AREA Block, CODE, READONLY ; name this block of code
;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
@@ -33,7 +35,7 @@
; So, two passes of a transpose followed by a column transform.
; load the inputs into q8-q9, d16-d19
- vld1.s16 {q8,q9}, [r0]!
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
; generate scalar constants
; cospi_8_64 = 15137
diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c
index 24b91fe48..8f669c907 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_add_neon.c
@@ -11,6 +11,7 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/txfm_common.h"
void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
@@ -28,8 +29,8 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
d26u32 = d27u32 = vdup_n_u32(0);
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
+ q8s16 = load_tran_low_to_s16(input);
+ q9s16 = load_tran_low_to_s16(input + 8);
d16s16 = vget_low_s16(q8s16);
d17s16 = vget_high_s16(q8s16);
diff --git a/vpx_dsp/arm/idct8x8_add_neon.asm b/vpx_dsp/arm/idct8x8_add_neon.asm
index 21e75951e..a5c9c927d 100644
--- a/vpx_dsp/arm/idct8x8_add_neon.asm
+++ b/vpx_dsp/arm/idct8x8_add_neon.asm
@@ -16,6 +16,8 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
+ INCLUDE vpx_dsp/arm/idct_neon.asm.S
+
; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
; loaded in q8-q15. The output will be stored back into q8-q15 registers.
; This macro will touch q0-q7 registers and use them as buffer during
@@ -207,10 +209,10 @@
|vpx_idct8x8_64_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
+ LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0
+ LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0
+ LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0
; transpose the input data
TRANSPOSE8X8
@@ -312,10 +314,10 @@
|vpx_idct8x8_12_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
+ LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0
+ LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0
+ LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0
; transpose the input data
TRANSPOSE8X8
diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c
index d73feebec..159a6ec98 100644
--- a/vpx_dsp/arm/idct8x8_add_neon.c
+++ b/vpx_dsp/arm/idct8x8_add_neon.c
@@ -12,6 +12,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
@@ -173,14 +174,14 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
uint16x8_t q8u16, q9u16, q10u16, q11u16;
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
- q10s16 = vld1q_s16(input + 16);
- q11s16 = vld1q_s16(input + 24);
- q12s16 = vld1q_s16(input + 32);
- q13s16 = vld1q_s16(input + 40);
- q14s16 = vld1q_s16(input + 48);
- q15s16 = vld1q_s16(input + 56);
+ q8s16 = load_tran_low_to_s16(input);
+ q9s16 = load_tran_low_to_s16(input + 8);
+ q10s16 = load_tran_low_to_s16(input + 16);
+ q11s16 = load_tran_low_to_s16(input + 24);
+ q12s16 = load_tran_low_to_s16(input + 32);
+ q13s16 = load_tran_low_to_s16(input + 40);
+ q14s16 = load_tran_low_to_s16(input + 48);
+ q15s16 = load_tran_low_to_s16(input + 56);
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
@@ -279,14 +280,14 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
uint16x8_t q8u16, q9u16, q10u16, q11u16;
int32x4_t q9s32, q10s32, q11s32, q12s32;
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
- q10s16 = vld1q_s16(input + 16);
- q11s16 = vld1q_s16(input + 24);
- q12s16 = vld1q_s16(input + 32);
- q13s16 = vld1q_s16(input + 40);
- q14s16 = vld1q_s16(input + 48);
- q15s16 = vld1q_s16(input + 56);
+ q8s16 = load_tran_low_to_s16(input);
+ q9s16 = load_tran_low_to_s16(input + 8);
+ q10s16 = load_tran_low_to_s16(input + 16);
+ q11s16 = load_tran_low_to_s16(input + 24);
+ q12s16 = load_tran_low_to_s16(input + 32);
+ q13s16 = load_tran_low_to_s16(input + 40);
+ q14s16 = load_tran_low_to_s16(input + 48);
+ q15s16 = load_tran_low_to_s16(input + 56);
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
diff --git a/vpx_dsp/arm/idct_neon.asm b/vpx_dsp/arm/idct_neon.asm
new file mode 100644
index 000000000..a223c0b63
--- /dev/null
+++ b/vpx_dsp/arm/idct_neon.asm
@@ -0,0 +1,29 @@
+;
+; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ INCLUDE ./vpx_config.asm
+
+    ; Helper macro used to load tran_low_t into int16, narrowing if
+    ; necessary.
+ ; $dst0..3 are d registers with the pairs assumed to be contiguous in
+ ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth.
+ MACRO
+ LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src
+ IF CONFIG_VP9_HIGHBITDEPTH
+ vld1.s32 {q0,q1}, [$src]!
+ vld1.s32 {q2,q3}, [$src]!
+ vmovn.i32 $dst0, q0
+ vmovn.i32 $dst1, q1
+ vmovn.i32 $dst2, q2
+ vmovn.i32 $dst3, q3
+ ELSE
+ vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]!
+ ENDIF
+ MEND
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
new file mode 100644
index 000000000..5c2a53c03
--- /dev/null
+++ b/vpx_dsp/arm/idct_neon.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_IDCT_NEON_H_
+#define VPX_DSP_ARM_IDCT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+//------------------------------------------------------------------------------
+
+// Helper function used to load tran_low_t into int16, narrowing if necessary.
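+// (tran_low_t is int32_t in CONFIG_VP9_HIGHBITDEPTH builds and int16_t
+// otherwise, hence the narrowing path.)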
+static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vld1q_s32(buf);
+ const int32x4_t v1 = vld1q_s32(buf + 4);
+ const int16x4_t s0 = vmovn_s32(v0);
+ const int16x4_t s1 = vmovn_s32(v1);
+ return vcombine_s16(s0, s1);
+#else
+ return vld1q_s16(buf);
+#endif
+}
+
+// Multiply a by a_const. Saturate, shift and narrow by 14.
+static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
+ const int16_t a_const) {
+  // Shift by 14 + rounding will be within 16 bits for well-formed streams.
+  // See WRAPLOW and dct_const_round_shift for details.
+  // This instruction doubles the result and returns the high half, essentially
+  // resulting in a right shift by 15. By doubling the constant first, that
+  // becomes a right shift by 14.
+  // The largest possible value used here is
+  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
+  // within the range of int16_t (+32767 / -32768) even when negated.
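+  //
+  // Concretely, vqrdmulhq_n_s16(a, c) computes (2 * a * c + (1 << 15)) >> 16;
+  // with c = a_const * 2 that is (a * a_const + (1 << 13)) >> 14.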
+ return vqrdmulhq_n_s16(a, a_const * 2);
+}
+
+// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
+ const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+  // In both add_ and its pair, sub_, the input for well-formed streams will be
+  // well within 16 bits (the input to the idct is the difference between two
+  // frames and will be within -255 to 255, or 9 bits).
+  // However, for inputs over about 25,000 (valid for int16_t, but not for idct
+  // input) this function cannot use vaddq_s16.
+  // In order to match existing behavior and intentionally out-of-range tests,
+  // expand the addition up to 32 bits to prevent truncation.
+ int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+ int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+ temp_low = vmulq_n_s32(temp_low, ab_const);
+ temp_high = vmulq_n_s32(temp_high, ab_const);
+ return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
+ const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+ int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+ int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+ temp_low = vmulq_n_s32(temp_low, ab_const);
+ temp_high = vmulq_n_s32(temp_high, ab_const);
+ return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
+// 14.
+static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
+ const int16x8_t a, const int16_t a_const, const int16x8_t b,
+ const int16_t b_const) {
+ int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const);
+ int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
+ temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
+ temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
+ return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride,
+ int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ *a0 = vld1q_s16(a);
+ a += a_stride;
+ *a1 = vld1q_s16(a);
+ a += a_stride;
+ *a2 = vld1q_s16(a);
+ a += a_stride;
+ *a3 = vld1q_s16(a);
+ a += a_stride;
+ *a4 = vld1q_s16(a);
+ a += a_stride;
+ *a5 = vld1q_s16(a);
+ a += a_stride;
+ *a6 = vld1q_s16(a);
+ a += a_stride;
+ *a7 = vld1q_s16(a);
+
+ transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+// Shift the output down by 6 and add it to the destination buffer.
+static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
+ const int16x8_t a2, const int16x8_t a3,
+ const int16x8_t a4, const int16x8_t a5,
+ const int16x8_t a6, const int16x8_t a7,
+ uint8_t *b, const int b_stride) {
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+ int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+ b0 = vld1_u8(b);
+ b += b_stride;
+ b1 = vld1_u8(b);
+ b += b_stride;
+ b2 = vld1_u8(b);
+ b += b_stride;
+ b3 = vld1_u8(b);
+ b += b_stride;
+ b4 = vld1_u8(b);
+ b += b_stride;
+ b5 = vld1_u8(b);
+ b += b_stride;
+ b6 = vld1_u8(b);
+ b += b_stride;
+ b7 = vld1_u8(b);
+ b -= (7 * b_stride);
+
+  // c = b + (a >> 6) (rounded), computed with vrsraq_n_s16.
+ c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6);
+ c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6);
+ c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6);
+ c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6);
+ c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6);
+ c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6);
+ c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6);
+ c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6);
+
+ b0 = vqmovun_s16(c0);
+ b1 = vqmovun_s16(c1);
+ b2 = vqmovun_s16(c2);
+ b3 = vqmovun_s16(c3);
+ b4 = vqmovun_s16(c4);
+ b5 = vqmovun_s16(c5);
+ b6 = vqmovun_s16(c6);
+ b7 = vqmovun_s16(c7);
+
+ vst1_u8(b, b0);
+ b += b_stride;
+ vst1_u8(b, b1);
+ b += b_stride;
+ vst1_u8(b, b2);
+ b += b_stride;
+ vst1_u8(b, b3);
+ b += b_stride;
+ vst1_u8(b, b4);
+ b += b_stride;
+ vst1_u8(b, b5);
+ b += b_stride;
+ vst1_u8(b, b6);
+ b += b_stride;
+ vst1_u8(b, b7);
+}
+#endif // VPX_DSP_ARM_IDCT_NEON_H_
diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c
index 38e79ed69..e150a5302 100644
--- a/vpx_dsp/arm/intrapred_neon.c
+++ b/vpx_dsp/arm/intrapred_neon.c
@@ -17,306 +17,254 @@
//------------------------------------------------------------------------------
// DC 4x4
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
- const uint8_t *left, int do_above, int do_left) {
- uint16x4_t sum_top;
- uint16x4_t sum_left;
- uint16x4_t dc0;
-
- if (do_above) {
- const uint8x8_t A = vld1_u8(above); // top row
- const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
- sum_top = vpadd_u16(p0, p0);
- }
-
- if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
- const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
- sum_left = vpadd_u16(p0, p0);
- }
-
- if (do_above && do_left) {
- const uint16x4_t sum = vadd_u16(sum_left, sum_top);
- dc0 = vrshr_n_u16(sum, 3);
- } else if (do_above) {
- dc0 = vrshr_n_u16(sum_top, 2);
- } else if (do_left) {
- dc0 = vrshr_n_u16(sum_left, 2);
- } else {
- dc0 = vdup_n_u16(0x80);
- }
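+// Sum the four reference pixels: after vpaddl_u8 and vpadd_u16, lane 0 holds
+// ref[0] + ref[1] + ref[2] + ref[3].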
+static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) {
+ const uint8x8_t ref_u8 = vld1_u8(ref);
+ const uint16x4_t p0 = vpaddl_u8(ref_u8);
+ return vpadd_u16(p0, p0);
+}
- {
- const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0);
- int i;
- for (i = 0; i < 4; ++i) {
- vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
- }
+static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ const uint8x8_t dc_dup = vdup_lane_u8(dc, 0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0);
}
}
void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- dc_4x4(dst, stride, above, left, 1, 1);
+ const uint8x8_t a = vld1_u8(above);
+ const uint8x8_t l = vld1_u8(left);
+ const uint16x8_t al = vaddl_u8(a, l);
+ uint16x4_t sum;
+ uint8x8_t dc;
+ sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al));
+ sum = vpadd_u16(sum, sum);
+ dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
+ dc_store_4x4(dst, stride, dc);
}
void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_4(left);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2));
(void)above;
- dc_4x4(dst, stride, NULL, left, 0, 1);
+ dc_store_4x4(dst, stride, dc);
}
void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_4(above);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2));
(void)left;
- dc_4x4(dst, stride, above, NULL, 1, 0);
+ dc_store_4x4(dst, stride, dc);
}
void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_4x4(dst, stride, NULL, NULL, 0, 0);
+ dc_store_4x4(dst, stride, dc);
}
//------------------------------------------------------------------------------
// DC 8x8
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
- const uint8_t *left, int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x8_t A = vld1_u8(above); // top row
- const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- sum_top = vcombine_u16(p2, p2);
- }
-
- if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
- const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- sum_left = vcombine_u16(p2, p2);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 4);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 3);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 3);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
+static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) {
+ const uint8x8_t ref_u8 = vld1_u8(ref);
+ uint16x4_t sum = vpaddl_u8(ref_u8);
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
- {
- const uint8x8_t dc = vdup_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 8; ++i) {
- vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
- }
+static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ const uint8x8_t dc_dup = vdup_lane_u8(dc, 0);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ vst1_u8(dst, dc_dup);
}
}
void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- dc_8x8(dst, stride, above, left, 1, 1);
+ const uint8x8_t above_u8 = vld1_u8(above);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8);
+ const uint16x8_t p0 = vpaddlq_u8(above_and_left);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ uint8x8_t dc;
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
+ dc_store_8x8(dst, stride, dc);
}
void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_8(left);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
(void)above;
- dc_8x8(dst, stride, NULL, left, 0, 1);
+ dc_store_8x8(dst, stride, dc);
}
void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_8(above);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
(void)left;
- dc_8x8(dst, stride, above, NULL, 1, 0);
+ dc_store_8x8(dst, stride, dc);
}
void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_8x8(dst, stride, NULL, NULL, 0, 0);
+ dc_store_8x8(dst, stride, dc);
}
//------------------------------------------------------------------------------
// DC 16x16
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x16_t A = vld1q_u8(above); // top row
- const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- const uint16x4_t p3 = vpadd_u16(p2, p2);
- sum_top = vcombine_u16(p3, p3);
- }
-
- if (do_left) {
- const uint8x16_t L = vld1q_u8(left); // left row
- const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- const uint16x4_t p3 = vpadd_u16(p2, p2);
- sum_left = vcombine_u16(p3, p3);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 5);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 4);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 4);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
+static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) {
+ const uint8x16_t ref_u8 = vld1q_u8(ref);
+ const uint16x8_t p0 = vpaddlq_u8(ref_u8);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
- {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 16; ++i) {
- vst1q_u8(dst + i * stride, dc);
- }
+static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0);
+ int i;
+ for (i = 0; i < 16; ++i, dst += stride) {
+ vst1q_u8(dst, dc_dup);
}
}
void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- dc_16x16(dst, stride, above, left, 1, 1);
+ const uint8x16_t ref0 = vld1q_u8(above);
+ const uint8x16_t ref1 = vld1q_u8(left);
+ const uint16x8_t p0 = vpaddlq_u8(ref0);
+ const uint16x8_t p1 = vpaddlq_u8(ref1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ uint8x8_t dc;
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
+ dc_store_16x16(dst, stride, dc);
}
void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_16(left);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
(void)above;
- dc_16x16(dst, stride, NULL, left, 0, 1);
+ dc_store_16x16(dst, stride, dc);
}
void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_16(above);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
(void)left;
- dc_16x16(dst, stride, above, NULL, 1, 0);
+ dc_store_16x16(dst, stride, dc);
}
void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_16x16(dst, stride, NULL, NULL, 0, 0);
+ dc_store_16x16(dst, stride, dc);
}
//------------------------------------------------------------------------------
// DC 32x32
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x16_t A0 = vld1q_u8(above); // top row
- const uint8x16_t A1 = vld1q_u8(above + 16);
- const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
- const uint16x8_t p1 = vpaddlq_u8(A1);
- const uint16x8_t p2 = vaddq_u16(p0, p1);
- const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
- const uint16x4_t p4 = vpadd_u16(p3, p3);
- const uint16x4_t p5 = vpadd_u16(p4, p4);
- sum_top = vcombine_u16(p5, p5);
- }
-
- if (do_left) {
- const uint8x16_t L0 = vld1q_u8(left); // left row
- const uint8x16_t L1 = vld1q_u8(left + 16);
- const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
- const uint16x8_t p1 = vpaddlq_u8(L1);
- const uint16x8_t p2 = vaddq_u16(p0, p1);
- const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
- const uint16x4_t p4 = vpadd_u16(p3, p3);
- const uint16x4_t p5 = vpadd_u16(p4, p4);
- sum_left = vcombine_u16(p5, p5);
- }
+static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) {
+ const uint8x16x2_t r = vld2q_u8(ref);
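+  // vld2q_u8 deinterleaves the 32 bytes; the order is irrelevant when summing.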
+ const uint16x8_t p0 = vpaddlq_u8(r.val[0]);
+ const uint16x8_t p1 = vpaddlq_u8(r.val[1]);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 6);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 5);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 5);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
+static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ uint8x16x2_t dc_dup;
+ int i;
+ dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0);
- {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 32; ++i) {
- vst1q_u8(dst + i * stride, dc);
- vst1q_u8(dst + i * stride + 16, dc);
- }
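+  // Both halves of dc_dup hold the same value, so the interleaving store
+  // below simply writes 32 identical bytes per row.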
+ for (i = 0; i < 32; ++i, dst += stride) {
+ vst2q_u8(dst, dc_dup);
}
}
void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- dc_32x32(dst, stride, above, left, 1, 1);
+ const uint8x16x2_t a = vld2q_u8(above);
+ const uint8x16x2_t l = vld2q_u8(left);
+ const uint16x8_t pa0 = vpaddlq_u8(a.val[0]);
+ const uint16x8_t pl0 = vpaddlq_u8(l.val[0]);
+ const uint16x8_t pa1 = vpaddlq_u8(a.val[1]);
+ const uint16x8_t pl1 = vpaddlq_u8(l.val[1]);
+ const uint16x8_t pa = vaddq_u16(pa0, pa1);
+ const uint16x8_t pl = vaddq_u16(pl0, pl1);
+ const uint16x8_t pal = vaddq_u16(pa, pl);
+ uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal));
+ uint8x8_t dc;
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6));
+ dc_store_32x32(dst, stride, dc);
}
void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_32(left);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
(void)above;
- dc_32x32(dst, stride, NULL, left, 0, 1);
+ dc_store_32x32(dst, stride, dc);
}
void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_32(above);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
(void)left;
- dc_32x32(dst, stride, above, NULL, 1, 0);
+ dc_store_32x32(dst, stride, dc);
}
void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_32x32(dst, stride, NULL, NULL, 0, 0);
+ dc_store_32x32(dst, stride, dc);
}
// -----------------------------------------------------------------------------
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row
- const uint64x1_t A1 = vshr_n_u64(A0, 8);
- const uint64x1_t A2 = vshr_n_u64(A0, 16);
- const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+ const uint8x8_t ABCDEFGH = vld1_u8(above);
+ const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8);
+ const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16);
const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
@@ -331,485 +279,506 @@ void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
- dst[3 * stride + 3] = above[7];
+ vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7);
+}
+
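+// Shift the row left by one element, filling with the above-right pixel, then
+// store it and advance dst.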
+static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x8_t above_right, uint8x8_t *row) {
+ *row = vext_u8(*row, above_right, 1);
+ vst1_u8(*dst, *row);
+ *dst += stride;
}
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
- static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
- const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
- const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
- const uint8x8_t A0 = vld1_u8(above); // top row
- const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
- const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
+ const uint8x8_t A0 = vld1_u8(above);
+ const uint8x8_t above_right = vdup_lane_u8(A0, 7);
+ const uint8x8_t A1 = vext_u8(A0, above_right, 1);
+ const uint8x8_t A2 = vext_u8(A0, above_right, 2);
const uint8x8_t avg1 = vhadd_u8(A0, A2);
uint8x8_t row = vrhadd_u8(avg1, A1);
- int i;
(void)left;
- for (i = 0; i < 7; ++i) {
- vst1_u8(dst + i * stride, row);
- row = vtbl1_u8(row, sh_12345677);
- }
- vst1_u8(dst + i * stride, row);
+
+ vst1_u8(dst, row);
+ dst += stride;
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ vst1_u8(dst, above_right);
+}
+
+static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x16_t above_right, uint8x16_t *row) {
+ *row = vextq_u8(*row, above_right, 1);
+ vst1q_u8(*dst, *row);
+ *dst += stride;
}
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x16_t A0 = vld1q_u8(above); // top row
- const uint8x16_t above_right = vld1q_dup_u8(above + 15);
+ const uint8x16_t A0 = vld1q_u8(above);
+ const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7);
const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
const uint8x16_t avg1 = vhaddq_u8(A0, A2);
uint8x16_t row = vrhaddq_u8(avg1, A1);
- int i;
(void)left;
- for (i = 0; i < 15; ++i) {
- vst1q_u8(dst + i * stride, row);
- row = vextq_u8(row, above_right, 1);
- }
- vst1q_u8(dst + i * stride, row);
+
+ vst1q_u8(dst, row);
+ dst += stride;
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ vst1q_u8(dst, above_right);
}
// -----------------------------------------------------------------------------
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
- const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
- const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+ const uint8x8_t XABCD = vld1_u8(above - 1);
const uint32x2_t zero = vdup_n_u32(0);
const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
- const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
- const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
- const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
- const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
- const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
- const uint8_t D = vget_lane_u8(XABCD_u8, 4);
- const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
- const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
- const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
- const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
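+  // Reverse the left column and splice it onto the top-left pixel and top
+  // row, so a single (a + 2b + c + 2) >> 2 filter pass yields all four
+  // diagonal rows.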
+ const uint8x8_t LKJI = vrev64_u8(vreinterpret_u8_u32(IJKL));
+ const uint8x8_t LKJIXABC = vext_u8(LKJI, XABCD, 4);
+ const uint8x8_t KJIXABCD = vext_u8(LKJI, XABCD, 5);
+ const uint8x8_t JIXABCD0 =
+ vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(KJIXABCD), 8));
+ const uint8x8_t avg1 = vhadd_u8(JIXABCD0, LKJIXABC);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABCD);
const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
- vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
- vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
- vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
- vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+ vst1_lane_u32((uint32_t *)dst, r0, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, r1, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, r2, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, r3, 0);
}
+// -----------------------------------------------------------------------------
+
#if !HAVE_NEON_ASM
void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint32_t d = *(const uint32_t *)above;
int i;
- uint32x2_t d0u32 = vdup_n_u32(0);
(void)left;
- d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
- for (i = 0; i < 4; i++, dst += stride)
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += stride) {
+ *(uint32_t *)dst = d;
+ }
}
void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d = vld1_u8(above);
int i;
- uint8x8_t d0u8 = vdup_n_u8(0);
(void)left;
- d0u8 = vld1_u8(above);
- for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
+ for (i = 0; i < 8; i++, dst += stride) {
+ vst1_u8(dst, d);
+ }
}
void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vld1q_u8(above);
int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
(void)left;
- q0u8 = vld1q_u8(above);
- for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
+ for (i = 0; i < 16; i++, dst += stride) {
+ vst1q_u8(dst, d);
+ }
}
void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
(void)left;
- q0u8 = vld1q_u8(above);
- q1u8 = vld1q_u8(above + 16);
- for (i = 0; i < 32; i++, dst += stride) {
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
+ for (i = 0; i < 32; i++) {
+ // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8.
+ // clang-3.8 unrolled the loop fully with no filler so the cause is likely
+ // the latency of the instruction.
+ vst1q_u8(dst, d0);
+ dst += 16;
+ vst1q_u8(dst, d1);
+ dst += stride - 16;
}
}
+// -----------------------------------------------------------------------------
+
void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d1u32 = vdup_n_u32(0);
+ const uint32x2_t zero = vdup_n_u32(0);
+ const uint8x8_t left_u8 =
+ vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0));
+ uint8x8_t d;
(void)above;
- d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
-
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ d = vdup_lane_u8(left_u8, 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ d = vdup_lane_u8(left_u8, 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ d = vdup_lane_u8(left_u8, 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ d = vdup_lane_u8(left_u8, 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
}
void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint64x1_t d1u64 = vdup_n_u64(0);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ uint8x8_t d;
(void)above;
- d1u64 = vld1_u64((const uint64_t *)left);
-
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 0);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 1);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 2);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 3);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 4);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 5);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 6);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 7);
+ vst1_u8(dst, d);
}
void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
+ const uint8x16_t left_u8q = vld1q_u8(left);
+ uint8x8_t left_u8d = vget_low_u8(left_u8q);
+ uint8x16_t d;
+ int i;
(void)above;
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
+ for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) {
+ d = vdupq_lane_u8(left_u8d, 0);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 1);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 2);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 3);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 4);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 5);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 6);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 7);
+ vst1q_u8(dst, d);
dst += stride;
}
}
void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
+ uint8x16_t d;
+ int i;
(void)above;
- for (k = 0; k < 2; k++, left += 16) {
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- }
+ for (i = 0; i < 2; i++, left += 16) {
+ const uint8x16_t left_u8 = vld1q_u8(left);
+ const uint8x8_t left_low = vget_low_u8(left_u8);
+ const uint8x8_t left_high = vget_high_u8(left_u8);
+ d = vdupq_lane_u8(left_low, 0);
+ vst1q_u8(dst, d); // Note clang-3.8 produced poor code w/vst2q_u8
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 1);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 2);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 3);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 4);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 5);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 6);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 7);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+
+ d = vdupq_lane_u8(left_high, 0);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 1);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 2);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 3);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 4);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 5);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 6);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 7);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
}
}
+// -----------------------------------------------------------------------------
+
+static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(v));
+}
+
void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int i;
- uint16x8_t q1u16, q3u16;
- int16x8_t q1s16;
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d2u32 = vdup_n_u32(0);
-
- d0u8 = vld1_dup_u8(above - 1);
- d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
- q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
- for (i = 0; i < 4; i++, dst += stride) {
- q1u16 = vdupq_n_u16((uint16_t)left[i]);
- q1s16 =
- vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
- d0u8 = vqmovun_s16(q1s16);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- }
+ const uint8x8_t top_left = vld1_dup_u8(above - 1);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const uint8x8_t above_u8 = vld1_u8(above);
+ const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8));
+ int16x8_t sub, sum;
+ uint32x2_t d;
+
+ sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
+ // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8.
+ sub = vreinterpretq_s16_s64(
+ vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0));
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
+ sum = vaddq_s16(sum, sub);
+ d = vreinterpret_u32_u8(vqmovun_s16(sum));
+ vst1_lane_u32((uint32_t *)dst, d, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, d, 1);
+ dst += stride;
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
+ sum = vaddq_s16(sum, sub);
+ d = vreinterpret_u32_u8(vqmovun_s16(sum));
+ vst1_lane_u32((uint32_t *)dst, d, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, d, 1);
+}
+
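+// Add a broadcast left pixel (left_dup) to the (above - top_left) difference
+// and store one 8-pixel row.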
+static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub) {
+ const int16x8_t sum = vaddq_s16(left_dup, sub);
+ const uint8x8_t d = vqmovun_s16(sum);
+ vst1_u8(*dst, d);
+ *dst += stride;
}
void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j;
- uint16x8_t q0u16, q3u16, q10u16;
- int16x8_t q0s16;
- uint16x4_t d20u16;
- uint8x8_t d0u8, d2u8, d30u8;
-
- d0u8 = vld1_dup_u8(above - 1);
- d30u8 = vld1_u8(left);
- d2u8 = vld1_u8(above);
- q10u16 = vmovl_u8(d30u8);
- q3u16 = vsubl_u8(d2u8, d0u8);
- d20u16 = vget_low_u16(q10u16);
- for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- q0u16 = vdupq_lane_u16(d20u16, 1);
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- q0u16 = vdupq_lane_u16(d20u16, 3);
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
+ const uint8x8_t top_left = vld1_dup_u8(above - 1);
+ const uint8x8_t above_u8 = vld1_u8(above);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+ const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ int i;
+
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+ int16x8_t left_dup;
+
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_8_kernel(&dst, stride, left_dup, sub);
}
}

+static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1) {
+ const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ const uint8x8_t d0 = vqmovun_s16(sum0);
+ const uint8x8_t d1 = vqmovun_s16(sum1);
+ vst1_u8(*dst, d0);
+ *dst += 8;
+ vst1_u8(*dst, d1);
+ *dst += stride - 8;
+}
+
void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
- uint8x16_t q0u8, q1u8;
- int16x8_t q0s16, q1s16, q8s16, q11s16;
- uint16x4_t d20u16;
- uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
-
- q0u8 = vld1q_dup_u8(above - 1);
- q1u8 = vld1q_u8(above);
- q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- for (k = 0; k < 2; k++, left += 8) {
- d18u8 = vld1_u8(left);
- q10u16 = vmovl_u8(d18u8);
- d20u16 = vget_low_u16(q10u16);
- for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q8u16 = vdupq_lane_u16(d20u16, 1);
- q1s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
- q11s16 =
- vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
- q8s16 =
- vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q8u16 = vdupq_lane_u16(d20u16, 3);
- q1s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
- q11s16 =
- vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
- q8s16 =
- vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += stride;
- }
+ const uint8x16_t top_left = vld1q_dup_u8(above - 1);
+ const uint8x16_t above_u8 = vld1q_u8(above);
+ const int16x8_t sub0 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left)));
+ const int16x8_t sub1 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left)));
+ int16x8_t left_dup;
+ int i;
+
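+  // Each iteration consumes eight left pixels and emits eight 16-wide rows.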
+ for (i = 0; i < 2; i++, left += 8) {
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+ const int16x4_t left_low = vget_low_s16(left_s16q);
+ const int16x4_t left_high = vget_high_s16(left_s16q);
+
+ left_dup = vdupq_lane_s16(left_low, 0);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_low, 1);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_low, 2);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_low, 3);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+
+ left_dup = vdupq_lane_s16(left_high, 0);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_high, 1);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_high, 2);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_high, 3);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
}
}

+static INLINE void tm_32_kernel(uint8_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1, const int16x8_t sub2,
+ const int16x8_t sub3) {
+ const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ const int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+ const int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+ const uint8x8_t d0 = vqmovun_s16(sum0);
+ const uint8x8_t d1 = vqmovun_s16(sum1);
+ const uint8x8_t d2 = vqmovun_s16(sum2);
+ const uint8x8_t d3 = vqmovun_s16(sum3);
+
+ vst1q_u8(*dst, vcombine_u8(d0, d1));
+ *dst += 16;
+ vst1q_u8(*dst, vcombine_u8(d2, d3));
+ *dst += stride - 16;
+}
+
void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
- uint8x16_t q0u8, q1u8, q2u8;
- int16x8_t q12s16, q13s16, q14s16, q15s16;
- uint16x4_t d6u16;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
-
- q0u8 = vld1q_dup_u8(above - 1);
- q1u8 = vld1q_u8(above);
- q2u8 = vld1q_u8(above + 16);
- q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
- q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
- for (k = 0; k < 4; k++, left += 8) {
- d26u8 = vld1_u8(left);
- q3u16 = vmovl_u8(d26u8);
- d6u16 = vget_low_u16(q3u16);
- for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
- q0u16 = vdupq_lane_u16(d6u16, 0);
- q12s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
- q13s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d6u16, 1);
- q12s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
- q13s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d6u16, 2);
- q12s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
- q13s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d6u16, 3);
- q12s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
- q13s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
+ const uint8x16_t top_left = vld1q_dup_u8(above - 1);
+ const uint8x16_t above_low = vld1q_u8(above);
+ const uint8x16_t above_high = vld1q_u8(above + 16);
+ const int16x8_t sub0 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left)));
+ const int16x8_t sub1 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left)));
+ const int16x8_t sub2 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left)));
+ const int16x8_t sub3 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left)));
+ int16x8_t left_dup;
+ int i, j;
+
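+  // Four groups of eight left pixels cover all 32 rows; sub0-sub3 hold
+  // (above[0..31] - top_left) and are reused for every row.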
+ for (j = 0; j < 4; j++, left += 8) {
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
}
}
}
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index dad34e4b4..8c91b141f 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -204,18 +204,15 @@ DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/idct4x4_add_neon.c
-DSP_SRCS-yes += arm/idct8x8_add_neon.c
DSP_SRCS-yes += arm/idct16x16_add_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
@@ -232,14 +229,20 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/idct_neon$(ASM)
DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
else
DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
endif # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
endif # CONFIG_VP9
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index fa6294142..7f31a6a11 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -644,16 +644,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
} else {
add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct4x4_16_add sse2/;
+ specialize qw/vpx_idct4x4_16_add neon sse2/;
add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_1_add neon sse2/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add neon sse2/;
@@ -764,8 +764,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- # Need to add 34 eob idct32x32 neon implementation.
- $vpx_idct32x32_34_add_neon=vpx_idct32x32_1024_add_neon;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
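With a dedicated 34-eob NEON kernel now in the build (arm/idct32x32_34_add_neon.c
above), the Perl alias that pointed vpx_idct32x32_34_add_neon at the full
1024-coefficient routine is dropped, and rtcd binds the new symbol directly.
A hedged sketch of the decoder-side dispatch such a kernel serves (illustrative
only; the eob variable and call site are assumed, not shown in this diff):

  /* With eob <= 34, nonzero coefficients are confined to the upper-left
   * 8x8 of the 32x32 block, so the reduced kernel is sufficient. */
  if (eob == 1)
    vpx_idct32x32_1_add(input, dest, dest_stride);
  else if (eob <= 34)
    vpx_idct32x32_34_add(input, dest, dest_stride);
  else
    vpx_idct32x32_1024_add(input, dest, dest_stride);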
diff --git a/vpxdec.c b/vpxdec.c
index 4bd16bbed..f1b09e657 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -781,7 +781,7 @@ static int main_loop(int argc, const char **argv_) {
warn("Failed to decode frame %d: %s", frame_in,
vpx_codec_error(&decoder));
if (detail) warn("Additional information: %s", detail);
- frames_corrupted++;
+ corrupted = 1;
if (!keep_going) goto fail;
}
@@ -800,7 +800,7 @@ static int main_loop(int argc, const char **argv_) {
// Flush the decoder in frame parallel decode.
if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) {
warn("Failed to flush decoder: %s", vpx_codec_error(&decoder));
- frames_corrupted++;
+ corrupted = 1;
if (!keep_going) goto fail;
}
}
@@ -814,7 +814,7 @@ static int main_loop(int argc, const char **argv_) {
vpx_usec_timer_mark(&timer);
dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
- if (!frame_parallel &&
+ if (!frame_parallel && !corrupted &&
vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
if (!keep_going) goto fail;
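Taken together, these vpxdec.c hunks switch corruption accounting from
incrementing frames_corrupted at each failure site to setting a per-frame
flag, and they skip the VP8D_GET_FRAME_CORRUPTED query once a frame is
already known bad. A hedged reconstruction of the per-frame flow (the
frames_corrupted accumulation and the buf/bytes_in_buffer names are assumed
from surrounding code not shown in these hunks):

  corrupted = 0;
  if (vpx_codec_decode(&decoder, buf, bytes_in_buffer, NULL, 0)) {
    warn("Failed to decode frame %d: %s", frame_in, vpx_codec_error(&decoder));
    corrupted = 1;  /* record the failure once, not frames_corrupted++ */
  }
  if (!frame_parallel && !corrupted &&
      vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
    warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
  }
  frames_corrupted += corrupted;  /* each frame is counted at most once */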