35 files changed, 2530 insertions(+), 1060 deletions(-)
diff --git a/.gitignore b/.gitignore
index 4337a2c32..bf5ffc713 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,8 +39,7 @@
 /examples/vp8cx_set_ref
 /examples/vp9cx_set_ref
 /examples/vp9_lossless_encoder
-/examples/vp9_spatial_scalable_encoder
-/examples/vpx_temporal_scalable_patterns
+/examples/vp9_spatial_svc_encoder
 /examples/vpx_temporal_svc_encoder
 /ivfdec
 /ivfdec.dox
@@ -51,6 +50,9 @@
 /samples.dox
 /test_intra_pred_speed
 /test_libvpx
+/tools.dox
+/tools/*.dox
+/tools/tiny_ssim
 /vp8_api1_migration.dox
 /vp[89x]_rtcd.h
 /vpx.pc
diff --git a/build/make/Android.mk b/build/make/Android.mk
index 36120170e..09bdc5d2f 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -71,7 +71,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
   include $(CONFIG_DIR)libs-armv7-android-gcc.mk
   LOCAL_ARM_MODE := arm
 else ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-  include $(CONFIG_DIR)libs-armv8-android-gcc.mk
+  include $(CONFIG_DIR)libs-arm64-android-gcc.mk
   LOCAL_ARM_MODE := arm
 else ifeq ($(TARGET_ARCH_ABI),x86)
   include $(CONFIG_DIR)libs-x86-android-gcc.mk
@@ -101,8 +101,8 @@ LOCAL_CFLAGS := -O3
 # like x86inc.asm and x86_abi_support.asm
 LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)

-.PRECIOUS: %.asm.s
-$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm
+.PRECIOUS: %.asm.S
+$(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm
 	@mkdir -p $(dir $@)
 	@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@

@@ -132,7 +132,7 @@ endif

 # Pull out assembly files, splitting NEON from the rest.  This is
 # done to specify that the NEON assembly files use NEON assembler flags.
-# x86 assembly matches %.asm, arm matches %.asm.s
+# x86 assembly matches %.asm, arm matches %.asm.S

 # x86:
@@ -140,12 +140,12 @@ CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE))
 LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file))

 # arm:
-CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
+CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.S, $(CODEC_SRCS_UNIQUE))
 CODEC_SRCS_ASM_ARM = $(foreach v, \
                      $(CODEC_SRCS_ASM_ARM_ALL), \
                      $(if $(findstring neon,$(v)),,$(v)))
-CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
-                         $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \
+                         $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
                          $(CODEC_SRCS_ASM_ARM))
 LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)

@@ -153,18 +153,19 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
   CODEC_SRCS_ASM_NEON = $(foreach v, \
                         $(CODEC_SRCS_ASM_ARM_ALL),\
                         $(if $(findstring neon,$(v)),$(v),))
-  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
-                                $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \
+                                $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
                                 $(CODEC_SRCS_ASM_NEON))
-  LOCAL_SRC_FILES += $(patsubst %.s, \
-                     %.s.neon, \
+  LOCAL_SRC_FILES += $(patsubst %.S, \
+                     %.S.neon, \
                      $(CODEC_SRCS_ASM_NEON_ADS2GAS))
 endif

 LOCAL_CFLAGS += \
     -DHAVE_CONFIG_H=vpx_config.h \
     -I$(LIBVPX_PATH) \
-    -I$(ASM_CNV_PATH)
+    -I$(ASM_CNV_PATH) \
+    -I$(ASM_CNV_PATH)/libvpx

 LOCAL_MODULE := libvpx

@@ -185,7 +186,8 @@ endif
 $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
 $$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h

-ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),)
+rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a
+ifneq ($(findstring $(TARGET_ARCH_ABI),$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
 $$(rtcd_dep_template_SRCS): vpx_config.asm
 endif
 endef
diff --git a/build/make/Makefile b/build/make/Makefile
index 469eb74c3..cba605786 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -90,7 +90,7 @@ all:

 .PHONY: clean
 clean::
-	rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.s.o=.asm.s)
+	rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.S.o=.asm.S)
 	rm -f $(CLEAN-OBJS)

 .PHONY: clean
@@ -180,13 +180,13 @@ $(BUILD_PFX)%.asm.o: %.asm
 	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<

-$(BUILD_PFX)%.s.d: %.s
+$(BUILD_PFX)%.S.d: %.S
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
 	$(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \
             --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@

-$(BUILD_PFX)%.s.o: %.s
+$(BUILD_PFX)%.S.o: %.S
 	$(if $(quiet),@echo "    [AS] $@")
 	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
@@ -198,8 +198,8 @@ $(BUILD_PFX)%.c.S: %.c
 	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(CC) -S $(CFLAGS) -o $@ $<

-.PRECIOUS: %.asm.s
-$(BUILD_PFX)%.asm.s: %.asm
+.PRECIOUS: %.asm.S
+$(BUILD_PFX)%.asm.S: %.asm
 	$(if $(quiet),@echo "    [ASM CONVERSION] $@")
 	$(qexec)mkdir -p $(dir $@)
 	$(qexec)$(ASM_CONVERSION) <$< >$@
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 35609e89a..f050fa06a 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -635,7 +635,7 @@ setup_gnu_toolchain() {
   AS=${AS:-${CROSS}as}
   STRIP=${STRIP:-${CROSS}strip}
   NM=${NM:-${CROSS}nm}
-  AS_SFX=.s
+  AS_SFX=.S
   EXE_SFX=
 }

@@ -926,7 +926,7 @@ EOF
         ;;
       vs*)
         asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
-        AS_SFX=.s
+        AS_SFX=.S
         msvs_arch_dir=arm-msvs
         disable_feature multithread
         disable_feature unit_tests
@@ -1034,7 +1034,7 @@ EOF
       STRIP="$(${XCRUN_FIND} strip)"
      NM="$(${XCRUN_FIND} nm)"
       RANLIB="$(${XCRUN_FIND} ranlib)"
-      AS_SFX=.s
+      AS_SFX=.S
       LD="${CXX:-$(${XCRUN_FIND} ld)}"

       # ASFLAGS is written here instead of using check_add_asflags
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index e98611d10..e3395afa2 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -203,7 +203,7 @@ for opt in "$@"; do
             # The paths in file_list are fixed outside of the loop.
             file_list[${#file_list[@]}]="$opt"
             case "$opt" in
-                *.asm|*.s) uses_asm=true
+                *.asm|*.[Ss]) uses_asm=true
                 ;;
             esac
             ;;
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -22,6 +22,7 @@ show_help(){
 Advanced options:
   ${toggle_libs}                  libraries
   ${toggle_examples}              examples
+  ${toggle_tools}                 tools
   ${toggle_docs}                  documentation
   ${toggle_unit_tests}            unit tests
   ${toggle_decode_perf_tests}     build decoder perf tests with unit tests
@@ -155,7 +156,7 @@ all_platforms="${all_platforms} generic-gnu"

 # all_targets is a list of all targets that can be configured
 # note that these should be in dependency order for now.
-all_targets="libs examples docs"
+all_targets="libs examples tools docs"

 # all targets available are enabled, by default.
 for t in ${all_targets}; do
@@ -331,6 +332,7 @@ CMDLINE_SELECT="
     libs
     examples
+    tools
     docs
     libc
     as
@@ -476,7 +478,7 @@ EOF
     #
     # Write makefiles for all enabled targets
     #
-    for tgt in libs examples docs solution; do
+    for tgt in libs examples tools docs solution; do
         tgt_fn="$tgt-$toolchain.mk"

         if enabled $tgt; then
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index cecdce080..fa2df7271 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -84,6 +84,8 @@ static const arg_def_t speed_arg =
     ARG_DEF("sp", "speed", 1, "speed configuration");
 static const arg_def_t aqmode_arg =
     ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
+static const arg_def_t bitrates_arg =
+    ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]");

 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -124,6 +126,7 @@ static const arg_def_t *svc_args[] = {
   &frames_arg,
 #endif
   &speed_arg,
   &rc_end_usage_arg,
+  &bitrates_arg,
   NULL
 };
@@ -250,6 +253,9 @@ static void parse_command_line(int argc, const char **argv_,
     } else if (arg_match(&arg, &scale_factors_arg, argi)) {
       snprintf(string_options, sizeof(string_options), "%s scale-factors=%s",
                string_options, arg.val);
+    } else if (arg_match(&arg, &bitrates_arg, argi)) {
+      snprintf(string_options, sizeof(string_options), "%s bitrates=%s",
+               string_options, arg.val);
     } else if (arg_match(&arg, &passes_arg, argi)) {
       passes = arg_parse_uint(&arg);
       if (passes < 1 || passes > 2) {
@@ -417,7 +423,6 @@ static void set_rate_control_stats(struct RateControlStats *rc,
   for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
     for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
       const int layer = sl * cfg->ts_number_layers + tl;
-      const int tlayer0 = sl * cfg->ts_number_layers;
       if (cfg->ts_number_layers == 1)
         rc->layer_framerate[layer] = framerate;
       else
@@ -428,8 +433,8 @@ static void set_rate_control_stats(struct RateControlStats *rc,
                 cfg->layer_target_bitrate[layer - 1]) /
             (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
       } else {
-        rc->layer_pfb[tlayer0] = 1000.0 * cfg->layer_target_bitrate[tlayer0] /
-                                 rc->layer_framerate[tlayer0];
+        rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
+                               rc->layer_framerate[layer];
       }
       rc->layer_input_frames[layer] = 0;
       rc->layer_enc_frames[layer] = 0;
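For context on the `layer_pfb` fix in the hunk above: layers are flattened as `sl * ts_number_layers + tl`, and each layer's per-frame bandwidth should come from that layer's own target bitrate and framerate rather than the base temporal layer's. A minimal C sketch of the corrected arithmetic; the standalone helper is hypothetical, only the field names mirror the example's `cfg`/`rc` data:

```c
#include <stdio.h>

/* Hypothetical helper mirroring the corrected hunk: per-frame budget in
 * bits for layer (sl, tl), from a kbps target and a layer framerate. */
static double layer_per_frame_bits(const int *layer_target_bitrate_kbps,
                                   const double *layer_framerate,
                                   int ts_number_layers, int sl, int tl) {
  const int layer = sl * ts_number_layers + tl; /* flattened layer index */
  return 1000.0 * layer_target_bitrate_kbps[layer] / layer_framerate[layer];
}

int main(void) {
  const int bitrate_kbps[2] = { 200, 400 };   /* sl0/tl0, sl0/tl1 */
  const double framerate[2] = { 15.0, 30.0 }; /* temporal layers halve rate */
  /* 200 kbps at 15 fps -> ~13333 bits per frame for the base layer. */
  printf("%.0f\n", layer_per_frame_bits(bitrate_kbps, framerate, 2, 0, 0));
  return 0;
}
```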
@@ -449,12 +454,13 @@ static void printout_rate_control_summary(
     struct RateControlStats *rc, vpx_codec_enc_cfg_t *cfg, int frame_cnt) {
   unsigned int sl, tl;
-  int tot_num_frames = 0;
   double perc_fluctuation = 0.0;
+  int tot_num_frames = 0;
   printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
   printf("Rate control layer stats for sl%d tl%d layer(s):\n\n",
          cfg->ss_number_layers, cfg->ts_number_layers);
   for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+    tot_num_frames = 0;
     for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
       const int layer = sl * cfg->ts_number_layers + tl;
       const int num_dropped =
@@ -462,7 +468,7 @@ static void printout_rate_control_summary(
               ? (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer])
               : (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] -
                  1);
-      if (!sl) tot_num_frames += rc->layer_input_frames[layer];
+      tot_num_frames += rc->layer_input_frames[layer];
       rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] *
                                           rc->layer_encoding_bitrate[layer] /
                                           tot_num_frames;
@@ -620,7 +626,7 @@ int main(int argc, const char **argv) {
   struct RateControlStats rc;
   vpx_svc_layer_id_t layer_id;
   vpx_svc_ref_frame_config_t ref_frame_config;
-  int sl, tl;
+  unsigned int sl, tl;
   double sum_bitrate = 0.0;
   double sum_bitrate2 = 0.0;
   double framerate = 30.0;
@@ -695,6 +701,8 @@ int main(int argc, const char **argv) {
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
   if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+  if (svc_ctx.speed >= 5)
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);

   // Encode frames
   while (!end_of_stream) {
@@ -730,7 +738,7 @@ int main(int argc, const char **argv) {
                             &ref_frame_config);
     // Keep track of input frames, to account for frame drops in rate control
     // stats/metrics.
-    for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+    for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) {
       ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
                               layer_id.temporal_layer_id];
     }
@@ -793,7 +801,7 @@ int main(int argc, const char **argv) {
                 rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
                 // Keep count of rate control stats per layer, for non-key
                 // frames.
-                if (tl == layer_id.temporal_layer_id &&
+                if (tl == (unsigned int)layer_id.temporal_layer_id &&
                     !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
                   rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
                   rc.layer_avg_rate_mismatch[layer] +=
@@ -807,7 +815,7 @@ int main(int argc, const char **argv) {
               // Update for short-time encoding bitrate states, for moving
               // window of size rc->window, shifted by rc->window / 2.
               // Ignore first window segment, due to key frame.
-              if (frame_cnt > rc.window_size) {
+              if (frame_cnt > (unsigned int)rc.window_size) {
                 tl = layer_id.temporal_layer_id;
                 for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
                   sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
@@ -823,13 +831,14 @@ int main(int argc, const char **argv) {
               }

               // Second shifted window.
-              if (frame_cnt > rc.window_size + rc.window_size / 2) {
+              if (frame_cnt >
+                  (unsigned int)(rc.window_size + rc.window_size / 2)) {
                 tl = layer_id.temporal_layer_id;
                 for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
                   sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
                 }

-                if (frame_cnt > 2 * rc.window_size &&
+                if (frame_cnt > (unsigned int)(2 * rc.window_size) &&
                     frame_cnt % rc.window_size == 0) {
                   rc.window_count += 1;
                   rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
@@ -842,10 +851,11 @@ int main(int argc, const char **argv) {
             }
 #endif
           }
-
+          /*
           printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
                  !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
                  (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+          */
           if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
             si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
           ++frames_received;
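The short-time bitrate bookkeeping above averages frame sizes over a moving window of `window_size` frames, with a second window shifted by half that size so the two estimates interleave; the first window is skipped because the keyframe dominates it. A minimal, self-contained C sketch of the same accumulation under the simplifying assumptions of a single layer, a fixed framerate, and a constant stand-in frame size:

```c
#include <stdio.h>

#define WINDOW_SIZE 30
#define FRAMERATE 30.0

int main(void) {
  double sum_kbits = 0.0;       /* accumulated kbps samples in this window */
  double avg_window_kbps = 0.0; /* running sum of completed window means */
  int window_count = 0;

  for (int frame = 1; frame <= 300; ++frame) {
    const int frame_bytes = 1000; /* stand-in for cx_pkt->data.frame.sz */
    /* Skip the first window: it is dominated by the keyframe. */
    if (frame > WINDOW_SIZE) {
      sum_kbits += 0.001 * 8.0 * frame_bytes * FRAMERATE;
      if (frame % WINDOW_SIZE == 0) { /* window complete: fold into mean */
        ++window_count;
        avg_window_kbps += sum_kbits / WINDOW_SIZE;
        sum_kbits = 0.0;
      }
    }
  }
  printf("avg short-time bitrate: %.1f kbps\n",
         avg_window_kbps / window_count);
  return 0;
}
```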
diff --git a/libs.mk b/libs.mk
--- a/libs.mk
+++ b/libs.mk
@@ -12,7 +12,7 @@
 # ARM assembly files are written in RVCT-style. We use some make magic to
 # filter those files to allow GCC compilation
 ifeq ($(ARCH_ARM),yes)
-  ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.s,.asm)
+  ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm)
 else
   ASM:=.asm
 endif
@@ -366,7 +366,7 @@ endif
 #
 # Add assembler dependencies for configuration.
 #
-$(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
+$(filter %.S.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
 $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm

diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 12eaa80e7..2921e5ddf 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -188,6 +188,7 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
     "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf" },
   { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" },
   { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" },
+  { 2, "invalid-crbug-629481.webm" },
 };

 INSTANTIATE_TEST_CASE_P(
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 9eb4d9dbb..0c704c5c8 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -41,13 +41,42 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
     partial_itxfm_ = GET_PARAM(2);
     tx_size_ = GET_PARAM(3);
     last_nonzero_ = GET_PARAM(4);
+
+    switch (tx_size_) {
+      case TX_4X4: size_ = 4; break;
+      case TX_8X8: size_ = 8; break;
+      case TX_16X16: size_ = 16; break;
+      case TX_32X32: size_ = 32; break;
+      default: FAIL() << "Wrong Size!"; break;
+    }
+    block_size_ = size_ * size_;
+
+    input_block_ = reinterpret_cast<tran_low_t *>(
+        vpx_memalign(16, sizeof(*input_block_) * block_size_));
+    output_block_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, sizeof(*output_block_) * block_size_));
+    output_block_ref_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, sizeof(*output_block_ref_) * block_size_));
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  virtual void TearDown() {
+    vpx_free(input_block_);
+    input_block_ = NULL;
+    vpx_free(output_block_);
+    output_block_ = NULL;
+    vpx_free(output_block_ref_);
+    output_block_ref_ = NULL;
+    libvpx_test::ClearSystemState();
+  }

  protected:
   int last_nonzero_;
   TX_SIZE tx_size_;
+  tran_low_t *input_block_;
+  uint8_t *output_block_;
+  uint8_t *output_block_ref_;
+  int size_;
+  int block_size_;
   FwdTxfmFunc ftxfm_;
   InvTxfmFunc full_itxfm_;
   InvTxfmFunc partial_itxfm_;
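The fixture above moves the test buffers from stack `DECLARE_ALIGNED` arrays to per-transform heap allocations, released in `TearDown`. A minimal C sketch of the same aligned allocate/release pattern, using C11 `aligned_alloc` in place of libvpx's `vpx_memalign`/`vpx_free` (the transform calls are elided):

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
  const int size = 32;                /* e.g., TX_32X32 */
  const int block_size = size * size; /* coefficients per block */

  /* aligned_alloc requires the size to be a multiple of the alignment;
   * both requests below satisfy that for 16-byte alignment. */
  int32_t *input = aligned_alloc(16, sizeof(*input) * block_size);
  uint8_t *output = aligned_alloc(16, sizeof(*output) * block_size);
  if (input == NULL || output == NULL) return 1;

  memset(input, 0, sizeof(*input) * block_size);
  memset(output, 0, sizeof(*output) * block_size);

  /* ... run full and partial inverse transforms, compare outputs ... */

  free(input);
  free(output);
  return 0;
}
```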
@@ -55,95 +84,62 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
 };

 TEST_P(PartialIDctTest, RunQuantCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int size;
-  switch (tx_size_) {
-    case TX_4X4: size = 4; break;
-    case TX_8X8: size = 8; break;
-    case TX_16X16: size = 16; break;
-    case TX_32X32: size = 32; break;
-    default: FAIL() << "Wrong Size!"; break;
-  }
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
   const int count_test_block = 1000;
-  const int block_size = size * size;
   DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
   DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
-  int max_error = 0;

   for (int i = 0; i < count_test_block; ++i) {
     // clear out destination buffer
-    memset(dst1, 0, sizeof(*dst1) * block_size);
-    memset(dst2, 0, sizeof(*dst2) * block_size);
-    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
-    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+    memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+    memset(output_block_, 0, sizeof(*output_block_) * block_size_);
+    memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);

     ACMRandom rnd(ACMRandom::DeterministicSeed());

     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-255, 255].
       if (i == 0) {
-        for (int j = 0; j < block_size; ++j) input_extreme_block[j] = 255;
+        for (int j = 0; j < block_size_; ++j) input_extreme_block[j] = 255;
       } else if (i == 1) {
-        for (int j = 0; j < block_size; ++j) input_extreme_block[j] = -255;
+        for (int j = 0; j < block_size_; ++j) input_extreme_block[j] = -255;
       } else {
-        for (int j = 0; j < block_size; ++j) {
+        for (int j = 0; j < block_size_; ++j) {
           input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
         }
       }

-      ftxfm_(input_extreme_block, output_ref_block, size);
+      ftxfm_(input_extreme_block, output_ref_block, size_);

       // quantization with maximum allowed step sizes
-      test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+      input_block_[0] = (output_ref_block[0] / 1336) * 1336;
       for (int j = 1; j < last_nonzero_; ++j) {
-        test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] =
+        input_block_[vp9_default_scan_orders[tx_size_].scan[j]] =
             (output_ref_block[j] / 1828) * 1828;
       }
     }

-    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+    ASM_REGISTER_STATE_CHECK(
+        full_itxfm_(input_block_, output_block_ref_, size_));
+    ASM_REGISTER_STATE_CHECK(
+        partial_itxfm_(input_block_, output_block_, size_));

-    for (int j = 0; j < block_size; ++j) {
-      const int diff = dst1[j] - dst2[j];
-      const int error = diff * diff;
-      if (max_error < error) max_error = error;
-    }
+    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+                        sizeof(*output_block_) * block_size_))
+        << "Error: partial inverse transform produces different results";
   }
-
-  EXPECT_EQ(0, max_error)
-      << "Error: partial inverse transform produces different results";
 }

 TEST_P(PartialIDctTest, ResultsMatch) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int size;
-  switch (tx_size_) {
-    case TX_4X4: size = 4; break;
-    case TX_8X8: size = 8; break;
-    case TX_16X16: size = 16; break;
-    case TX_32X32: size = 32; break;
-    default: FAIL() << "Wrong Size!"; break;
-  }
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
   const int count_test_block = 1000;
   const int max_coeff = 32766 / 4;
-  const int block_size = size * size;
-  int max_error = 0;
   for (int i = 0; i < count_test_block; ++i) {
     // clear out destination buffer
-    memset(dst1, 0, sizeof(*dst1) * block_size);
-    memset(dst2, 0, sizeof(*dst2) * block_size);
-    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
-    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+    memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+    memset(output_block_, 0, sizeof(*output_block_) * block_size_);
+    memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);
     int max_energy_leftover = max_coeff * max_coeff;
     for (int j = 0; j < last_nonzero_; ++j) {
       int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
@@ -153,24 +149,42 @@ TEST_P(PartialIDctTest, ResultsMatch) {
         max_energy_leftover = 0;
         coef = 0;
       }
-      test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] = coef;
+      input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coef;
     }

-    memcpy(test_coef_block2, test_coef_block1,
-           sizeof(*test_coef_block2) * block_size);
+    ASM_REGISTER_STATE_CHECK(
+        full_itxfm_(input_block_, output_block_ref_, size_));
+    ASM_REGISTER_STATE_CHECK(
+        partial_itxfm_(input_block_, output_block_, size_));

-    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+                        sizeof(*output_block_) * block_size_))
+        << "Error: partial inverse transform produces different results";
+  }
+}

-    for (int j = 0; j < block_size; ++j) {
-      const int diff = dst1[j] - dst2[j];
-      const int error = diff * diff;
-      if (max_error < error) max_error = error;
+TEST_P(PartialIDctTest, AddOutputBlock) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 10;
+  for (int i = 0; i < count_test_block; ++i) {
+    memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+    for (int j = 0; j < last_nonzero_; ++j) {
+      input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = 10;
     }
-  }

-  EXPECT_EQ(0, max_error)
-      << "Error: partial inverse transform produces different results";
+    for (int j = 0; j < block_size_; ++j) {
+      output_block_[j] = output_block_ref_[j] = rnd.Rand8();
+    }
+
+    ASM_REGISTER_STATE_CHECK(
+        full_itxfm_(input_block_, output_block_ref_, size_));
+    ASM_REGISTER_STATE_CHECK(
+        partial_itxfm_(input_block_, output_block_, size_));
+
+    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+                        sizeof(*output_block_) * block_size_))
+        << "Error: Transform results are not correctly added to output.";
+  }
 }

 using std::tr1::make_tuple;
@@ -214,7 +228,7 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
                                  &vpx_idct4x4_1_add_neon, TX_4X4, 1)));
 #else   // !CONFIG_VP9_HIGHBITDEPTH
-// 32x32_34_ 32x32_135_ are implemented using the 1024 version.
+// 32x32_135_ is implemented using the 1024 version.
 INSTANTIATE_TEST_CASE_P(
     NEON, PartialIDctTest,
     ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
@@ -222,7 +236,7 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
                                  &vpx_idct32x32_1024_add_neon, TX_32X32, 135),
                       make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
-                                 &vpx_idct32x32_1024_add_neon, TX_32X32, 34),
+                                 &vpx_idct32x32_34_add_neon, TX_32X32, 34),
                       make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
                                  &vpx_idct32x32_1_add_neon, TX_32X32, 1),
                       make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c,
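`RunQuantCheck` above simulates the encoder's coarsest quantizer by snapping each forward-transform coefficient to a multiple of the largest allowed step (1336 for DC, 1828 for AC, constants copied from the test). A minimal C sketch of that snapping:

```c
#include <stdio.h>

/* Integer division followed by multiplication snaps a coefficient to the
 * nearest-toward-zero multiple of the quantizer step, as in RunQuantCheck. */
static int quantize_coeff(int coeff, int step) {
  return (coeff / step) * step;
}

int main(void) {
  printf("%d\n", quantize_coeff(3000, 1336));  /* DC step: 3000 -> 2672 */
  printf("%d\n", quantize_coeff(-2000, 1828)); /* AC step: -2000 -> -1828 */
  return 0;
}
```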
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 2e34fed06..4f6592647 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -7,22 +7,23 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"

-typedef void (*PostProcFunc)(unsigned char *src_ptr, unsigned char *dst_ptr,
-                             int src_pixels_per_line, int dst_pixels_per_line,
-                             int cols, unsigned char *flimit, int size);
+typedef void (*VpxPostProcDownAndAcrossMbRowFunc)(
+    unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line,
+    int dst_pixels_per_line, int cols, unsigned char *flimit, int size);

 namespace {

-class VPxPostProcessingFilterTest
-    : public ::testing::TestWithParam<PostProcFunc> {
+class VpxPostProcDownAndAcrossMbRowTest
+    : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
  public:
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 };
@@ -30,7 +31,7 @@ class VPxPostProcessingFilterTest
 // Test routine for the VPx post-processing function
 // vpx_post_proc_down_and_across_mb_row_c.

-TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
   // Size of the underlying data block that will be filtered.
   const int block_width = 16;
   const int block_height = 16;
@@ -78,14 +79,14 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
                                        input_stride, output_stride,
                                        block_width, flimits, 16));

-  static const uint8_t expected_data[block_height] = { 4, 3, 1, 1, 1, 1, 1, 1,
-                                                       1, 1, 1, 1, 1, 1, 3, 4 };
+  static const uint8_t kExpectedOutput[block_height] = {
+    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
+  };

   pixel_ptr = dst_image_ptr;
   for (int i = 0; i < block_height; ++i) {
     for (int j = 0; j < block_width; ++j) {
-      EXPECT_EQ(expected_data[i], pixel_ptr[j])
-          << "VPxPostProcessingFilterTest failed with invalid filter output";
+      ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]);
     }
     pixel_ptr += output_stride;
   }
@@ -96,18 +97,18 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
 };

 INSTANTIATE_TEST_CASE_P(
-    C, VPxPostProcessingFilterTest,
+    C, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_c));

 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VPxPostProcessingFilterTest,
+    SSE2, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
 #endif

 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
-    MSA, VPxPostProcessingFilterTest,
+    MSA, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
 #endif
diff --git a/test/test-data.mk b/test/test-data.mk
index 80b802e0a..e528c9182 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -775,6 +775,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.iv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res

 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # Encode / Decode test
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index b97ae967e..eda46c918 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -840,3 +840,5 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_
 787f04f0483320d536894282f3358a4f8cac1cf9 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
 91d3cefd0deb98f3b0caf3a2d900ec7a7605e53a *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf
 1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res
+70057835bf29d14e66699ce5f022df2551fb6b37 *invalid-crbug-629481.webm
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-crbug-629481.webm.res
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index a89630753..d8eb82426 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -29,6 +29,8 @@ namespace {
 typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left);

+const int kBPS = 32;
+const int kTotalPixels = 32 * kBPS;
 const int kNumVp9IntraPredFuncs = 13;
 const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = {
   "DC_PRED",   "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED",
@@ -36,107 +38,121 @@ const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = {
   "D207_PRED", "D63_PRED",     "TM_PRED"
 };

+template <typename Pixel>
+struct IntraPredTestMem {
+  void Init(int block_size, int bd) {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    Pixel *const above = above_mem + 16;
+    const int mask = (1 << bd) - 1;
+    for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand16() & mask;
+    for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask;
+    for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask;
+
+    // some code assumes the top row has been extended:
+    // d45/d63 C-code, for instance, but not the assembly.
+    // TODO(jzern): this style of extension isn't strictly necessary.
+    ASSERT_LE(block_size, kBPS);
+    for (int i = block_size; i < 2 * kBPS; ++i) {
+      above[i] = above[block_size - 1];
+    }
+  }
+
+  DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, left[kBPS]);
+  DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
+};
+
+typedef IntraPredTestMem<uint8_t> Vp9IntraPredTestMem;
+
+void CheckMd5Signature(const char name[], const char *const signatures[],
+                       const void *data, size_t data_size, int elapsed_time,
+                       int idx) {
+  libvpx_test::MD5 md5;
+  md5.Add(reinterpret_cast<const uint8_t *>(data), data_size);
+  printf("Mode %s[%12s]: %5d ms     MD5: %s\n", name, kVp9IntraPredNames[idx],
+         elapsed_time, md5.Get());
+  EXPECT_STREQ(signatures[idx], md5.Get());
+}
+
 void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs,
-                   const char *const pred_func_names[], int num_funcs,
-                   const char *const signatures[], int block_size,
-                   int num_pixels_per_test) {
-  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
-  const int kBPS = 32;
-  const int kTotalPixels = 32 * kBPS;
-  DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]);
-  DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]);
-  DECLARE_ALIGNED(16, uint8_t, left[kBPS]);
-  DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]);
-  uint8_t *const above = above_mem + 16;
-  for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8();
-  for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8();
-  for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8();
-  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
-
-  // some code assumes the top row has been extended:
-  // d45/d63 C-code, for instance, but not the assembly.
-  // TODO(jzern): this style of extension isn't strictly necessary.
-  ASSERT_LE(block_size, kBPS);
-  memset(above + block_size, above[block_size - 1], 2 * kBPS - block_size);
-
-  for (int k = 0; k < num_funcs; ++k) {
+                   const char *const signatures[], int block_size) {
+  const int kNumTests = static_cast<int>(
+      2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs));
+  Vp9IntraPredTestMem intra_pred_test_mem;
+  const uint8_t *const above = intra_pred_test_mem.above_mem + 16;
+
+  intra_pred_test_mem.Init(block_size, 8);
+
+  for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
     if (pred_funcs[k] == NULL) continue;
-    memcpy(src, ref_src, sizeof(src));
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
     vpx_usec_timer timer;
     vpx_usec_timer_start(&timer);
     for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
-      pred_funcs[k](src, kBPS, above, left);
+      pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
+                    intra_pred_test_mem.left);
     }
     libvpx_test::ClearSystemState();
     vpx_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
-    libvpx_test::MD5 md5;
-    md5.Add(src, sizeof(src));
-    printf("Mode %s[%12s]: %5d ms     MD5: %s\n", name, pred_func_names[k],
-           elapsed_time, md5.Get());
-    EXPECT_STREQ(signatures[k], md5.Get());
+    CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
+                      sizeof(intra_pred_test_mem.src), elapsed_time, k);
   }
 }

 void TestIntraPred4(VpxPredFunc const *pred_funcs) {
-  static const int kNumVp9IntraFuncs = 13;
-  static const char *const kSignatures[kNumVp9IntraFuncs] = {
-    "4334156168b34ab599d9b5b30f522fe9", "bc4649d5ba47c7ff178d92e475960fb0",
-    "8d316e5933326dcac24e1064794b5d12", "a27270fed024eafd762c95de85f4da51",
-    "c33dff000d4256c2b8f3bf9e9bab14d2", "44d8cddc2ad8f79b8ed3306051722b4f",
-    "eb54839b2bad6699d8946f01ec041cd0", "ecb0d56ae5f677ea45127ce9d5c058e4",
-    "0b7936841f6813da818275944895b574", "9117972ef64f91a58ff73e1731c81db2",
-    "c56d5e8c729e46825f46dd5d3b5d508a", "c0889e2039bcf7bcb5d2f33cdca69adc",
-    "309a618577b27c648f9c5ee45252bc8f",
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "e7ed7353c3383fff942e500e9bfe82fe", "2a4a26fcc6ce005eadc08354d196c8a9",
+    "269d92eff86f315d9c38fe7640d85b15", "ae2960eea9f71ee3dabe08b282ec1773",
+    "6c1abcc44e90148998b51acd11144e9c", "f7bb3186e1ef8a2b326037ff898cad8e",
+    "364c1f3fb2f445f935aec2a70a67eaa4", "141624072a4a56773f68fadbdd07c4a7",
+    "7be49b08687a5f24df3a2c612fca3876", "459bb5d9fd5b238348179c9a22108cd6",
+    "73edb8831bf1bdfce21ae8eaa43b1234", "2e2457f2009c701a355a8b25eb74fcda",
+    "52ae4e8bdbe41494c1f43051d4dd7f0b"
   };
-  TestIntraPred("Intra4", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
-                kSignatures, 4, 4 * 4 * kNumVp9IntraFuncs);
+  TestIntraPred("Intra4", pred_funcs, kSignatures, 4);
 }

 void TestIntraPred8(VpxPredFunc const *pred_funcs) {
-  static const int kNumVp9IntraFuncs = 13;
-  static const char *const kSignatures[kNumVp9IntraFuncs] = {
-    "7694ddeeefed887faf9d339d18850928", "7d726b1213591b99f736be6dec65065b",
-    "19c5711281357a485591aaf9c96c0a67", "ba6b66877a089e71cd938e3b8c40caac",
-    "802440c93317e0f8ba93fab02ef74265", "9e09a47a15deb0b9d8372824f9805080",
-    "b7c2d8c662268c0c427da412d7b0311d", "78339c1c60bb1d67d248ab8c4da08b7f",
-    "5c97d70f7d47de1882a6cd86c165c8a9", "8182bf60688b42205acd95e59e967157",
-    "08323400005a297f16d7e57e7fe1eaac", "95f7bfc262329a5849eda66d8f7c68ce",
-    "815b75c8e0d91cc1ae766dc5d3e445a3",
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
"d8bbae5d6547cfc17e4f5f44c8730e88", "373bab6d931868d41a601d9d88ce9ac3", + "6fdd5ff4ff79656c14747598ca9e3706", "d9661c2811d6a73674f40ffb2b841847", + "7c722d10b19ccff0b8c171868e747385", "f81dd986eb2b50f750d3a7da716b7e27", + "d500f2c8fc78f46a4c74e4dcf51f14fb", "0e3523f9cab2142dd37fd07ec0760bce", + "79ac4efe907f0a0f1885d43066cfedee", "19ecf2432ac305057de3b6578474eec6", + "4f985b61acc6dd5d2d2585fa89ea2e2d", "f1bb25a9060dd262f405f15a38f5f674", + "209ea00801584829e9a0f7be7d4a74ba" }; - TestIntraPred("Intra8", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 8, 8 * 8 * kNumVp9IntraFuncs); + TestIntraPred("Intra8", pred_funcs, kSignatures, 8); } void TestIntraPred16(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "b40dbb555d5d16a043dc361e6694fe53", "fb08118cee3b6405d64c1fd68be878c6", - "6c190f341475c837cc38c2e566b64875", "db5c34ccbe2c7f595d9b08b0dc2c698c", - "a62cbfd153a1f0b9fed13e62b8408a7a", "143df5b4c89335e281103f610f5052e4", - "d87feb124107cdf2cfb147655aa0bb3c", "7841fae7d4d47b519322e6a03eeed9dc", - "f6ebed3f71cbcf8d6d0516ce87e11093", "3cc480297dbfeed01a1c2d78dd03d0c5", - "b9f69fa6532b372c545397dcb78ef311", "a8fe1c70432f09d0c20c67bdb6432c4d", - "b8a41aa968ec108af447af4217cba91b", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "50971c07ce26977d30298538fffec619", "527a6b9e0dc5b21b98cf276305432bef", + "7eff2868f80ebc2c43a4f367281d80f7", "67cd60512b54964ef6aff1bd4816d922", + "48371c87dc95c08a33b2048f89cf6468", "b0acf2872ee411d7530af6d2625a7084", + "f32aafed4d8d3776ed58bcb6188756d5", "dae208f3dca583529cff49b73f7c4183", + "7af66a2f4c8e0b4908e40f047e60c47c", "125e3ab6ab9bc961f183ec366a7afa88", + "6b90f25b23983c35386b9fd704427622", "f8d6b11d710edc136a7c62c917435f93", + "ed308f18614a362917f411c218aee532" }; - TestIntraPred("Intra16", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 16, 16 * 16 * kNumVp9IntraFuncs); + TestIntraPred("Intra16", pred_funcs, kSignatures, 16); } void TestIntraPred32(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "558541656d84f9ae7896db655826febe", "b3587a1f9a01495fa38c8cd3c8e2a1bf", - "4c6501e64f25aacc55a2a16c7e8f0255", "b3b01379ba08916ef6b1b35f7d9ad51c", - "0f1eb38b6cbddb3d496199ef9f329071", "911c06efb9ed1c3b4c104b232b55812f", - "9225beb0ddfa7a1d24eaa1be430a6654", "0a6d584a44f8db9aa7ade2e2fdb9fc9e", - "b01c9076525216925f3456f034fb6eee", "d267e20ad9e5cd2915d1a47254d3d149", - "ed012a4a5da71f36c2393023184a0e59", "f162b51ed618d28b936974cff4391da5", - "9e1370c6d42e08d357d9612c93a71cfc", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "a0a618c900e65ae521ccc8af789729f2", "985aaa7c72b4a6c2fb431d32100cf13a", + "10662d09febc3ca13ee4e700120daeb5", "b3b01379ba08916ef6b1b35f7d9ad51c", + "9f4261755795af97e34679c333ec7004", "bc2c9da91ad97ef0d1610fb0a9041657", + "75c79b1362ad18abfcdb1aa0aacfc21d", "4039bb7da0f6860090d3c57b5c85468f", + "b29fff7b61804e68383e3a609b33da58", "e1aa5e49067fd8dba66c2eb8d07b7a89", + "4e042822909c1c06d3b10a88281df1eb", "72eb9d9e0e67c93f4c66b70348e9fef7", + "a22d102bcb51ca798aac12ca4ae8f2e8" }; - TestIntraPred("Intra32", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 32, 32 * 32 * kNumVp9IntraFuncs); + TestIntraPred("Intra32", pred_funcs, kSignatures, 32); } } // namespace @@ -153,7 +169,6 @@ void TestIntraPred32(VpxPredFunc const *pred_funcs) { } // 
@@ -153,7 +169,6 @@ void TestIntraPred32(VpxPredFunc const *pred_funcs) {
 }

 // -----------------------------------------------------------------------------
-// 4x4

 INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
                 vpx_dc_left_predictor_4x4_c, vpx_dc_top_predictor_4x4_c,
@@ -163,47 +178,6 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
                 vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
                 vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)

-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
-                vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
-                vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
-                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
-                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
-                vpx_tm_predictor_4x4_sse2)
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL,
-                vpx_d63_predictor_4x4_ssse3, NULL)
-#endif  // HAVE_SSSE3
-
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
-                NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_tm_predictor_4x4_dspr2)
-#endif  // HAVE_DSPR2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
-                vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
-                vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
-                vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
-                vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_4x4_neon)
-#endif  // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
-                vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
-                vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
-                vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_4x4_msa)
-#endif  // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 8x8
-
 INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
                 vpx_dc_left_predictor_8x8_c, vpx_dc_top_predictor_8x8_c,
                 vpx_dc_128_predictor_8x8_c, vpx_v_predictor_8x8_c,
@@ -212,46 +186,6 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
                 vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
                 vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)

-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
-                vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
-                vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
-                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
-                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_d153_predictor_8x8_ssse3,
-                vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
-#endif  // HAVE_SSSE3
-
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
-                NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_tm_predictor_8x8_c)
-#endif  // HAVE_DSPR2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
-                vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
-                vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
-                vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL,
-                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon)
-
-#endif  // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
-                vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
-                vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
-                vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_8x8_msa)
-#endif  // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 16x16
-
 INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
                 vpx_dc_left_predictor_16x16_c, vpx_dc_top_predictor_16x16_c,
                 vpx_dc_128_predictor_16x16_c, vpx_v_predictor_16x16_c,
@@ -260,87 +194,287 @@ INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
                 vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c,
                 vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c)

+INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
+                vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c,
+                vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c,
+                vpx_h_predictor_32x32_c, vpx_d45_predictor_32x32_c,
+                vpx_d135_predictor_32x32_c, vpx_d117_predictor_32x32_c,
+                vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
+                vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
+
 #if HAVE_SSE2
+INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
+                vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
+                vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
+                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
+                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
+                vpx_tm_predictor_4x4_sse2)
+
+INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
+                vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
+                vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
+                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
+
 INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
                 vpx_dc_left_predictor_16x16_sse2,
                 vpx_dc_top_predictor_16x16_sse2,
                 vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
                 vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_16x16_sse2)
+
+INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
+                vpx_dc_left_predictor_32x32_sse2,
+                vpx_dc_top_predictor_32x32_sse2,
+                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
+                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2

 #if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL,
+                vpx_d63_predictor_4x4_ssse3, NULL)
+INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_d153_predictor_8x8_ssse3,
+                vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
 INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_d45_predictor_16x16_ssse3, NULL, NULL,
                 vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3,
                 vpx_d63_predictor_16x16_ssse3, NULL)
+INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_d45_predictor_32x32_ssse3, NULL, NULL,
+                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
+                vpx_d63_predictor_32x32_ssse3, NULL)
 #endif  // HAVE_SSSE3

 #if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
+                NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_tm_predictor_4x4_dspr2)
+INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
+                NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL,
+                NULL, NULL,
+                NULL, NULL, vpx_tm_predictor_8x8_c)
 INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL,
                 NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL,
                 NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_DSPR2

 #if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
+                vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
+                vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
+                vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
+                vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_4x4_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
+                vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
+                vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
+                vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
                 vpx_dc_left_predictor_16x16_neon,
                 vpx_dc_top_predictor_16x16_neon,
                 vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
                 vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, NULL,
                 NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
+                vpx_dc_left_predictor_32x32_neon,
+                vpx_dc_top_predictor_32x32_neon,
+                vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
+                vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON

 #if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
+                vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
+                vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
+                vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_4x4_msa)
+INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
+                vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
+                vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
+                vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_8x8_msa)
 INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa,
                 vpx_dc_left_predictor_16x16_msa,
                 vpx_dc_top_predictor_16x16_msa,
                 vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa,
                 vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_16x16_msa)
+INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
+                vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
+                vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
+                vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_32x32_msa)
 #endif  // HAVE_MSA

 // -----------------------------------------------------------------------------
-// 32x32

-INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
-                vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c,
-                vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c,
-                vpx_h_predictor_32x32_c, vpx_d45_predictor_32x32_c,
-                vpx_d135_predictor_32x32_c, vpx_d117_predictor_32x32_c,
-                vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
-                vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
+#if CONFIG_VP9_HIGHBITDEPTH
+namespace {

-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
-                vpx_dc_left_predictor_32x32_sse2,
-                vpx_dc_top_predictor_32x32_sse2,
-                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_sse2)
-#endif  // HAVE_SSE2
+typedef void (*VpxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
+                                  const uint16_t *above, const uint16_t *left,
+                                  int bd);

-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_d45_predictor_32x32_ssse3, NULL, NULL,
-                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
-                vpx_d63_predictor_32x32_ssse3, NULL)
-#endif  // HAVE_SSSE3
+typedef IntraPredTestMem<uint16_t> Vp9HighbdIntraPredTestMem;

-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
-                vpx_dc_left_predictor_32x32_neon,
-                vpx_dc_top_predictor_32x32_neon,
-                vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
-                vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_neon)
-#endif  // HAVE_NEON
+void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs,
+                         const char *const signatures[], int block_size) {
+  const int kNumTests = static_cast<int>(
+      2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs));
+  Vp9HighbdIntraPredTestMem intra_pred_test_mem;
+  const uint16_t *const above = intra_pred_test_mem.above_mem + 16;

-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
-                vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
-                vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
-                vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_msa)
-#endif  // HAVE_MSA
+  intra_pred_test_mem.Init(block_size, 12);
+
+  for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
+    vpx_usec_timer timer;
+    vpx_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
+                    intra_pred_test_mem.left, 12);
+    }
+    libvpx_test::ClearSystemState();
+    vpx_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+    CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
+                      sizeof(intra_pred_test_mem.src), elapsed_time, k);
+  }
+}
+
+void TestHighbdIntraPred4(VpxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "11f74af6c5737df472f3275cbde062fa", "51bea056b6447c93f6eb8f6b7e8f6f71",
+    "27e97f946766331795886f4de04c5594", "53ab15974b049111fb596c5168ec7e3f",
+    "f0b640bb176fbe4584cf3d32a9b0320a", "729783ca909e03afd4b47111c80d967b",
+    "fbf1c30793d9f32812e4d9f905d53530", "293fc903254a33754133314c6cdba81f",
+    "f8074d704233e73dfd35b458c6092374", "aa6363d08544a1ec4da33d7a0be5640d",
+    "462abcfdfa3d087bb33c9a88f2aec491", "863eab65d22550dd44a2397277c1ec71",
+    "23d61df1574d0fa308f9731811047c4b"
+  };
+  TestHighbdIntraPred("Intra4", pred_funcs, kSignatures, 4);
+}
+
+void TestHighbdIntraPred8(VpxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "03da8829fe94663047fd108c5fcaa71d", "ecdb37b8120a2d3a4c706b016bd1bfd7",
+    "1d4543ed8d2b9368cb96898095fe8a75", "f791c9a67b913cbd82d9da8ecede30e2",
+    "065c70646f4dbaff913282f55a45a441", "51f87123616662ef7c35691497dfd0ba",
+    "2a5b0131ef4716f098ee65e6df01e3dd", "9ffe186a6bc7db95275f1bbddd6f7aba",
+    "a3258a2eae2e2bd55cb8f71351b22998", "8d909f0a2066e39b3216092c6289ece4",
+    "d183abb30b9f24c886a0517e991b22c7", "702a42fe4c7d665dc561b2aeeb60f311",
+    "7b5dbbbe7ae3a4ac2948731600bde5d6"
+  };
+  TestHighbdIntraPred("Intra8", pred_funcs, kSignatures, 8);
+}
+
+void TestHighbdIntraPred16(VpxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "e33cb3f56a878e2fddb1b2fc51cdd275", "c7bff6f04b6052c8ab335d726dbbd52d",
+    "d0b0b47b654a9bcc5c6008110a44589b", "78f5da7b10b2b9ab39f114a33b6254e9",
+    "c78e31d23831abb40d6271a318fdd6f3", "90d1347f4ec9198a0320daecb6ff90b8",
+    "d2c623746cbb64a0c9e29c10f2c57041", "cf28bd387b81ad3e5f1a1c779a4b70a0",
+    "24c304330431ddeaf630f6ce94af2eac", "91a329798036bf64e8e00a87b131b8b1",
+    "d39111f22885307f920796a42084c872", "e2e702f7250ece98dd8f3f2854c31eeb",
+    "e2fb05b01eb8b88549e85641d8ce5b59"
+  };
+  TestHighbdIntraPred("Intra16", pred_funcs, kSignatures, 16);
+}
+
+void TestHighbdIntraPred32(VpxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "a3e8056ba7e36628cce4917cd956fedd", "cc7d3024fe8748b512407edee045377e",
+    "2aab0a0f330a1d3e19b8ecb8f06387a3", "a547bc3fb7b06910bf3973122a426661",
+    "26f712514da95042f93d6e8dc8e431dc", "bb08c6e16177081daa3d936538dbc2e3",
+    "8f031af3e2650e89620d8d2c3a843d8b", "42867c8553285e94ee8e4df7abafbda8",
+    "6496bdee96100667833f546e1be3d640", "2ebfa25bf981377e682e580208504300",
+    "3e8ae52fd1f607f348aa4cb436c71ab7", "3d4efe797ca82193613696753ea624c4",
+    "cb8aab6d372278f3131e8d99efde02d9"
+  };
+  TestHighbdIntraPred("Intra32", pred_funcs, kSignatures, 32);
+}
+
+}  // namespace
+
+// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
+// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4.
+#define HIGHBD_INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128,  \
+                               v, h, d45, d135, d117, d153, d207, d63, tm)    \
+  TEST(arch, test_func) {                                                     \
+    static const VpxHighbdPredFunc vpx_intra_pred[] = {                       \
+      dc, dc_left, dc_top, dc_128, v, h, d45, d135, d117, d153, d207, d63, tm \
+    };                                                                        \
+    test_func(vpx_intra_pred);                                                \
+  }
+
+// -----------------------------------------------------------------------------
+
+HIGHBD_INTRA_PRED_TEST(
+    C, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_c,
+    vpx_highbd_dc_left_predictor_4x4_c, vpx_highbd_dc_top_predictor_4x4_c,
+    vpx_highbd_dc_128_predictor_4x4_c, vpx_highbd_v_predictor_4x4_c,
+    vpx_highbd_h_predictor_4x4_c, vpx_highbd_d45_predictor_4x4_c,
+    vpx_highbd_d135_predictor_4x4_c, vpx_highbd_d117_predictor_4x4_c,
+    vpx_highbd_d153_predictor_4x4_c, vpx_highbd_d207_predictor_4x4_c,
+    vpx_highbd_d63_predictor_4x4_c, vpx_highbd_tm_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_c,
+    vpx_highbd_dc_left_predictor_8x8_c, vpx_highbd_dc_top_predictor_8x8_c,
+    vpx_highbd_dc_128_predictor_8x8_c, vpx_highbd_v_predictor_8x8_c,
+    vpx_highbd_h_predictor_8x8_c, vpx_highbd_d45_predictor_8x8_c,
+    vpx_highbd_d135_predictor_8x8_c, vpx_highbd_d117_predictor_8x8_c,
+    vpx_highbd_d153_predictor_8x8_c, vpx_highbd_d207_predictor_8x8_c,
+    vpx_highbd_d63_predictor_8x8_c, vpx_highbd_tm_predictor_8x8_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_c,
+    vpx_highbd_dc_left_predictor_16x16_c, vpx_highbd_dc_top_predictor_16x16_c,
+    vpx_highbd_dc_128_predictor_16x16_c, vpx_highbd_v_predictor_16x16_c,
+    vpx_highbd_h_predictor_16x16_c, vpx_highbd_d45_predictor_16x16_c,
+    vpx_highbd_d135_predictor_16x16_c, vpx_highbd_d117_predictor_16x16_c,
+    vpx_highbd_d153_predictor_16x16_c, vpx_highbd_d207_predictor_16x16_c,
+    vpx_highbd_d63_predictor_16x16_c, vpx_highbd_tm_predictor_16x16_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C, TestHighbdIntraPred32,
+    vpx_highbd_dc_predictor_32x32_c,
+    vpx_highbd_dc_left_predictor_32x32_c, vpx_highbd_dc_top_predictor_32x32_c,
+    vpx_highbd_dc_128_predictor_32x32_c, vpx_highbd_v_predictor_32x32_c,
+    vpx_highbd_h_predictor_32x32_c, vpx_highbd_d45_predictor_32x32_c,
+    vpx_highbd_d135_predictor_32x32_c, vpx_highbd_d117_predictor_32x32_c,
+    vpx_highbd_d153_predictor_32x32_c, vpx_highbd_d207_predictor_32x32_c,
+    vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4,
+                       vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL,
+                       vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
+                       vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL,
+                       vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
+                       vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL,
+                       vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL,
+                       vpx_highbd_tm_predictor_16x16_sse2)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
+                       vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL,
+                       vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL,
+                       vpx_highbd_tm_predictor_32x32_sse2)
+#endif  // HAVE_SSE2
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH

 #include "test/test_libvpx.cc"
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 804dc8956..f89f852b6 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -65,6 +65,7 @@ class VPxEncoderThreadTest
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
       encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
       encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0);
     } else {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
       encoder->Control(VP9E_SET_AQ_MODE, 3);
@@ -127,7 +128,7 @@ VP9_INSTANTIATE_TEST_CASE(VPxEncoderThreadTest,
                           ::testing::Values(::libvpx_test::kTwoPassGood,
                                             ::libvpx_test::kOnePassGood,
                                             ::libvpx_test::kRealTime),
-                          ::testing::Range(1, 9),   // cpu_used
+                          ::testing::Range(0, 9),   // cpu_used
                           ::testing::Range(0, 3),   // tile_columns
                           ::testing::Range(2, 5));  // threads
 }  // namespace
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 1c4a3392e..7cd32b990 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -28,25 +28,25 @@ using libvpx_test::ACMRandom;

 const int count_test_block = 100000;

-typedef void (*IntraPred)(uint16_t *dst, ptrdiff_t stride,
-                          const uint16_t *above, const uint16_t *left, int bps);
+typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);

-struct IntraPredFunc {
-  IntraPredFunc(IntraPred pred = NULL, IntraPred ref = NULL,
-                int block_size_value = 0, int bit_depth_value = 0)
+struct IntraPredParam {
+  IntraPredParam(IntraPredFunc pred = NULL, IntraPredFunc ref = NULL,
+                 int block_size_value = 0, int bit_depth_value = 0)
       : pred_fn(pred), ref_fn(ref), block_size(block_size_value),
         bit_depth(bit_depth_value) {}

-  IntraPred pred_fn;
-  IntraPred ref_fn;
+  IntraPredFunc pred_fn;
+  IntraPredFunc ref_fn;
   int block_size;
   int bit_depth;
 };

-class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
+template <typename Pixel, typename PredParam>
+class IntraPredTest : public ::testing::TestWithParam<PredParam> {
  public:
-  void RunTest(uint16_t *left_col, uint16_t *above_data, uint16_t *dst,
-               uint16_t *ref_dst) {
+  void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) {
uint16_t *ref_dst) { + void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int block_size = params_.block_size; above_row_ = above_data + 16; @@ -56,13 +56,16 @@ class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> { int error_count = 0; for (int i = 0; i < count_test_block; ++i) { // Fill edges with random data, try first with saturated values. - for (int x = -1; x <= block_size * 2; x++) { + for (int x = -1; x < block_size; x++) { if (i == 0) { above_row_[x] = mask_; } else { above_row_[x] = rnd.Rand16() & mask_; } } + for (int x = block_size; x < 2 * block_size; x++) { + above_row_[x] = above_row_[block_size - 1]; + } for (int y = 0; y < block_size; y++) { if (i == 0) { left_col_[y] = mask_; @@ -78,17 +81,12 @@ class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> { protected: virtual void SetUp() { - params_ = GetParam(); + params_ = this->GetParam(); stride_ = params_.block_size * 3; mask_ = (1 << params_.bit_depth) - 1; } - void Predict() { - const int bit_depth = params_.bit_depth; - params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth); - ASM_REGISTER_STATE_CHECK( - params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); - } + void Predict(); void CheckPrediction(int test_case_number, int *error_count) const { // For each pixel ensure that the calculated value is the same as reference. @@ -104,18 +102,223 @@ class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> { } } - uint16_t *above_row_; - uint16_t *left_col_; - uint16_t *dst_; - uint16_t *ref_dst_; + Pixel *above_row_; + Pixel *left_col_; + Pixel *dst_; + Pixel *ref_dst_; ptrdiff_t stride_; int mask_; - IntraPredFunc params_; + PredParam params_; }; +template <> +void IntraPredTest<uint8_t, IntraPredParam>::Predict() { + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_)); +} + +typedef IntraPredTest<uint8_t, IntraPredParam> VP9IntraPredTest; + TEST_P(VP9IntraPredTest, IntraPredTests) { // max block size is 32 + DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]); + DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]); + DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]); + DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]); + RunTest(left_col, above_data, dst, ref_dst); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_sse2, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_sse2, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d207_predictor_4x4_sse2, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_sse2, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_sse2, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_sse2, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_sse2, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_sse2, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_sse2, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_sse2, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_sse2, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_sse2, &vpx_dc_predictor_4x4_c, 4, + 8), 
+ IntraPredParam(&vpx_dc_predictor_8x8_sse2, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_sse2, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_sse2, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_sse2, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_sse2, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_sse2, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_sse2, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_sse2, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_sse2, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_sse2, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_sse2, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_sse2, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_sse2, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_sse2, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_sse2, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_sse2, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_sse2, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_sse2, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_sse2, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_ssse3, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_ssse3, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_ssse3, + &vpx_d63_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d63_predictor_8x8_ssse3, + &vpx_d63_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d63_predictor_16x16_ssse3, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_ssse3, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_ssse3, + &vpx_d153_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_ssse3, + &vpx_d153_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_ssse3, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_ssse3, + &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_8x8_ssse3, + &vpx_d207_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_ssse3, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_ssse3, + &vpx_d207_predictor_32x32_c, 32, 8))); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_neon, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_neon, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d45_predictor_16x16_neon, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_neon, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_neon, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_neon, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_neon, + 
&vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_neon, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_neon, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_neon, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_neon, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_neon, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_neon, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_neon, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_neon, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_neon, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_neon, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_neon, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_neon, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_neon, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_neon, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_neon, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_neon, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_neon, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_neon, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_neon, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_neon, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_neon, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_neon, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_neon, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_neon, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_NEON + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bps); +struct HighbdIntraPredParam { + HighbdIntraPredParam(HighbdIntraPred pred = NULL, HighbdIntraPred ref = NULL, + int block_size_value = 0, int bit_depth_value = 0) + : pred_fn(pred), ref_fn(ref), block_size(block_size_value), + bit_depth(bit_depth_value) {} + + HighbdIntraPred pred_fn; + HighbdIntraPred ref_fn; + int block_size; + int bit_depth; +}; + +template <> +void IntraPredTest<uint16_t, HighbdIntraPredParam>::Predict() { + const int bit_depth = params_.bit_depth; + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); +} + +typedef IntraPredTest<uint16_t, HighbdIntraPredParam> VP9HighbdIntraPredTest; + +TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { + // max block size is 32 DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]); DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]); DECLARE_ALIGNED(16, uint16_t, dst[3 * 32 * 32]); @@ -124,88 +327,90 @@ TEST_P(VP9IntraPredTest, IntraPredTests) { } #if HAVE_SSE2 -#if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_8, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - 
&vpx_highbd_tm_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 8), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 8))); + SSE2_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 8))); INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_10, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - &vpx_highbd_tm_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 10), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 10))); + SSE2_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + 
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 10))); INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_12, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - &vpx_highbd_tm_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 12), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 12))); + SSE2_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 12), + 
HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/tools.mk b/tools.mk new file mode 100644 index 000000000..3c660b1df --- /dev/null +++ b/tools.mk @@ -0,0 +1,110 @@ +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# List of tools to build. +TOOLS-yes += tiny_ssim.c +tiny_ssim.SRCS += vpx/vpx_integer.h +tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4 +tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files + +# +# End of specified files. The rest of the build rules should happen +# automagically from here. +# + + +# Expand list of selected tools to build (as specified above) +TOOLS = $(addprefix tools/,$(call enabled,TOOLS)) +ALL_SRCS = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS)) + + +# Expand all tools sources into a variable containing all sources +# for each tool (not just the main one specified in TOOLS) +# and add this file to the list (for MSVS workspace generation) +$(foreach ex,$(TOOLS),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) tools.mk)) + + +# Create build/install dependencies for all tools. The common case +# is handled here. The MSVS case is handled below. +NOT_MSVS = $(if $(CONFIG_MSVS),,yes) +DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(TOOLS:.c=$(EXE_SFX))) +DIST-SRCS-yes += $(ALL_SRCS) +OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS)) +BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX))) + + +# Instantiate linker template for all tools. +$(foreach bin,$(BINS-yes),\ + $(eval $(bin):)\ + $(eval $(call linker_template,$(bin),\ + $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \ + -lm\ + ))) + + +# The following pairs define a mapping of locations in the distribution +# tree to locations in the source/build trees. +INSTALL_MAPS += src/%.c %.c +INSTALL_MAPS += src/% $(SRC_PATH_BARE)/% +INSTALL_MAPS += bin/% % +INSTALL_MAPS += % % + + +# Build Visual Studio Projects. We use a template here to instantiate +# explicit rules rather than using an implicit rule because we want to +# leverage make's VPATH searching rather than specifying the paths on +# each file in TOOLS. This has the unfortunate side effect that +# touching the source files triggers a rebuild of the project files +# even though there is no real dependency there (the dependency is on +# the makefiles). We may want to revisit this.
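(Editorial aside, not part of this change: a further utility would be registered in tools.mk with the same four-variable pattern used for tiny_ssim above; the tool name tiny_psnr, its GUID, and its description below are hypothetical placeholders shown only to illustrate the convention.

# Hypothetical illustration only, not in this patch.
TOOLS-yes += tiny_psnr.c
tiny_psnr.SRCS += vpx/vpx_integer.h
tiny_psnr.GUID = 00000000-0000-0000-0000-000000000000
tiny_psnr.DESCRIPTION = Placeholder description for an example tool

The generic rules above would then build, link against -lm, install, and document such a tool with no further edits. The MSVS project template referenced in the preceding comment follows.)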
+define vcproj_template +$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) + $(if $(quiet),@echo " [vcproj] $$@") + $(qexec)$$(GEN_VCPROJ)\ + --exe\ + --target=$$(TOOLCHAIN)\ + --name=$$(@:.$(VCPROJ_SFX)=)\ + --ver=$$(CONFIG_VS_VERSION)\ + --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ + --src-path-bare="$(SRC_PATH_BARE)" \ + $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ + $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^ +endef +TOOLS_BASENAME := $(notdir $(TOOLS)) +PROJECTS-$(CONFIG_MSVS) += $(TOOLS_BASENAME:.c=.$(VCPROJ_SFX)) +INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\ + $(addprefix bin/$(p)/,$(TOOLS_BASENAME:.c=.exe))) +$(foreach proj,$(call enabled,PROJECTS),\ + $(eval $(call vcproj_template,$(proj)))) + +# +# Documentation Rules +# +%.dox: %.c + @echo " [DOXY] $@" + @mkdir -p $(dir $@) + @echo "/*!\page tools_$(@F:.dox=) $(@F:.dox=)" > $@ + @echo " \includelineno $(<F)" >> $@ + @echo "*/" >> $@ + +tools.dox: tools.mk + @echo " [DOXY] $@" + @echo "/*!\page tools Tools" > $@ + @echo " This SDK includes a number of tools/utilities."\ + "The following tools are included: ">>$@ + @$(foreach ex,$(sort $(notdir $(TOOLS:.c=))),\ + echo " - \subpage tools_$(ex) $($(ex).DESCRIPTION)" >> $@;) + @echo "*/" >> $@ + +CLEAN-OBJS += tools.doxy tools.dox $(TOOLS:.c=.dox) +DOCS-yes += tools.doxy tools.dox +tools.doxy: tools.dox $(TOOLS:.c=.dox) + @echo "INPUT += $^" > $@ diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c new file mode 100644 index 000000000..28052e0a8 --- /dev/null +++ b/tools/tiny_ssim.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <errno.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "vpx/vpx_integer.h" + +void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2) +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2) + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count) { + int64_t ssim_n, ssim_d; + int64_t c1, c2; + + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + + ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * + ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); +} + +// We are using an 8x8 moving window with the starting location of each 8x8 +// window on the 4x4 pixel grid. Such an arrangement allows the windows to +// overlap block boundaries to penalize blocking artifacts. +double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1, + int stride_img2, int width, int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample points start at each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, + int recon_stride, unsigned int cols, + unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + + return total_sse; +} + +#define MAX_PSNR 100 + +double vp9_mse2psnr(double samples, double peak, double mse) { + double psnr; + + if (mse > 0.0) + psnr = 10.0 * log10(peak * peak * samples / mse); + else + psnr = MAX_PSNR; // Limit to prevent division by zero + + if (psnr > MAX_PSNR) psnr = MAX_PSNR; + + return psnr; +} + +int main(int argc, char *argv[]) { + FILE *f[2]; + uint8_t *buf[2]; + int w, h, n_frames, tl_skip = 0, tl_skips_remaining = 0; + double ssim = 0, psnravg = 0, psnrglb = 0; + double ssimy, ssimu, ssimv; + uint64_t psnry, psnru, psnrv; + + if (argc < 4) { + fprintf(stderr, "Usage: %s file1.yuv file2.yuv WxH [tl_skip={0,1,3}]\n", + argv[0]); + return 1; + } + f[0] = strcmp(argv[1], "-") ? fopen(argv[1], "rb") : stdin; + f[1] = strcmp(argv[2], "-") ?
fopen(argv[2], "rb") : stdin; + sscanf(argv[3], "%dx%d", &w, &h); + // Number of frames to skip from file1.yuv for every frame used. Normal values + // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding + // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. + if (argc > 4) { + sscanf(argv[4], "%d", &tl_skip); + } + if (!f[0] || !f[1]) { + fprintf(stderr, "Could not open input files: %s\n", strerror(errno)); + return 1; + } + if (w <= 0 || h <= 0 || w & 1 || h & 1) { + fprintf(stderr, "Invalid size %dx%d\n", w, h); + return 1; + } + buf[0] = malloc(w * h * 3 / 2); + buf[1] = malloc(w * h * 3 / 2); + n_frames = 0; + while (1) { + size_t r1, r2; + r1 = fread(buf[0], w * h * 3 / 2, 1, f[0]); + if (r1) { + // Reading parts of file1.yuv that were not used in temporal layer. + if (tl_skips_remaining > 0) { + --tl_skips_remaining; + continue; + } + // Use frame, but skip |tl_skip| after it. + tl_skips_remaining = tl_skip; + } + r2 = fread(buf[1], w * h * 3 / 2, 1, f[1]); + if (r1 && r2 && r1 != r2) { + fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno), + (int)r1, (int)r2); + return 1; + } else if (r1 == 0 || r2 == 0) { + break; + } +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); + psnr_and_ssim(ssimy, psnry, buf[0], buf[1], w, h); + psnr_and_ssim(ssimu, psnru, buf[0] + w * h, buf[1] + w * h, w / 2, h / 2); + psnr_and_ssim(ssimv, psnrv, buf[0] + w * h * 5 / 4, buf[1] + w * h * 5 / 4, + w / 2, h / 2); + ssim += 0.8 * ssimy + 0.1 * (ssimu + ssimv); + psnravg += + vp9_mse2psnr(w * h * 6 / 4, 255.0, (double)psnry + psnru + psnrv); + psnrglb += psnry + psnru + psnrv; + n_frames++; + } + free(buf[0]); + free(buf[1]); + ssim /= n_frames; + psnravg /= n_frames; + psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb); + + printf("AvgPSNR: %lf\n", psnravg); + printf("GlbPSNR: %lf\n", psnrglb); + printf("SSIM: %lf\n", 100 * pow(ssim, 8.0)); + printf("Nframes: %d\n", n_frames); + + if (strcmp(argv[1], "-")) fclose(f[0]); + if (strcmp(argv[2], "-")) fclose(f[1]); + + return 0; +} diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index fde0b7e31..628d1c8d2 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1517,7 +1517,6 @@ static int tile_worker_hook(TileWorkerData *const tile_data, return 0; } - tile_data->xd.error_info = &tile_data->error_info; tile_data->xd.corrupted = 0; do { @@ -1529,6 +1528,8 @@ static int tile_worker_hook(TileWorkerData *const tile_data, &tile_data->error_info, &tile_data->bit_reader, pbi->decrypt_cb, pbi->decrypt_state); vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff); + // init resets xd.error_info + tile_data->xd.error_info = &tile_data->error_info; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 3bd6026d4..2a5800382 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2441,6 +2441,8 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, cpi->resize_pending = 1; return 1; } + // Force recode if projected_frame_size > max_frame_bandwidth + if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1; // TODO(agrange) high_limit could be greater than the scale-down threshold. 
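  // Reviewer note, not in the original source: beyond the max_frame_bandwidth
  // check added above, this test asks for a recode when the projected frame
  // size moves outside the low_limit/high_limit band supplied by the caller
  // while q still has room to move in the corrective direction; only the
  // high_limit clause is visible in this hunk.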
if ((rc->projected_frame_size > high_limit && q < maxq) || @@ -2799,13 +2801,13 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { dc_quant_devisor = 4.0; #endif - fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d" + fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " "%10"PRId64" %10"PRId64" %10d " "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" "%6d %6d %5d %5d %5d " "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d %10d\n", + "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n", cpi->common.current_video_frame, cm->width, cm->height, cpi->rc.source_alt_ref_pending, diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 2f1fe360d..788952d34 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -48,10 +48,8 @@ #define FIRST_PASS_Q 10.0 #define GF_MAX_BOOST 96.0 #define INTRA_MODE_PENALTY 1024 -#define KF_MAX_BOOST 128.0 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 -#define MIN_KF_BOOST 300 #define NEW_MV_MODE_PENALTY 32 #define SVC_FACTOR_PT_LOW 0.45 #define DARK_THRESH 64 @@ -1578,7 +1576,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi, sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part - (INTRA_PART * modified_pcnt_intra); } - return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); + return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT); } // This function gives an estimate of how badly we believe the prediction @@ -1681,6 +1679,7 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, #define BASELINE_ERR_PER_MB 1000.0 static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, double this_frame_mv_in_out, double max_boost) { double frame_boost; const double lq = vp9_convert_qindex_to_q( @@ -1694,17 +1693,56 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, // Underlying boost factor is based on inter error ratio. frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1; + *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + + // Small adjustment for cases where there is a zoom out + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + + // Q correction and scaling frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; - // Increase boost for frames where new data coming into frame (e.g. zoom out). - // Slightly reduce boost if there is a net balance of motion out of the frame - // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0. + return VPXMIN(frame_boost, max_boost * boost_q_correction); +} + +#define KF_BOOST_FACTOR 12.5 +static double calc_kf_frame_boost(VP9_COMP *cpi, + const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, + double this_frame_mv_in_out, + double max_boost) { + double frame_boost; + const double lq = vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); + const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); + int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ?
cpi->initial_mbs + : cpi->common.MBs; + + // Correct for any inactive region in the image + num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); + + // Underlying boost factor is based on inter error ratio. + frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1; + *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + + // Small adjustment for cases where there is a zoom out + if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // In the extreme case the boost is halved. - else - frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); + + // Q correction and scaling + frame_boost = frame_boost * KF_BOOST_FACTOR * boost_q_correction; return VPXMIN(frame_boost, max_boost * boost_q_correction); } @@ -1719,6 +1757,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; + double sr_accumulator = 0.0; int arf_boost; int flash_detected = 0; @@ -1745,9 +1784,10 @@ : decay_accumulator; } - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + sr_accumulator = 0.0; + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, &sr_accumulator, + this_frame_mv_in_out, GF_MAX_BOOST); } *f_boost = (int)boost_score; @@ -1759,6 +1799,7 @@ this_frame_mv_in_out = 0.0; mv_in_out_accumulator = 0.0; abs_mv_in_out_accumulator = 0.0; + sr_accumulator = 0.0; // Search backward towards last gf position. for (i = -1; i >= -b_frames; --i) { @@ -1783,9 +1824,10 @@ : decay_accumulator; } - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + sr_accumulator = 0.0; + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, &sr_accumulator, + this_frame_mv_in_out, GF_MAX_BOOST); } *b_boost = (int)boost_score; @@ -2085,7 +2127,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_ratio_accumulator = 0.0; double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; - double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; @@ -2095,6 +2136,7 @@ double mv_ratio_accumulator_thresh; double mv_in_out_thresh; double abs_mv_in_out_thresh; + double sr_accumulator = 0.0; unsigned int allow_alt_ref = is_altref_enabled(cpi); int f_boost = 0; @@ -2221,9 +2263,10 @@ } // Calculate a boost number for this frame. - boost_score += - decay_accumulator * - calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); + sr_accumulator = 0.0; + boost_score += decay_accumulator * + calc_frame_boost(cpi, &next_frame, &sr_accumulator, + this_frame_mv_in_out, GF_MAX_BOOST); // Break out conditions.
if ( @@ -2473,6 +2516,10 @@ static int test_candidate_kf(TWO_PASS *twopass, } #define FRAMES_TO_CHECK_DECAY 8 +#define KF_MAX_FRAME_BOOST 96.0 +#define MIN_KF_TOT_BOOST 300 +#define MAX_KF_TOT_BOOST 5400 +#define KF_BOOST_SCAN_MAX_FRAMES 32 static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i, j; @@ -2485,14 +2532,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS next_frame; FIRSTPASS_STATS last_frame; int kf_bits = 0; - int loop_decay_counter = 0; double decay_accumulator = 1.0; - double av_decay_accumulator = 0.0; double zero_motion_accumulator = 1.0; double boost_score = 0.0; double kf_mod_err = 0.0; double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + double sr_accumulator = 0.0; vp9_zero(next_frame); @@ -2642,34 +2688,36 @@ // Scan through the kf group collating various stats used to determine // how many bits to spend on it. - decay_accumulator = 1.0; boost_score = 0.0; + for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - // Monitor for static sections. - zero_motion_accumulator = VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(cpi, &next_frame)); - - // Not all frames in the group are necessarily used in calculating boost. - if ((i <= rc->max_gf_interval) || - ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { - const double frame_boost = - calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST); - - // How fast is prediction quality decaying. - if (!detect_flash(twopass, 0)) { - const double loop_decay_rate = - get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator *= loop_decay_rate; - decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR); - av_decay_accumulator += decay_accumulator; - ++loop_decay_counter; - } - boost_score += (decay_accumulator * frame_boost); + if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + double frame_boost; + double zm_factor; + + // Monitor for static sections. + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + + // Factor 0.75-1.25 based on how much of the frame is static. + zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); + + // The second (lagging) ref error is not valid immediately after + // a key frame because either the lag has not built up (in the case of + // the first key frame) or it points to a reference before the new key + // frame. + if (i < 2) sr_accumulator = 0.0; + frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, + KF_MAX_FRAME_BOOST * zm_factor); + + boost_score += frame_boost; + if (frame_boost < 25.00) break; + } else { + break; + } } - av_decay_accumulator /= (double)loop_decay_counter; reset_fpf_position(twopass, start_position); @@ -2681,9 +2729,9 @@ start_position, twopass->stats_in_end, rc->frames_to_key); // Apply various clamps for min and max boost - rc->kf_boost = (int)(av_decay_accumulator * boost_score); - rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST); + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); // Work out how many bits to allocate for the key frame itself.
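  // Reviewer note, not in the original source: kf_boost as computed above is
  // floored at 3 * frames_to_key and held within [MIN_KF_TOT_BOOST,
  // MAX_KF_TOT_BOOST], i.e. 300 to 5400, before the bit allocation below.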
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c index 5aa0b8ddb..88b1531d8 100644 --- a/vpx/src/svc_encodeframe.c +++ b/vpx/src/svc_encodeframe.c @@ -53,6 +53,10 @@ static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = { 4, 5, 7, 11, static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = { 16, 16, 16, 16, 16 }; +static const int DEFAULT_SCALE_FACTORS_NUM_2x[VPX_SS_MAX_LAYERS] = { 1, 2, 4 }; + +static const int DEFAULT_SCALE_FACTORS_DEN_2x[VPX_SS_MAX_LAYERS] = { 4, 4, 4 }; + typedef enum { QUANTIZER = 0, BITRATE, @@ -156,6 +160,9 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, char *token; const char *delim = ","; char *save_ptr; + int num_layers = svc_ctx->spatial_layers; + if (type == BITRATE) + num_layers = svc_ctx->spatial_layers * svc_ctx->temporal_layers; if (input == NULL || option0 == NULL || (option1 == NULL && type == SCALE_FACTOR)) @@ -163,7 +170,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, input_string = strdup(input); token = strtok_r(input_string, delim, &save_ptr); - for (i = 0; i < svc_ctx->spatial_layers; ++i) { + for (i = 0; i < num_layers; ++i) { if (token != NULL) { res = extract_option(type, token, option0 + i, option1 + i); if (res != VPX_CODEC_OK) break; @@ -172,11 +179,11 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, break; } } - if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) { + if (res == VPX_CODEC_OK && i != num_layers) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc: layer params type: %d %d values required, " "but only %d specified\n", - type, svc_ctx->spatial_layers, i); + type, num_layers, i); res = VPX_CODEC_INVALID_PARAM; } free(input_string); @@ -287,24 +294,30 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { return VPX_CODEC_OK; } -void assign_layer_bitrates(const SvcContext *svc_ctx, - vpx_codec_enc_cfg_t *const enc_cfg) { +vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx, + vpx_codec_enc_cfg_t *const enc_cfg) { int i; const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); int sl, tl, spatial_layer_target; if (svc_ctx->temporal_layering_mode != 0) { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int total_bitrate = 0; for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { - enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] = 0; + total_bitrate += si->bitrates[sl * svc_ctx->temporal_layers + + svc_ctx->temporal_layers - 1]; for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] += (unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl]; enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + tl] = si->bitrates[sl * svc_ctx->temporal_layers + tl]; + if (tl > 0 && (si->bitrates[sl * svc_ctx->temporal_layers + tl] <= + si->bitrates[sl * svc_ctx->temporal_layers + tl - 1])) + return VPX_CODEC_INVALID_PARAM; } } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -341,11 +354,14 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } else { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int total_bitrate = 0; for (i = 0; i < svc_ctx->spatial_layers; ++i) { enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i]; - enc_cfg->rc_target_bitrate += si->bitrates[i]; + 
enc_cfg->layer_target_bitrate[i] = (unsigned int)si->bitrates[i]; + total_bitrate += si->bitrates[i]; } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -368,6 +384,7 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } } + return VPX_CODEC_OK; } vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, @@ -412,12 +429,24 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl]; si->svc_params.speed_per_layer[sl] = svc_ctx->speed; } - + if (enc_cfg->rc_end_usage == VPX_CBR && enc_cfg->g_pass == VPX_RC_ONE_PASS && + svc_ctx->spatial_layers <= 3) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + int sl2 = (svc_ctx->spatial_layers == 2) ? sl + 1 : sl; + si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2]; + si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2]; + } + } for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { i = sl * svc_ctx->temporal_layers + tl; si->svc_params.max_quantizers[i] = MAX_QUANTIZER; si->svc_params.min_quantizers[i] = 0; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = 56; + si->svc_params.min_quantizers[i] = 2; + } } } @@ -442,7 +471,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, (int)VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } - assign_layer_bitrates(svc_ctx, enc_cfg); + res = assign_layer_bitrates(svc_ctx, enc_cfg); + if (res != VPX_CODEC_OK) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "layer bitrates incorrect: \n" + "1) spatial layer bitrates should sum up to target \n" + "2) temporal layer bitrates should be increasing within \n" + "a spatial layer \n"); + return VPX_CODEC_INVALID_PARAM; + } #if CONFIG_SPATIAL_SVC for (i = 0; i < svc_ctx->spatial_layers; ++i) diff --git a/vpx/svc_context.h b/vpx/svc_context.h index c8bde5832..462785075 100644 --- a/vpx/svc_context.h +++ b/vpx/svc_context.h @@ -54,7 +54,7 @@ typedef struct SvcInternal { // values extracted from option, quantizers vpx_svc_extra_cfg_t svc_params; int enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; - int bitrates[VPX_SS_MAX_LAYERS]; + int bitrates[VPX_MAX_LAYERS]; // accumulated statistics double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V diff --git a/vpx_dsp/arm/idct32x32_1_add_neon.c b/vpx_dsp/arm/idct32x32_1_add_neon.c index 8aad4c579..6be4b0122 100644 --- a/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -103,7 +103,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, dest_stride8 = dest_stride * 8; if (a1 >= 0) { // diff_positive_32_32 a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); + q0u8 = vdupq_n_u8((uint8_t)a1); for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop d = dest; for (j = 0; j < 4; j++) { @@ -119,7 +119,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, } else { // diff_negative_32_32 a1 = -a1; a1 = a1 < 0 ? 0 : a1 > 255 ? 
255 : a1; - q0u8 = vdupq_n_u8(a1); + q0u8 = vdupq_n_u8((uint8_t)a1); for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop d = dest; for (j = 0; j < 4; j++) { diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c new file mode 100644 index 000000000..ebec9df54 --- /dev/null +++ b/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -0,0 +1,519 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[67] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +static void idct32_6_neon(const int16_t *input, int16_t *output) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; + int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, + s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, + s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, + s1_31; + int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10, + s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20, + s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30, + s2_31; + int16x8_t s3_24, s3_25, s3_26, s3_27; + + load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5, + &in6, &in7); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + // stage 3 + s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, + cospi_28_64); + s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, + cospi_4_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27, + cospi_12_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27, + 
cospi_20_64); + + s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, + -cospi_20_64); + s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, + cospi_12_64); + + // stage 4 + s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + + s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, + cospi_24_64); + s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, + cospi_8_64); + + s2_20 = vsubq_s16(s1_23, s1_20); + s2_21 = vsubq_s16(s1_22, s1_21); + s2_22 = vaddq_s16(s1_21, s1_22); + s2_23 = vaddq_s16(s1_20, s1_23); + s2_24 = vaddq_s16(s1_24, s1_27); + s2_25 = vaddq_s16(s1_25, s1_26); + s2_26 = vsubq_s16(s1_25, s1_26); + s2_27 = vsubq_s16(s1_24, s1_27); + + // stage 5 + s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64); + s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64); + + s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_17, -cospi_8_64, s1_30, + cospi_24_64); + s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_17, cospi_24_64, s1_30, + cospi_8_64); + + s1_19 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_8_64, s1_31, + cospi_24_64); + s1_28 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_24_64, s1_31, + cospi_8_64); + + s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27, + -cospi_8_64); + s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27, + cospi_24_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26, + -cospi_8_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26, + cospi_24_64); + + // stage 6 + s2_0 = vaddq_s16(s1_0, s1_7); + s2_1 = vaddq_s16(s1_0, s1_6); + s2_2 = vaddq_s16(s1_0, s1_5); + s2_3 = vaddq_s16(s1_0, s1_4); + s2_4 = vsubq_s16(s1_0, s1_4); + s2_5 = vsubq_s16(s1_0, s1_5); + s2_6 = vsubq_s16(s1_0, s1_6); + s2_7 = vsubq_s16(s1_0, s1_7); + + s2_10 = sub_multiply_shift_and_narrow_s16(s2_14, s2_9, cospi_16_64); + s2_13 = add_multiply_shift_and_narrow_s16(s2_9, s2_14, cospi_16_64); + + s2_11 = sub_multiply_shift_and_narrow_s16(s2_15, s2_8, cospi_16_64); + s2_12 = add_multiply_shift_and_narrow_s16(s2_8, s2_15, cospi_16_64); + + s2_16 = vaddq_s16(s1_16, s2_23); + s2_17 = vaddq_s16(s1_17, s2_22); + s2_18 = vaddq_s16(s1_18, s1_21); + s2_19 = vaddq_s16(s1_19, s1_20); + s2_20 = vsubq_s16(s1_19, s1_20); + s2_21 = vsubq_s16(s1_18, s1_21); + s2_22 = vsubq_s16(s1_17, s2_22); + s2_23 = vsubq_s16(s1_16, s2_23); + + s3_24 = vsubq_s16(s1_31, s2_24); + s3_25 = vsubq_s16(s1_30, s2_25); + s3_26 = vsubq_s16(s1_29, s1_26); + s3_27 = vsubq_s16(s1_28, s1_27); + s2_28 = vaddq_s16(s1_27, s1_28); + s2_29 = vaddq_s16(s1_26, s1_29); + s2_30 = vaddq_s16(s2_25, s1_30); + s2_31 = vaddq_s16(s2_24, s1_31); + + // stage 7 + s1_0 = vaddq_s16(s2_0, s2_15); + s1_1 = vaddq_s16(s2_1, s2_14); + s1_2 = vaddq_s16(s2_2, s2_13); + s1_3 = vaddq_s16(s2_3, s2_12); + s1_4 = vaddq_s16(s2_4, s2_11); + s1_5 = vaddq_s16(s2_5, s2_10); + s1_6 = vaddq_s16(s2_6, s2_9); + s1_7 = vaddq_s16(s2_7, s2_8); + s1_8 = vsubq_s16(s2_7, s2_8); + s1_9 = vsubq_s16(s2_6, s2_9); + s1_10 = vsubq_s16(s2_5, s2_10); + s1_11 = vsubq_s16(s2_4, s2_11); + s1_12 = vsubq_s16(s2_3, s2_12); + s1_13 = vsubq_s16(s2_2, s2_13); + s1_14 = vsubq_s16(s2_1, s2_14); + s1_15 = vsubq_s16(s2_0, s2_15); + + s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64); + s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64); + + s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64); + 
s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64); + + s1_22 = sub_multiply_shift_and_narrow_s16(s3_25, s2_22, cospi_16_64); + s1_25 = add_multiply_shift_and_narrow_s16(s2_22, s3_25, cospi_16_64); + + s1_23 = sub_multiply_shift_and_narrow_s16(s3_24, s2_23, cospi_16_64); + s1_24 = add_multiply_shift_and_narrow_s16(s2_23, s3_24, cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s1_0, s2_31)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_1, s2_30)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_2, s2_29)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_3, s2_28)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_4, s1_27)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_5, s1_26)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_6, s1_25)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_7, s1_24)); + output += 8; + + vst1q_s16(output, vaddq_s16(s1_8, s1_23)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_9, s1_22)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_10, s1_21)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_11, s1_20)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_12, s2_19)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_13, s2_18)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_14, s2_17)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_15, s2_16)); + output += 8; + + vst1q_s16(output, vsubq_s16(s1_15, s2_16)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_14, s2_17)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_13, s2_18)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_12, s2_19)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_11, s1_20)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_10, s1_21)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_9, s1_22)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_8, s1_23)); + output += 8; + + vst1q_s16(output, vsubq_s16(s1_7, s1_24)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_6, s1_25)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_5, s1_26)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_4, s1_27)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_3, s2_28)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_2, s2_29)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_1, s2_30)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_0, s2_31)); +} + +static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; + int16x8_t out0, out1, out2, out3, out4, out5, out6, out7; + int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, + s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, + s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, + s1_31; + int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10, + s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20, + s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30, + s2_31; + int16x8_t s3_24, s3_25, s3_26, s3_27; + + load_and_transpose_s16_8x8(input, 8, &in0, &in1, &in2, &in3, &in4, &in5, &in6, + &in7); + + // stage 1 + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + // Different for _8_ + s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); + s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + 
s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); + s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + + // stage 3 + s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, + cospi_28_64); + s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, + cospi_4_64); + + // Different for _8_ + s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_28_64, s1_28, + -cospi_4_64); + s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_4_64, s1_28, + cospi_28_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27, + cospi_12_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27, + cospi_20_64); + + s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, + -cospi_20_64); + s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, + cospi_12_64); + + // stage 4 + s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + + s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, + cospi_24_64); + s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, + cospi_8_64); + + s2_10 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_24_64, s2_12, + -cospi_8_64); + s2_13 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_8_64, s2_12, + cospi_24_64); + + s2_16 = vaddq_s16(s1_16, s1_19); + + s2_17 = vaddq_s16(s1_17, s1_18); + s2_18 = vsubq_s16(s1_17, s1_18); + + s2_19 = vsubq_s16(s1_16, s1_19); + + s2_20 = vsubq_s16(s1_23, s1_20); + s2_21 = vsubq_s16(s1_22, s1_21); + + s2_22 = vaddq_s16(s1_21, s1_22); + s2_23 = vaddq_s16(s1_20, s1_23); + + s2_24 = vaddq_s16(s1_24, s1_27); + s2_25 = vaddq_s16(s1_25, s1_26); + s2_26 = vsubq_s16(s1_25, s1_26); + s2_27 = vsubq_s16(s1_24, s1_27); + + s2_28 = vsubq_s16(s1_31, s1_28); + s2_29 = vsubq_s16(s1_30, s1_29); + s2_30 = vaddq_s16(s1_29, s1_30); + s2_31 = vaddq_s16(s1_28, s1_31); + + // stage 5 + s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64); + s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64); + + s1_8 = vaddq_s16(s2_8, s2_11); + s1_9 = vaddq_s16(s2_9, s2_10); + s1_10 = vsubq_s16(s2_9, s2_10); + s1_11 = vsubq_s16(s2_8, s2_11); + s1_12 = vsubq_s16(s2_15, s2_12); + s1_13 = vsubq_s16(s2_14, s2_13); + s1_14 = vaddq_s16(s2_13, s2_14); + s1_15 = vaddq_s16(s2_12, s2_15); + + s1_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_8_64, s2_29, + cospi_24_64); + s1_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, cospi_24_64, s2_29, + cospi_8_64); + + s1_19 = multiply_accumulate_shift_and_narrow_s16(s2_19, -cospi_8_64, s2_28, + cospi_24_64); + s1_28 = multiply_accumulate_shift_and_narrow_s16(s2_19, cospi_24_64, s2_28, + cospi_8_64); + + s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27, + -cospi_8_64); + s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27, + cospi_24_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26, + -cospi_8_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26, + cospi_24_64); + + // stage 6 + s2_0 = vaddq_s16(s1_0, s1_7); + s2_1 = vaddq_s16(s1_0, s1_6); + s2_2 = vaddq_s16(s1_0, s1_5); + s2_3 
= vaddq_s16(s1_0, s1_4); + s2_4 = vsubq_s16(s1_0, s1_4); + s2_5 = vsubq_s16(s1_0, s1_5); + s2_6 = vsubq_s16(s1_0, s1_6); + s2_7 = vsubq_s16(s1_0, s1_7); + + s2_10 = sub_multiply_shift_and_narrow_s16(s1_13, s1_10, cospi_16_64); + s2_13 = add_multiply_shift_and_narrow_s16(s1_10, s1_13, cospi_16_64); + + s2_11 = sub_multiply_shift_and_narrow_s16(s1_12, s1_11, cospi_16_64); + s2_12 = add_multiply_shift_and_narrow_s16(s1_11, s1_12, cospi_16_64); + + s1_16 = vaddq_s16(s2_16, s2_23); + s1_17 = vaddq_s16(s2_17, s2_22); + s2_18 = vaddq_s16(s1_18, s1_21); + s2_19 = vaddq_s16(s1_19, s1_20); + s2_20 = vsubq_s16(s1_19, s1_20); + s2_21 = vsubq_s16(s1_18, s1_21); + s1_22 = vsubq_s16(s2_17, s2_22); + s1_23 = vsubq_s16(s2_16, s2_23); + + s3_24 = vsubq_s16(s2_31, s2_24); + s3_25 = vsubq_s16(s2_30, s2_25); + s3_26 = vsubq_s16(s1_29, s1_26); + s3_27 = vsubq_s16(s1_28, s1_27); + s2_28 = vaddq_s16(s1_27, s1_28); + s2_29 = vaddq_s16(s1_26, s1_29); + s2_30 = vaddq_s16(s2_25, s2_30); + s2_31 = vaddq_s16(s2_24, s2_31); + + // stage 7 + s1_0 = vaddq_s16(s2_0, s1_15); + s1_1 = vaddq_s16(s2_1, s1_14); + s1_2 = vaddq_s16(s2_2, s2_13); + s1_3 = vaddq_s16(s2_3, s2_12); + s1_4 = vaddq_s16(s2_4, s2_11); + s1_5 = vaddq_s16(s2_5, s2_10); + s1_6 = vaddq_s16(s2_6, s1_9); + s1_7 = vaddq_s16(s2_7, s1_8); + s1_8 = vsubq_s16(s2_7, s1_8); + s1_9 = vsubq_s16(s2_6, s1_9); + s1_10 = vsubq_s16(s2_5, s2_10); + s1_11 = vsubq_s16(s2_4, s2_11); + s1_12 = vsubq_s16(s2_3, s2_12); + s1_13 = vsubq_s16(s2_2, s2_13); + s1_14 = vsubq_s16(s2_1, s1_14); + s1_15 = vsubq_s16(s2_0, s1_15); + + s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64); + s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64); + + s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64); + s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64); + + s2_22 = sub_multiply_shift_and_narrow_s16(s3_25, s1_22, cospi_16_64); + s1_25 = add_multiply_shift_and_narrow_s16(s1_22, s3_25, cospi_16_64); + + s2_23 = sub_multiply_shift_and_narrow_s16(s3_24, s1_23, cospi_16_64); + s1_24 = add_multiply_shift_and_narrow_s16(s1_23, s3_24, cospi_16_64); + + // final stage + out0 = vaddq_s16(s1_0, s2_31); + out1 = vaddq_s16(s1_1, s2_30); + out2 = vaddq_s16(s1_2, s2_29); + out3 = vaddq_s16(s1_3, s2_28); + out4 = vaddq_s16(s1_4, s1_27); + out5 = vaddq_s16(s1_5, s1_26); + out6 = vaddq_s16(s1_6, s1_25); + out7 = vaddq_s16(s1_7, s1_24); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output, + stride); + + out0 = vaddq_s16(s1_8, s2_23); + out1 = vaddq_s16(s1_9, s2_22); + out2 = vaddq_s16(s1_10, s1_21); + out3 = vaddq_s16(s1_11, s1_20); + out4 = vaddq_s16(s1_12, s2_19); + out5 = vaddq_s16(s1_13, s2_18); + out6 = vaddq_s16(s1_14, s1_17); + out7 = vaddq_s16(s1_15, s1_16); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (8 * stride), stride); + + out0 = vsubq_s16(s1_15, s1_16); + out1 = vsubq_s16(s1_14, s1_17); + out2 = vsubq_s16(s1_13, s2_18); + out3 = vsubq_s16(s1_12, s2_19); + out4 = vsubq_s16(s1_11, s1_20); + out5 = vsubq_s16(s1_10, s1_21); + out6 = vsubq_s16(s1_9, s2_22); + out7 = vsubq_s16(s1_8, s2_23); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (16 * stride), stride); + + out0 = vsubq_s16(s1_7, s1_24); + out1 = vsubq_s16(s1_6, s1_25); + out2 = vsubq_s16(s1_5, s1_26); + out3 = vsubq_s16(s1_4, s1_27); + out4 = vsubq_s16(s1_3, s2_28); + out5 = vsubq_s16(s1_2, s2_29); + out6 = vsubq_s16(s1_1, s2_30); + out7 = vsubq_s16(s1_0, s2_31); + + 
add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (24 * stride), stride); +} + +void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 8]; + int16_t *t = temp; + + idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + idct32_8_neon(t, dest, stride); + t += (8 * 8); + dest += 8; + } +} diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm index a4ccba993..bd4e86ded 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/vpx_dsp/arm/idct4x4_add_neon.asm @@ -15,6 +15,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.S + AREA Block, CODE, READONLY ; name this block of code ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; @@ -33,7 +35,7 @@ ; So, two passes of a transpose followed by a column transform. ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 ; generate scalar constants ; cospi_8_64 = 15137 diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index 24b91fe48..8f669c907 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, @@ -28,8 +29,8 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, d26u32 = d27u32 = vdup_n_u32(0); - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); diff --git a/vpx_dsp/arm/idct8x8_add_neon.asm b/vpx_dsp/arm/idct8x8_add_neon.asm index 21e75951e..a5c9c927d 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.asm +++ b/vpx_dsp/arm/idct8x8_add_neon.asm @@ -16,6 +16,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.S + ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are ; loaded in q8-q15. The output will be stored back into q8-q15 registers. ; This macro will touch q0-q7 registers and use them as buffer during @@ -207,10 +209,10 @@ |vpx_idct8x8_64_add_neon| PROC push {r4-r9} vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 ; transpose the input data TRANSPOSE8X8 @@ -312,10 +314,10 @@ |vpx_idct8x8_12_add_neon| PROC push {r4-r9} vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! 
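Note on the driver just above: vpx_idct32x32_34_add_neon() exploits the eob <= 34 guarantee. With at most 34 nonzero coefficients in scan order, only the upper-left 8x8 corner of the 32x32 coefficient block can be populated, so the first (row) pass runs once over that corner into a 32x8 temp buffer, and the second (column) pass runs four times, producing eight output columns per iteration. A minimal scalar skeleton of the same two-pass layout, with hypothetical row_pass()/col_pass() helpers standing in for idct32_6_neon()/idct32_8_neon():

/* Sketch only: row_pass() and col_pass() are illustrative names, not the
 * library's API. */
void idct32x32_34_sketch(const int16_t *input, uint8_t *dest, int stride) {
  int16_t temp[32 * 8]; /* first-pass output: 32 columns x 8 rows */
  int16_t *t = temp;
  int i;
  row_pass(input, t); /* only rows 0-7 can carry nonzero coefficients */
  for (i = 0; i < 32; i += 8) {
    col_pass(t, dest, stride); /* eight columns per iteration */
    t += 8 * 8;
    dest += 8;
  }
}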
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 ; transpose the input data TRANSPOSE8X8 diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index d73feebec..159a6ec98 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" @@ -173,14 +174,14 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); + q10s16 = load_tran_low_to_s16(input + 16); + q11s16 = load_tran_low_to_s16(input + 24); + q12s16 = load_tran_low_to_s16(input + 32); + q13s16 = load_tran_low_to_s16(input + 40); + q14s16 = load_tran_low_to_s16(input + 48); + q15s16 = load_tran_low_to_s16(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); @@ -279,14 +280,14 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, uint16x8_t q8u16, q9u16, q10u16, q11u16; int32x4_t q9s32, q10s32, q11s32, q12s32; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); + q10s16 = load_tran_low_to_s16(input + 16); + q11s16 = load_tran_low_to_s16(input + 24); + q12s16 = load_tran_low_to_s16(input + 32); + q13s16 = load_tran_low_to_s16(input + 40); + q14s16 = load_tran_low_to_s16(input + 48); + q15s16 = load_tran_low_to_s16(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); diff --git a/vpx_dsp/arm/idct_neon.asm b/vpx_dsp/arm/idct_neon.asm new file mode 100644 index 000000000..a223c0b63 --- /dev/null +++ b/vpx_dsp/arm/idct_neon.asm @@ -0,0 +1,29 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + INCLUDE ./vpx_config.asm + + ; Helper function used to load tran_low_t into int16, narrowing if + ; necessary. + ; $dst0..3 are d registers with the pairs assumed to be contiguous in + ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld1.s32 {q0,q1}, [$src]! + vld1.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q1 + vmovn.i32 $dst2, q2 + vmovn.i32 $dst3, q3 + ELSE + vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! 
+  ENDIF
+  MEND
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
new file mode 100644
index 000000000..5c2a53c03
--- /dev/null
+++ b/vpx_dsp/arm/idct_neon.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_IDCT_NEON_H_
+#define VPX_DSP_ARM_IDCT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+//------------------------------------------------------------------------------
+
+// Helper function used to load tran_low_t into int16, narrowing if necessary.
+static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vld1q_s32(buf);
+  const int32x4_t v1 = vld1q_s32(buf + 4);
+  const int16x4_t s0 = vmovn_s32(v0);
+  const int16x4_t s1 = vmovn_s32(v1);
+  return vcombine_s16(s0, s1);
+#else
+  return vld1q_s16(buf);
+#endif
+}
+
+// Multiply a by a_const. Saturate, shift and narrow by 14.
+static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
+                                                      const int16_t a_const) {
+  // Shift by 14 + rounding will be within 16 bits for well-formed streams.
+  // See WRAPLOW and dct_const_round_shift for details.
+  // This instruction doubles the result and returns the high half, essentially
+  // resulting in a right shift by 15. By multiplying the constant first that
+  // becomes a right shift by 14.
+  // The largest possible value used here is
+  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
+  // within the range of int16_t (+32767 / -32768) even when negated.
+  return vqrdmulhq_n_s16(a, a_const * 2);
+}
+
+// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
+    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+  // In both add_ and its pair, sub_, the input for well-formed streams will be
+  // well within 16 bits (input to the idct is the difference between two
+  // frames and will be within -255 to 255, or 9 bits).
+  // However, for inputs over about 25,000 (valid for int16_t, but not for idct
+  // input) this function cannot use vaddq_s16.
+  // In order to match existing behavior and intentionally out-of-range tests,
+  // expand the addition up to 32 bits to prevent truncation.
+  int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+  int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+  temp_low = vmulq_n_s32(temp_low, ab_const);
+  temp_high = vmulq_n_s32(temp_high, ab_const);
+  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + temp_low = vmulq_n_s32(temp_low, ab_const); + temp_high = vmulq_n_s32(temp_high, ab_const); + return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// 14. +static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( + const int16x8_t a, const int16_t a_const, const int16x8_t b, + const int16_t b_const) { + int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); + int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); + temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); + temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); + return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride, + int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + *a0 = vld1q_s16(a); + a += a_stride; + *a1 = vld1q_s16(a); + a += a_stride; + *a2 = vld1q_s16(a); + a += a_stride; + *a3 = vld1q_s16(a); + a += a_stride; + *a4 = vld1q_s16(a); + a += a_stride; + *a5 = vld1q_s16(a); + a += a_stride; + *a6 = vld1q_s16(a); + a += a_stride; + *a7 = vld1q_s16(a); + + transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +// Shift the output down by 6 and add it to the destination buffer. +static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, + const int16x8_t a2, const int16x8_t a3, + const int16x8_t a4, const int16x8_t a5, + const int16x8_t a6, const int16x8_t a7, + uint8_t *b, const int b_stride) { + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + b0 = vld1_u8(b); + b += b_stride; + b1 = vld1_u8(b); + b += b_stride; + b2 = vld1_u8(b); + b += b_stride; + b3 = vld1_u8(b); + b += b_stride; + b4 = vld1_u8(b); + b += b_stride; + b5 = vld1_u8(b); + b += b_stride; + b6 = vld1_u8(b); + b += b_stride; + b7 = vld1_u8(b); + b -= (7 * b_stride); + + // c = b + (a >> 6) + c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); + c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); + c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); + c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); + c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); + c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); + c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); + c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); + + b0 = vqmovun_s16(c0); + b1 = vqmovun_s16(c1); + b2 = vqmovun_s16(c2); + b3 = vqmovun_s16(c3); + b4 = vqmovun_s16(c4); + b5 = vqmovun_s16(c5); + b6 = vqmovun_s16(c6); + b7 = vqmovun_s16(c7); + + vst1_u8(b, b0); + b += b_stride; + vst1_u8(b, b1); + b += b_stride; + vst1_u8(b, b2); + b += b_stride; + vst1_u8(b, b3); + b += b_stride; + vst1_u8(b, b4); + b += b_stride; + vst1_u8(b, b5); + b += b_stride; + vst1_u8(b, b6); + b += b_stride; + vst1_u8(b, b7); +} +#endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 38e79ed69..e150a5302 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -17,306 +17,254 @@ 
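Two arithmetic details in the new idct_neon.h above are worth a scalar gloss. First, multiply_shift_and_narrow_s16() obtains the idct's (x * c + (1 << 13)) >> 14 rounding (dct_const_round_shift) from vqrdmulhq_n_s16(), which returns the rounded high half of a doubling multiply, by pre-doubling the constant. Second, add_and_store_u8_s16() finishes reconstruction with vrsraq_n_s16(dst, residual, 6), a rounding shift-right-accumulate, then vqmovun_s16() to saturate back to 8 bits. Illustrative scalar models (not library code):

#include <stdint.h>

/* dct_const_round_shift: round-to-nearest right shift by 14. */
static int16_t mul_round_shift14(int16_t a, int16_t c) {
  return (int16_t)(((int32_t)a * c + (1 << 13)) >> 14);
}

/* vqrdmulhq_n_s16(a, k) computes (2 * a * k + (1 << 15)) >> 16 with
 * saturation. With k = 2 * c (2 * c still fits in int16_t for every cospi
 * constant) this equals mul_round_shift14(a, c) for in-range inputs:
 * (4ac + 2^15) >> 16 == (ac + 2^13) >> 14. */
static int16_t doubling_mulh_round(int16_t a, int16_t k) {
  return (int16_t)((2 * (int32_t)a * k + (1 << 15)) >> 16);
}

/* One lane of add_and_store_u8_s16(): vrsraq_n_s16(b, a, 6) is
 * b + ((a + 32) >> 6); vqmovun_s16() clamps the sum to [0, 255]. */
static uint8_t reconstruct_pixel(uint8_t dst, int16_t residual) {
  const int v = dst + ((residual + 32) >> 6);
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}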
//------------------------------------------------------------------------------ // DC 4x4 -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x4_t sum_top; - uint16x4_t sum_left; - uint16x4_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - sum_top = vpadd_u16(p0, p0); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - sum_left = vpadd_u16(p0, p0); - } - - if (do_above && do_left) { - const uint16x4_t sum = vadd_u16(sum_left, sum_top); - dc0 = vrshr_n_u16(sum, 3); - } else if (do_above) { - dc0 = vrshr_n_u16(sum_top, 2); - } else if (do_left) { - dc0 = vrshr_n_u16(sum_left, 2); - } else { - dc0 = vdup_n_u16(0x80); - } +static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + const uint16x4_t p0 = vpaddl_u8(ref_u8); + return vpadd_u16(p0, p0); +} - { - const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0); - int i; - for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); - } +static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_4x4(dst, stride, above, left, 1, 1); + const uint8x8_t a = vld1_u8(above); + const uint8x8_t l = vld1_u8(left); + const uint16x8_t al = vaddl_u8(a, l); + uint16x4_t sum; + uint8x8_t dc; + sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)above; - dc_4x4(dst, stride, NULL, left, 0, 1); + dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)left; - dc_4x4(dst, stride, above, NULL, 1, 0); + dc_store_4x4(dst, stride, dc); } void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_4x4(dst, stride, NULL, NULL, 0, 0); + dc_store_4x4(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 8x8 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
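For reference, every variant of the refactored DC predictors (4x4 above, and the 8x8/16x16/32x32 versions below) reduces to one rule: average the available border pixels with round-to-nearest, falling back to 128 when neither border is available. A scalar sketch for an n x n block (illustrative, not the library's C reference):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void dc_predictor_sketch(uint8_t *dst, ptrdiff_t stride, int n,
                                const uint8_t *above, const uint8_t *left) {
  int sum = 0, count = 0, dc = 0x80, i;
  if (above != NULL) { for (i = 0; i < n; ++i) sum += above[i]; count += n; }
  if (left != NULL)  { for (i = 0; i < n; ++i) sum += left[i];  count += n; }
  /* count is a power of two, so this rounded divide matches the
   * vrshr_n_u16 shifts used by the NEON versions. */
  if (count) dc = (sum + count / 2) / count;
  for (i = 0; i < n; ++i, dst += stride) memset(dst, dc, n);
}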
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_top = vcombine_u16(p2, p2); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_left = vcombine_u16(p2, p2); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 4); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 3); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 3); - } else { - dc0 = vdup_n_u8(0x80); - } +static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + uint16x4_t sum = vpaddl_u8(ref_u8); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); - } +static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1_u8(dst, dc_dup); } } void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_8x8(dst, stride, above, left, 1, 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); + const uint16x8_t p0 = vpaddlq_u8(above_and_left); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)above; - dc_8x8(dst, stride, NULL, left, 0, 1); + dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)left; - dc_8x8(dst, stride, above, NULL, 1, 0); + dc_store_8x8(dst, stride, dc); } void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_8x8(dst, stride, NULL, NULL, 0, 0); + dc_store_8x8(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 16x16 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
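dc_sum_8() above is the stock NEON horizontal-reduction idiom: vpaddl_u8 widens and sums adjacent byte pairs, then each vpadd_u16 halves the number of distinct partial sums, so 2^k lanes reduce in k pairwise steps and the total ends up replicated across every lane (convenient for the vrshr that follows). Extracted as a standalone sketch:

#include <arm_neon.h>
#include <stdint.h>

/* Horizontal sum of 8 bytes, as in dc_sum_8() above. */
static uint16_t hsum_u8x8_sketch(const uint8_t *p) {
  const uint8x8_t v = vld1_u8(p);
  uint16x4_t s = vpaddl_u8(v); /* 8 bytes -> 4 widened pair sums */
  s = vpadd_u16(s, s);         /* 4 -> 2 */
  s = vpadd_u16(s, s);         /* 2 -> 1, total in every lane */
  return vget_lane_u16(s, 0);
}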
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_top = vcombine_u16(p3, p3); - } - - if (do_left) { - const uint8x16_t L = vld1q_u8(left); // left row - const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_left = vcombine_u16(p3, p3); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 5); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 4); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 4); - } else { - dc0 = vdup_n_u8(0x80); - } +static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { + const uint8x16_t ref_u8 = vld1q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(ref_u8); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 16; ++i) { - vst1q_u8(dst + i * stride, dc); - } +static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + vst1q_u8(dst, dc_dup); } } void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_16x16(dst, stride, above, left, 1, 1); + const uint8x16_t ref0 = vld1q_u8(above); + const uint8x16_t ref1 = vld1q_u8(left); + const uint16x8_t p0 = vpaddlq_u8(ref0); + const uint16x8_t p1 = vpaddlq_u8(ref1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)above; - dc_16x16(dst, stride, NULL, left, 0, 1); + dc_store_16x16(dst, stride, dc); } void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)left; - dc_16x16(dst, stride, above, NULL, 1, 0); + dc_store_16x16(dst, stride, dc); } void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_16x16(dst, stride, NULL, NULL, 0, 0); + dc_store_16x16(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 32x32 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t A1 = vld1q_u8(above + 16); - const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top - const uint16x8_t p1 = vpaddlq_u8(A1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_top = vcombine_u16(p5, p5); - } - - if (do_left) { - const uint8x16_t L0 = vld1q_u8(left); // left row - const uint8x16_t L1 = vld1q_u8(left + 16); - const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left - const uint16x8_t p1 = vpaddlq_u8(L1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_left = vcombine_u16(p5, p5); - } +static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { + const uint8x16x2_t r = vld2q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(r.val[0]); + const uint16x8_t p1 = vpaddlq_u8(r.val[1]); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 6); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 5); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 5); - } else { - dc0 = vdup_n_u8(0x80); - } +static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + uint8x16x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 32; ++i) { - vst1q_u8(dst + i * stride, dc); - vst1q_u8(dst + i * stride + 16, dc); - } + for (i = 0; i < 32; ++i, dst += stride) { + vst2q_u8(dst, dc_dup); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_32x32(dst, stride, above, left, 1, 1); + const uint8x16x2_t a = vld2q_u8(above); + const uint8x16x2_t l = vld2q_u8(left); + const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); + const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); + const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); + const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal = vaddq_u16(pa, pl); + uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)above; - dc_32x32(dst, stride, NULL, left, 0, 1); + dc_store_32x32(dst, stride, dc); } void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)left; - dc_32x32(dst, stride, 
above, NULL, 1, 0); + dc_store_32x32(dst, stride, dc); } void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_32x32(dst, stride, NULL, NULL, 0, 0); + dc_store_32x32(dst, stride, dc); } // ----------------------------------------------------------------------------- void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row - const uint64x1_t A1 = vshr_n_u64(A0, 8); - const uint64x1_t A2 = vshr_n_u64(A0, 16); - const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); + const uint8x8_t ABCDEFGH = vld1_u8(above); + const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); + const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); @@ -331,485 +279,506 @@ void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - dst[3 * stride + 3] = above[7]; + vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); +} + +static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t above_right, uint8x8_t *row) { + *row = vext_u8(*row, above_right, 1); + vst1_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; - static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; - const uint8x8_t sh_12345677 = vld1_u8(shuffle1); - const uint8x8_t sh_23456777 = vld1_u8(shuffle2); - const uint8x8_t A0 = vld1_u8(above); // top row - const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); - const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); + const uint8x8_t A0 = vld1_u8(above); + const uint8x8_t above_right = vdup_lane_u8(A0, 7); + const uint8x8_t A1 = vext_u8(A0, above_right, 1); + const uint8x8_t A2 = vext_u8(A0, above_right, 2); const uint8x8_t avg1 = vhadd_u8(A0, A2); uint8x8_t row = vrhadd_u8(avg1, A1); - int i; (void)left; - for (i = 0; i < 7; ++i) { - vst1_u8(dst + i * stride, row); - row = vtbl1_u8(row, sh_12345677); - } - vst1_u8(dst + i * stride, row); + + vst1_u8(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1_u8(dst, above_right); +} + +static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t above_right, uint8x16_t *row) { + *row = vextq_u8(*row, above_right, 1); + vst1q_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t above_right = vld1q_dup_u8(above + 15); + const uint8x16_t A0 = vld1q_u8(above); + const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); const uint8x16_t A1 = vextq_u8(A0, above_right, 1); const uint8x16_t A2 = vextq_u8(A0, above_right, 2); const uint8x16_t avg1 
= vhaddq_u8(A0, A2); uint8x16_t row = vrhaddq_u8(avg1, A1); - int i; (void)left; - for (i = 0; i < 15; ++i) { - vst1q_u8(dst + i * stride, row); - row = vextq_u8(row, above_right, 1); - } - vst1q_u8(dst + i * stride, row); + + vst1q_u8(dst, row); + dst += stride; + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + vst1q_u8(dst, above_right); } // ----------------------------------------------------------------------------- void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD_u8 = vld1_u8(above - 1); - const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); - const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint8x8_t XABCD = vld1_u8(above - 1); const uint32x2_t zero = vdup_n_u32(0); const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); - const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); - const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); - const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); - const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); - const uint8_t D = vget_lane_u8(XABCD_u8, 4); - const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); - const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); - const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint8x8_t LKJI = vrev64_u8(vreinterpret_u8_u32(IJKL)); + const uint8x8_t LKJIXABC = vext_u8(LKJI, XABCD, 4); + const uint8x8_t KJIXABCD = vext_u8(LKJI, XABCD, 5); + const uint8x8_t JIXABCD0 = + vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(KJIXABCD), 8)); + const uint8x8_t avg1 = vhadd_u8(JIXABCD0, LKJIXABC); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABCD); const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); const uint32x2_t r3 = vreinterpret_u32_u8(avg2); const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); + vst1_lane_u32((uint32_t *)dst, r0, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r1, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r2, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r3, 0); } +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint32_t d = *(const uint32_t *)above; int i; - uint32x2_t d0u32 = vdup_n_u32(0); (void)left; - 
d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) { + *(uint32_t *)dst = d; + } } void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t d = vld1_u8(above); int i; - uint8x8_t d0u8 = vdup_n_u8(0); (void)left; - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); + for (i = 0; i < 8; i++, dst += stride) { + vst1_u8(dst, d); + } } void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vld1q_u8(above); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); + for (i = 0; i < 16; i++, dst += stride) { + vst1q_u8(dst, d); + } } void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); + for (i = 0; i < 32; i++) { + // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8. + // clang-3.8 unrolled the loop fully with no filler so the cause is likely + // the latency of the instruction. + vst1q_u8(dst, d0); + dst += 16; + vst1q_u8(dst, d1); + dst += stride - 16; } } +// ----------------------------------------------------------------------------- + void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); + const uint32x2_t zero = vdup_n_u32(0); + const uint8x8_t left_u8 = + vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0)); + uint8x8_t d; (void)above; - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); + const uint8x8_t left_u8 = vld1_u8(left); + uint8x8_t d; (void)above; - d1u64 = vld1_u64((const uint64_t *)left); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 0); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, 
d0u8); + d = vdup_lane_u8(left_u8, 1); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 2); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 3); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 4); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 5); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 6); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 7); + vst1_u8(dst, d); } void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + const uint8x16_t left_u8q = vld1q_u8(left); + uint8x8_t left_u8d = vget_low_u8(left_u8q); + uint8x16_t d; + int i; (void)above; - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); + for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) { + d = vdupq_lane_u8(left_u8d, 0); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 1); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 2); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 3); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 4); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 5); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 6); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 7); + vst1q_u8(dst, d); dst += stride; } } void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + uint8x16_t d; + int i; (void)above; - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; 
- q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - } + for (i = 0; i < 2; i++, left += 16) { + const uint8x16_t left_u8 = vld1q_u8(left); + const uint8x8_t left_low = vget_low_u8(left_u8); + const uint8x8_t left_high = vget_high_u8(left_u8); + d = vdupq_lane_u8(left_low, 0); + vst1q_u8(dst, d); // Note clang-3.8 produced poor code w/vst2q_u8 + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 1); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 2); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 3); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 4); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 5); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 6); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 7); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + + d = vdupq_lane_u8(left_high, 0); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 1); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 2); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 3); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 4); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 5); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 6); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 7); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; } } +// ----------------------------------------------------------------------------- + +static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) { + return vreinterpretq_s16_u16(vmovl_u8(v)); +} + void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); - - d0u8 = vld1_dup_u8(above - 1); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x8_t above_u8 = vld1_u8(above); + const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8)); + int16x8_t sub, sum; + uint32x2_t d; + + sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8. 
+ sub = vreinterpretq_s16_s64( + vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0)); + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); +} + +static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub) { + const int16x8_t sum = vaddq_s16(left_dup, sub); + const uint8x8_t d = vqmovun_s16(sum); + vst1_u8(*dst, d); + *dst += stride; } void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; - - d0u8 = vld1_dup_u8(above - 1); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + int16x4_t left_s16d = vget_low_s16(left_s16q); + int i; + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + int16x8_t left_dup; + + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub); } } +static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + vst1_u8(*dst, d0); + *dst += 8; + vst1_u8(*dst, d1); + *dst += stride - 8; +} + void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t 
q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - } + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_u8 = vld1q_u8(above); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left))); + int16x8_t left_dup; + int i; + + for (i = 0; i < 2; i++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x4_t left_low = vget_low_s16(left_s16q); + const int16x4_t left_high = vget_high_s16(left_s16q); + + left_dup = vdupq_lane_s16(left_low, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + + left_dup = vdupq_lane_s16(left_high, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); } } +static INLINE void tm_32_kernel(uint8_t **dst, const 
+                                const int16x8_t left_dup, const int16x8_t sub0,
+                                const int16x8_t sub1, const int16x8_t sub2,
+                                const int16x8_t sub3) {
+  const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+  const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+  const int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+  const int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+  const uint8x8_t d0 = vqmovun_s16(sum0);
+  const uint8x8_t d1 = vqmovun_s16(sum1);
+  const uint8x8_t d2 = vqmovun_s16(sum2);
+  const uint8x8_t d3 = vqmovun_s16(sum3);
+
+  vst1q_u8(*dst, vcombine_u8(d0, d1));
+  *dst += 16;
+  vst1q_u8(*dst, vcombine_u8(d2, d3));
+  *dst += stride - 16;
+}
+
 void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  int j, k;
-  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
-  uint8x16_t q0u8, q1u8, q2u8;
-  int16x8_t q12s16, q13s16, q14s16, q15s16;
-  uint16x4_t d6u16;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
-
-  q0u8 = vld1q_dup_u8(above - 1);
-  q1u8 = vld1q_u8(above);
-  q2u8 = vld1q_u8(above + 16);
-  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
-  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
-  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
-  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
-  for (k = 0; k < 4; k++, left += 8) {
-    d26u8 = vld1_u8(left);
-    q3u16 = vmovl_u8(d26u8);
-    d6u16 = vget_low_u16(q3u16);
-    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
-      q0u16 = vdupq_lane_u16(d6u16, 0);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d6u16, 1);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d6u16, 2);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d6u16, 3);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
+  const uint8x16_t top_left = vld1q_dup_u8(above - 1);
+  const uint8x16_t above_low = vld1q_u8(above);
+  const uint8x16_t above_high = vld1q_u8(above + 16);
+  const int16x8_t sub0 = vreinterpretq_s16_u16(
+      vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left)));
+  const int16x8_t sub1 = vreinterpretq_s16_u16(
+      vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left)));
+  const int16x8_t sub2 = vreinterpretq_s16_u16(
+      vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left)));
+  const int16x8_t sub3 = vreinterpretq_s16_u16(
+      vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left)));
+  int16x8_t left_dup;
+  int i, j;
+
+  for (j = 0; j < 4; j++, left += 8) {
+    const uint8x8_t left_u8 = vld1_u8(left);
+    const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+    int16x4_t left_s16d = vget_low_s16(left_s16q);
+    for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+      left_dup = vdupq_lane_s16(left_s16d, 0);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+      left_dup = vdupq_lane_s16(left_s16d, 1);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+      left_dup = vdupq_lane_s16(left_s16d, 2);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+      left_dup = vdupq_lane_s16(left_s16d, 3);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
     }
   }
 }
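Note: tm_16_kernel/tm_32_kernel above implement VP9's TM (true-motion) intra
predictor, where each output pixel is left[r] + above[c] - above[-1],
saturated to [0, 255]. A minimal scalar sketch of the same computation
(tm_predictor_ref is an illustrative name, not part of this patch):

#include <stddef.h>
#include <stdint.h>

/* Scalar reference for the TM predictor. The NEON kernels do the same math
 * vectorized: vsubl_u8 forms (above - top_left) as widened int16 lanes,
 * vaddq_s16 adds the duplicated left pixel, and vqmovun_s16 saturates the
 * sums back to [0, 255]. */
static void tm_predictor_ref(uint8_t *dst, ptrdiff_t stride, int size,
                             const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) {
      const int v = left[r] + above[c] - top_left;
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    dst += stride;
  }
}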
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index dad34e4b4..8c91b141f 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -204,18 +204,15 @@ DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
 else
 ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/idct4x4_add_neon.c
-DSP_SRCS-yes += arm/idct8x8_add_neon.c
 DSP_SRCS-yes += arm/idct16x16_add_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
 
 DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
@@ -232,14 +229,20 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/idct_neon$(ASM)
 DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
 else
 DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
 endif  # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
 endif  # CONFIG_VP9
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index fa6294142..7f31a6a11 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -644,16 +644,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 } else {
   add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct4x4_16_add sse2/;
+  specialize qw/vpx_idct4x4_16_add neon sse2/;
 
   add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vpx_idct4x4_1_add neon sse2/;
 
   add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64";
+  specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64";
+  specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vpx_idct8x8_1_add neon sse2/;
@@ -764,8 +764,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
-  # Need to add 34 eob idct32x32 neon implementation.
-  $vpx_idct32x32_34_add_neon=vpx_idct32x32_1024_add_neon;
 
   add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
 
diff --git a/vpxdec.c b/vpxdec.c
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -781,7 +781,7 @@ static int main_loop(int argc, const char **argv_) {
         warn("Failed to decode frame %d: %s", frame_in,
              vpx_codec_error(&decoder));
         if (detail) warn("Additional information: %s", detail);
-        frames_corrupted++;
+        corrupted = 1;
         if (!keep_going) goto fail;
       }
 
@@ -800,7 +800,7 @@ static int main_loop(int argc, const char **argv_) {
       // Flush the decoder in frame parallel decode.
       if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) {
         warn("Failed to flush decoder: %s", vpx_codec_error(&decoder));
-        frames_corrupted++;
+        corrupted = 1;
         if (!keep_going) goto fail;
       }
     }
@@ -814,7 +814,7 @@ static int main_loop(int argc, const char **argv_) {
     vpx_usec_timer_mark(&timer);
     dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
 
-    if (!frame_parallel &&
+    if (!frame_parallel && !corrupted &&
         vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
       warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
       if (!keep_going) goto fail;
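Note: in the vpxdec.c hunks above, a decode or flush failure now sets the
corrupted flag instead of bumping frames_corrupted directly, and the added
!corrupted guard skips the VP8D_GET_FRAME_CORRUPTED query once the flag is
already set, so a bad frame is counted at most once. A toy model of that
control flow, assuming frames_corrupted is accumulated from the flag in
surrounding code not shown in these hunks (all names here are illustrative,
not libvpx API):

#include <stdio.h>

/* Stand-in for the VP8D_GET_FRAME_CORRUPTED control call. */
static int query_decoder_corrupted(void) { return 0; /* pretend clean */ }

int main(void) {
  int frames_corrupted = 0;
  int frame;
  for (frame = 0; frame < 3; ++frame) {
    int corrupted = 0;
    const int decode_failed = (frame == 1); /* simulate one failed decode */
    if (decode_failed) corrupted = 1;       /* was: frames_corrupted++ */
    /* Query the decoder only when no failure was recorded for this frame. */
    if (!corrupted) corrupted = query_decoder_corrupted();
    frames_corrupted += corrupted;          /* each frame counted once */
  }
  printf("%d frame(s) corrupted\n", frames_corrupted); /* prints 1 */
  return 0;
}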