diff options
-rw-r--r-- | examples/vpx_temporal_svc_encoder.c | 58 | ||||
-rw-r--r-- | test/datarate_test.cc | 70 | ||||
-rw-r--r-- | test/encode_test_driver.h | 7 | ||||
-rw-r--r-- | test/hadamard_test.cc | 5 | ||||
-rw-r--r-- | test/set_roi.cc | 8 | ||||
-rw-r--r-- | vp8/encoder/onyx_if.c | 19 | ||||
-rw-r--r-- | vp8/encoder/onyx_int.h | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 25 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.h | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.c | 2 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vpx_dsp/x86/avg_intrin_avx2.c | 173 | ||||
-rw-r--r-- | vpx_dsp/x86/bitdepth_conversion_avx2.h | 13 |
16 files changed, 369 insertions, 28 deletions
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index 72ea396d1..bff6e1722 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -26,6 +26,8 @@ #include "../tools_common.h" #include "../video_writer.h" +#define VP8_ROI_MAP 0 + static const char *exec_name; void usage_exit(void) { exit(EXIT_FAILURE); } @@ -154,6 +156,53 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! \n"); } +#if VP8_ROI_MAP +static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) { + unsigned int i, j; + memset(roi, 0, sizeof(*roi)); + + // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for + // segment is 16x16 for vp8, 8x8 for vp9. + roi->rows = (cfg->g_h + 15) / 16; + roi->cols = (cfg->g_w + 15) / 16; + + // Applies delta QP on the segment blocks, varies from -63 to 63. + // Setting to negative means lower QP (better quality). + // Below we set delta_q to the extreme (-63) to show strong effect. + roi->delta_q[0] = 0; + roi->delta_q[1] = -63; + roi->delta_q[2] = 0; + roi->delta_q[3] = 0; + + // Applies delta loopfilter strength on the segment blocks, varies from -63 to + // 63. Setting to positive means stronger loopfilter. + roi->delta_lf[0] = 0; + roi->delta_lf[1] = 0; + roi->delta_lf[2] = 0; + roi->delta_lf[3] = 0; + + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + roi->static_threshold[0] = 0; + roi->static_threshold[1] = 0; + roi->static_threshold[2] = 0; + roi->static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) && + j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) { + roi->roi_map[i * roi->cols + j] = 1; + } + } + } +} +#endif + // Temporal scaling parameters: // NOTE: The 3 prediction frames cannot be used interchangeably due to // differences in the way they are handled throughout the code. The @@ -506,6 +555,9 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; +#if VP8_ROI_MAP + vpx_roi_map_t roi; +#endif #if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) vpx_svc_layer_id_t layer_id = { 0, 0 }; #else @@ -710,6 +762,12 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); +#if VP8_ROI_MAP + vp8_set_roi_map(&cfg, &roi); + if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); +#endif + } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); diff --git a/test/datarate_test.cc b/test/datarate_test.cc index 7b0d62818..7ae761fd4 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -44,6 +44,7 @@ class DatarateTestLarge denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; gf_boost_ = 0; + use_roi_ = 0; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -54,6 +55,12 @@ class DatarateTestLarge encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); } +#if CONFIG_VP8_ENCODER + if (use_roi_ == 1) { + encoder->Control(VP8E_SET_ROI_MAP, &roi_); + } +#endif + if (denoiser_offon_test_) { ASSERT_GT(denoiser_offon_period_, 0) << "denoiser_offon_period_ is not positive."; @@ -145,6 +152,8 @@ class DatarateTestLarge int denoiser_offon_period_; int set_cpu_used_; int gf_boost_; + int use_roi_; + vpx_roi_map_t roi_; }; #if CONFIG_TEMPORAL_DENOISING @@ -414,6 +423,67 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { << " The datarate for the file missed the target!"; } +TEST_P(DatarateTestRealTime, RegionOfInterest) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // Encode using multiple threads. + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 352; + cfg_.g_h = 288; + + ResetModel(); + + // Set ROI parameters + use_roi_ = 1; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 15) / 16; + roi_.cols = (cfg_.g_w + 15) / 16; + + roi_.delta_q[0] = 0; + roi_.delta_q[1] = -20; + roi_.delta_q[2] = 0; + roi_.delta_q[3] = 0; + + roi_.delta_lf[0] = 0; + roi_.delta_lf[1] = -20; + roi_.delta_lf[2] = 0; + roi_.delta_lf[3] = 0; + + roi_.static_threshold[0] = 0; + roi_.static_threshold[1] = 1000; + roi_.static_threshold[2] = 0; + roi_.static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. + roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + TEST_P(DatarateTestRealTime, GFBoost) { denoiser_on_ = 0; cfg_.rc_buf_initial_sz = 500; diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 08a57ad77..1b4a5a671 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -139,6 +139,13 @@ class Encoder { } #endif +#if CONFIG_VP8_ENCODER + void Control(int ctrl_id, vpx_roi_map_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif + void Config(const vpx_codec_enc_cfg_t *cfg) { const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index a55b15ad0..eacd84635 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -268,6 +268,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_sse2)); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test, + ::testing::Values(&vpx_hadamard_16x16_avx2)); +#endif // HAVE_AVX2 + #if HAVE_VSX INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_vsx)); diff --git a/test/set_roi.cc b/test/set_roi.cc index 38711a806..f63954752 100644 --- a/test/set_roi.cc +++ b/test/set_roi.cc @@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) { if (deltas_valid != roi_retval) break; } - // Test that we report and error if cyclic refresh is enabled. - cpi.cyclic_refresh_mode_enabled = 1; - roi_retval = - vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols, - delta_q, delta_lf, threshold); - EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error"; - cpi.cyclic_refresh_mode_enabled = 0; - // Test invalid number of rows or colums. roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1, diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 2c2a783a9..5d714e122 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1553,9 +1553,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { setup_features(cpi); - { + if (!cpi->use_roi_static_threshold) { int i; - for (i = 0; i < MAX_MB_SEGMENTS; ++i) { cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } @@ -1815,6 +1814,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->active_map_enabled = 0; + cpi->use_roi_static_threshold = 0; + #if 0 /* Experimental code for lagged and one pass */ /* Initialise one_pass GF frames stats */ @@ -5354,9 +5355,6 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, const int range = 63; int i; - // This method is currently incompatible with the cyclic refresh method - if (cpi->cyclic_refresh_mode_enabled) return -1; - // Check number of rows and columns match if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) { return -1; @@ -5375,7 +5373,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, return -1; } - if (!map) { + // Also disable segmentation if no deltas are specified. + if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 && + delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 && + delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 && + threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) { disable_segmentation(cpi); return 0; } @@ -5412,6 +5414,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, /* Initialise the feature data structure */ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); + if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 || + threshold[3] != 0) + cpi->use_roi_static_threshold = 1; + cpi->cyclic_refresh_mode_enabled = 0; + return 0; } diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 0ee2d3553..c489b46c2 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -692,6 +692,9 @@ typedef struct VP8_COMP { int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; } rd_costs; + + // Use the static threshold from ROI settings. + int use_roi_static_threshold; } VP8_COMP; void vp8_initialize_enc(void); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 765c0578b..20901d21d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3956,6 +3956,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // rate miss. If so adjust the active maxQ for the subsequent frames. if (q > cpi->twopass.active_worst_quality) { cpi->twopass.active_worst_quality = q; +#ifdef CORPUS_VBR_EXPERIMENT + } else if (q == q_low && rc->projected_frame_size < rc->this_frame_target) { + cpi->twopass.active_worst_quality = + VPXMAX(q, cpi->twopass.active_worst_quality - 1); +#endif } } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 9d9779f7b..f9494dc51 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -108,7 +108,7 @@ static void output_stats(FIRSTPASS_STATS *stats, fpfile = fopen("firstpass.stt", "a"); fprintf(fpfile, - "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf" + "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf" "%12.4lf" diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 73d78a30c..a936ec943 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -44,8 +44,6 @@ #define MIN_BPB_FACTOR 0.005 #define MAX_BPB_FACTOR 50 -#define FRAME_OVERHEAD_BITS 200 - #if CONFIG_VP9_HIGHBITDEPTH #define ASSIGN_MINQ_TABLE(bit_depth, name) \ do { \ @@ -212,18 +210,23 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; - const int min_frame_target = - VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); - if (target < min_frame_target) target = min_frame_target; - if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. - target = min_frame_target; + + if (cpi->oxcf.pass != 2) { + const int min_frame_target = + VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + if (target < min_frame_target) target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; + } } + // Clip the frame target to the maximum allowed value. if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { const int max_rate = rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index f851e4286..61e50e9f7 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -35,6 +35,8 @@ extern "C" { #define FIXED_GF_INTERVAL 8 // Used in some testing modes only #define ONEHALFONLY_RESIZE 0 +#define FRAME_OVERHEAD_BITS 200 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index e5499d6dd..cebaca7fc 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -583,6 +583,8 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->svc.non_reference_frame) sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE; } + if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; // Enable partition copy. For SVC only enabled for top spatial resolution // layer. cpi->max_copied_frame = 0; diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index fa5feca16..808ee36de 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -290,6 +290,7 @@ endif # avg DSP_SRCS-yes += avg.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 474f50519..16b1f235a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -769,7 +769,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64"; add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_16x16 sse2 neon vsx/; + specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd sse2 neon/; @@ -778,7 +778,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64"; add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_16x16 sse2 neon msa vsx/; + specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; specialize qw/vpx_satd sse2 neon msa/; diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c new file mode 100644 index 000000000..3fc00f6df --- /dev/null +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_ports/mem.h" + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, a3); + b5 = _mm256_unpacklo_epi32(a6, a7); + b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(int16_t const *src_diff, int src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +void vpx_hadamard_16x16_avx2(int16_t const *src_diff, int src_stride, + tran_low_t *coeff) { + int idx; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + + for (idx = 0; idx < 2; ++idx) { + int16_t const *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + + coeff += 16; + t_coeff += 16; + } +} diff --git a/vpx_dsp/x86/bitdepth_conversion_avx2.h b/vpx_dsp/x86/bitdepth_conversion_avx2.h index b9116f049..b8fd1cb58 100644 --- a/vpx_dsp/x86/bitdepth_conversion_avx2.h +++ b/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -27,4 +27,17 @@ static INLINE __m256i load_tran_low(const tran_low_t *a) { #endif } +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_hi = _mm256_mulhi_epi16(a, one); + const __m256i a_lo = _mm256_mullo_epi16(a, one); + const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); + const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); + _mm256_storeu_si256((__m256i *)b, a_1); + _mm256_storeu_si256((__m256i *)(b + 8), a_2); +#else + _mm256_storeu_si256((__m256i *)b, a); +#endif +} #endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ |