summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- examples/vpx_temporal_svc_encoder.c 58
-rw-r--r-- test/datarate_test.cc 70
-rw-r--r-- test/encode_test_driver.h 7
-rw-r--r-- test/hadamard_test.cc 5
-rw-r--r-- test/set_roi.cc 8
-rw-r--r-- vp8/encoder/onyx_if.c 19
-rw-r--r-- vp8/encoder/onyx_int.h 3
-rw-r--r-- vp9/encoder/vp9_encoder.c 5
-rw-r--r-- vp9/encoder/vp9_firstpass.c 2
-rw-r--r-- vp9/encoder/vp9_ratectrl.c 25
-rw-r--r-- vp9/encoder/vp9_ratectrl.h 2
-rw-r--r-- vp9/encoder/vp9_speed_features.c 2
-rw-r--r-- vpx_dsp/vpx_dsp.mk 1
-rw-r--r-- vpx_dsp/vpx_dsp_rtcd_defs.pl 4
-rw-r--r-- vpx_dsp/x86/avg_intrin_avx2.c 173
-rw-r--r-- vpx_dsp/x86/bitdepth_conversion_avx2.h 13
16 files changed, 369 insertions, 28 deletions
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index 72ea396d1..bff6e1722 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -26,6 +26,8 @@
#include "../tools_common.h"
#include "../video_writer.h"
+#define VP8_ROI_MAP 0
+
static const char *exec_name;
void usage_exit(void) { exit(EXIT_FAILURE); }
@@ -154,6 +156,53 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
die("Error: Number of input frames not equal to output! \n");
}
+#if VP8_ROI_MAP
+// Fills |roi| with a demonstration region-of-interest setup for VP8: the
+// macroblocks in the middle of the frame are put in segment 1, which gets
+// delta_q = -63; every other macroblock stays in segment 0 with no deltas.
+//
+// |cfg| supplies the frame dimensions (g_w x g_h). On return roi->roi_map
+// points to a calloc()ed rows * cols byte map; the caller owns it and is
+// responsible for free()ing it. Calls die() if the map cannot be allocated.
+static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) {
+  unsigned int i, j;
+  memset(roi, 0, sizeof(*roi));
+
+  // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for
+  // segment is 16x16 for vp8, 8x8 for vp9.
+  roi->rows = (cfg->g_h + 15) / 16;
+  roi->cols = (cfg->g_w + 15) / 16;
+
+  // Applies delta QP on the segment blocks, varies from -63 to 63.
+  // Setting to negative means lower QP (better quality).
+  // Below we set delta_q to the extreme (-63) to show strong effect.
+  roi->delta_q[0] = 0;
+  roi->delta_q[1] = -63;
+  roi->delta_q[2] = 0;
+  roi->delta_q[3] = 0;
+
+  // Applies delta loopfilter strength on the segment blocks, varies from -63 to
+  // 63. Setting to positive means stronger loopfilter.
+  roi->delta_lf[0] = 0;
+  roi->delta_lf[1] = 0;
+  roi->delta_lf[2] = 0;
+  roi->delta_lf[3] = 0;
+
+  // Applies skip encoding threshold on the segment blocks, varies from 0 to
+  // UINT_MAX. Larger value means more skipping of encoding is possible.
+  // This skip threshold only applies on delta frames.
+  roi->static_threshold[0] = 0;
+  roi->static_threshold[1] = 0;
+  roi->static_threshold[2] = 0;
+  roi->static_threshold[3] = 0;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi->roi_map =
+      (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map));
+  // The loop below writes through roi_map, so bail out on allocation failure
+  // instead of dereferencing NULL.
+  if (roi->roi_map == NULL) die("Error: Failed to allocate ROI map! \n");
+  for (i = 0; i < roi->rows; ++i) {
+    for (j = 0; j < roi->cols; ++j) {
+      if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) &&
+          j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) {
+        roi->roi_map[i * roi->cols + j] = 1;
+      }
+    }
+  }
+}
+#endif
+
// Temporal scaling parameters:
// NOTE: The 3 prediction frames cannot be used interchangeably due to
// differences in the way they are handled throughout the code. The
@@ -506,6 +555,9 @@ int main(int argc, char **argv) {
int layering_mode = 0;
int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 };
int flag_periodicity = 1;
+#if VP8_ROI_MAP
+ vpx_roi_map_t roi;
+#endif
#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
vpx_svc_layer_id_t layer_id = { 0, 0 };
#else
@@ -710,6 +762,12 @@ int main(int argc, char **argv) {
vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0);
+#if VP8_ROI_MAP
+ vp8_set_roi_map(&cfg, &roi);
+ if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
+ die_codec(&codec, "Failed to set ROI map");
+#endif
+
} else if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_svc_extra_cfg_t svc_params;
memset(&svc_params, 0, sizeof(svc_params));
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 7b0d62818..7ae761fd4 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -44,6 +44,7 @@ class DatarateTestLarge
denoiser_offon_test_ = 0;
denoiser_offon_period_ = -1;
gf_boost_ = 0;
+ use_roi_ = 0;
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
@@ -54,6 +55,12 @@ class DatarateTestLarge
encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_);
}
+#if CONFIG_VP8_ENCODER
+ if (use_roi_ == 1) {
+ encoder->Control(VP8E_SET_ROI_MAP, &roi_);
+ }
+#endif
+
if (denoiser_offon_test_) {
ASSERT_GT(denoiser_offon_period_, 0)
<< "denoiser_offon_period_ is not positive.";
@@ -145,6 +152,8 @@ class DatarateTestLarge
int denoiser_offon_period_;
int set_cpu_used_;
int gf_boost_;
+ int use_roi_;
+ vpx_roi_map_t roi_;
};
#if CONFIG_TEMPORAL_DENOISING
@@ -414,6 +423,67 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
<< " The datarate for the file missed the target!";
}
+// Encodes a CIF clip in CBR mode with two threads and a VP8 region-of-interest
+// map enabled (segment 1 = center of the frame), then checks that the achieved
+// datarate stays close to the 450 kbps target.
+TEST_P(DatarateTestRealTime, RegionOfInterest) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = 1;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 15) / 16;
+  roi_.cols = (cfg_.g_w + 15) / 16;
+
+  roi_.delta_q[0] = 0;
+  roi_.delta_q[1] = -20;
+  roi_.delta_q[2] = 0;
+  roi_.delta_q[3] = 0;
+
+  roi_.delta_lf[0] = 0;
+  roi_.delta_lf[1] = -20;
+  roi_.delta_lf[2] = 0;
+  roi_.delta_lf[3] = 0;
+
+  roi_.static_threshold[0] = 0;
+  roi_.static_threshold[1] = 1000;
+  roi_.static_threshold[2] = 0;
+  roi_.static_threshold[3] = 0;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi_.roi_map =
+      (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map));
+  // Nothing has been allocated yet if this fails, so a fatal assert is safe.
+  ASSERT_TRUE(roi_.roi_map != NULL);
+  for (unsigned int i = 0; i < roi_.rows; ++i) {
+    for (unsigned int j = 0; j < roi_.cols; ++j) {
+      if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
+          j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
+        roi_.roi_map[i * roi_.cols + j] = 1;
+      }
+    }
+  }
+
+  // From here on only non-fatal checks are used: a fatal ASSERT_* would return
+  // from the test body before the map is released below.
+  EXPECT_NO_FATAL_FAILURE(RunLoop(&video));
+  if (!HasFatalFailure()) {
+    EXPECT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+
+    EXPECT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+
+  free(roi_.roi_map);
+  roi_.roi_map = NULL;
+}
+
TEST_P(DatarateTestRealTime, GFBoost) {
denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 08a57ad77..1b4a5a671 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -139,6 +139,13 @@ class Encoder {
}
#endif
+#if CONFIG_VP8_ENCODER
+ // Control() overload taking a vpx_roi_map_t: forwards |ctrl_id| and |arg| to
+ // vpx_codec_control_() and asserts that the call returns VPX_CODEC_OK,
+ // attaching EncoderError() to the failure message otherwise.
+ void Control(int ctrl_id, vpx_roi_map_t *arg) {
+ const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+ }
+#endif
+
void Config(const vpx_codec_enc_cfg_t *cfg) {
const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index a55b15ad0..eacd84635 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -268,6 +268,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_sse2));
#endif // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test,
+ ::testing::Values(&vpx_hadamard_16x16_avx2));
+#endif // HAVE_AVX2
+
#if HAVE_VSX
INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_vsx));
diff --git a/test/set_roi.cc b/test/set_roi.cc
index 38711a806..f63954752 100644
--- a/test/set_roi.cc
+++ b/test/set_roi.cc
@@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) {
if (deltas_valid != roi_retval) break;
}
- // Test that we report and error if cyclic refresh is enabled.
- cpi.cyclic_refresh_mode_enabled = 1;
- roi_retval =
- vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols,
- delta_q, delta_lf, threshold);
- EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error";
- cpi.cyclic_refresh_mode_enabled = 0;
-
// Test invalid number of rows or colums.
roi_retval =
vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1,
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 2c2a783a9..5d714e122 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1553,9 +1553,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
setup_features(cpi);
- {
+ if (!cpi->use_roi_static_threshold) {
int i;
-
for (i = 0; i < MAX_MB_SEGMENTS; ++i) {
cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
}
@@ -1815,6 +1814,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->active_map_enabled = 0;
+ cpi->use_roi_static_threshold = 0;
+
#if 0
/* Experimental code for lagged and one pass */
/* Initialise one_pass GF frames stats */
@@ -5354,9 +5355,6 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
const int range = 63;
int i;
- // This method is currently incompatible with the cyclic refresh method
- if (cpi->cyclic_refresh_mode_enabled) return -1;
-
// Check number of rows and columns match
if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) {
return -1;
@@ -5375,7 +5373,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
return -1;
}
- if (!map) {
+ // Also disable segmentation if no deltas are specified.
+ if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 &&
+ delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 &&
+ delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 &&
+ threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) {
disable_segmentation(cpi);
return 0;
}
@@ -5412,6 +5414,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
/* Initialise the feature data structure */
set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+ if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 ||
+ threshold[3] != 0)
+ cpi->use_roi_static_threshold = 1;
+ cpi->cyclic_refresh_mode_enabled = 0;
+
return 0;
}
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 0ee2d3553..c489b46c2 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -692,6 +692,9 @@ typedef struct VP8_COMP {
int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
[MAX_ENTROPY_TOKENS];
} rd_costs;
+
+ // Use the static threshold from ROI settings.
+ int use_roi_static_threshold;
} VP8_COMP;
void vp8_initialize_enc(void);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 765c0578b..20901d21d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3956,6 +3956,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
// rate miss. If so adjust the active maxQ for the subsequent frames.
if (q > cpi->twopass.active_worst_quality) {
cpi->twopass.active_worst_quality = q;
+#ifdef CORPUS_VBR_EXPERIMENT
+ } else if (q == q_low && rc->projected_frame_size < rc->this_frame_target) {
+ cpi->twopass.active_worst_quality =
+ VPXMAX(q, cpi->twopass.active_worst_quality - 1);
+#endif
}
}
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 9d9779f7b..f9494dc51 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -108,7 +108,7 @@ static void output_stats(FIRSTPASS_STATS *stats,
fpfile = fopen("firstpass.stt", "a");
fprintf(fpfile,
- "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+ "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf"
"%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
"%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf"
"%12.4lf"
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 73d78a30c..a936ec943 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -44,8 +44,6 @@
#define MIN_BPB_FACTOR 0.005
#define MAX_BPB_FACTOR 50
-#define FRAME_OVERHEAD_BITS 200
-
#if CONFIG_VP9_HIGHBITDEPTH
#define ASSIGN_MINQ_TABLE(bit_depth, name) \
do { \
@@ -212,18 +210,23 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
const RATE_CONTROL *rc = &cpi->rc;
const VP9EncoderConfig *oxcf = &cpi->oxcf;
- const int min_frame_target =
- VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
- if (target < min_frame_target) target = min_frame_target;
- if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
- // If there is an active ARF at this location use the minimum
- // bits on this frame even if it is a constructed arf.
- // The active maximum quantizer insures that an appropriate
- // number of bits will be spent if needed for constructed ARFs.
- target = min_frame_target;
+
+ if (cpi->oxcf.pass != 2) {
+ const int min_frame_target =
+ VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+ if (target < min_frame_target) target = min_frame_target;
+ if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
+ // If there is an active ARF at this location use the minimum
+ // bits on this frame even if it is a constructed arf.
+ // The active maximum quantizer insures that an appropriate
+ // number of bits will be spent if needed for constructed ARFs.
+ target = min_frame_target;
+ }
}
+
// Clip the frame target to the maximum allowed value.
if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+
if (oxcf->rc_max_inter_bitrate_pct) {
const int max_rate =
rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index f851e4286..61e50e9f7 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -35,6 +35,8 @@ extern "C" {
#define FIXED_GF_INTERVAL 8 // Used in some testing modes only
#define ONEHALFONLY_RESIZE 0
+#define FRAME_OVERHEAD_BITS 200
+
typedef enum {
INTER_NORMAL = 0,
INTER_HIGH = 1,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index e5499d6dd..cebaca7fc 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -583,6 +583,8 @@ static void set_rt_speed_feature_framesize_independent(
if (cpi->svc.non_reference_frame)
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
}
+ if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1)
+ sf->adaptive_rd_thresh_row_mt = 1;
// Enable partition copy. For SVC only enabled for top spatial resolution
// layer.
cpi->max_copied_frame = 0;
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index fa5feca16..808ee36de 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -290,6 +290,7 @@ endif
# avg
DSP_SRCS-yes += avg.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 474f50519..16b1f235a 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -769,7 +769,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_16x16 sse2 neon vsx/;
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd sse2 neon/;
@@ -778,7 +778,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_16x16 sse2 neon msa vsx/;
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd sse2 neon msa/;
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 000000000..3fc00f6df
--- /dev/null
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_ports/mem.h"
+
+// Runs one pass of a three-stage add/subtract butterfly over the eight vectors
+// in[0..7], treating each vector as sixteen 16-bit values. The adds and
+// subtracts are the plain (non-saturating) epi16 forms.
+//
+// iter == 0: the butterfly outputs are additionally rearranged by the
+//            unpack epi16 -> epi32 -> epi64 sequence before being written
+//            back to in[].
+// iter != 0: the butterfly outputs are written straight back to in[].
+//
+// NOTE(review): the unpack intrinsics interleave within each 128-bit half of
+// the register, so the iter == 0 path presumably transposes two independent
+// 8x8 blocks (one per half) -- confirm against the callers.
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ // Stage 1: sum and difference of adjacent pairs (0,1) (2,3) (4,5) (6,7).
+ __m256i b0 = _mm256_add_epi16(a0, a1);
+ __m256i b1 = _mm256_sub_epi16(a0, a1);
+ __m256i b2 = _mm256_add_epi16(a2, a3);
+ __m256i b3 = _mm256_sub_epi16(a2, a3);
+ __m256i b4 = _mm256_add_epi16(a4, a5);
+ __m256i b5 = _mm256_sub_epi16(a4, a5);
+ __m256i b6 = _mm256_add_epi16(a6, a7);
+ __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+ // Stage 2: combine stage-1 results two apart, within each group of four.
+ a0 = _mm256_add_epi16(b0, b2);
+ a1 = _mm256_add_epi16(b1, b3);
+ a2 = _mm256_sub_epi16(b0, b2);
+ a3 = _mm256_sub_epi16(b1, b3);
+ a4 = _mm256_add_epi16(b4, b6);
+ a5 = _mm256_add_epi16(b5, b7);
+ a6 = _mm256_sub_epi16(b4, b6);
+ a7 = _mm256_sub_epi16(b5, b7);
+
+ // Stage 3 combines a[k] with a[k + 4]. In both branches the results are
+ // assigned to slots 0, 7, 3, 4, 2, 6, 1, 5 (in that order) rather than 0..7.
+ if (iter == 0) {
+ b0 = _mm256_add_epi16(a0, a4);
+ b7 = _mm256_add_epi16(a1, a5);
+ b3 = _mm256_add_epi16(a2, a6);
+ b4 = _mm256_add_epi16(a3, a7);
+ b2 = _mm256_sub_epi16(a0, a4);
+ b6 = _mm256_sub_epi16(a1, a5);
+ b1 = _mm256_sub_epi16(a2, a6);
+ b5 = _mm256_sub_epi16(a3, a7);
+
+ // Interleave 16-bit elements of row pairs.
+ a0 = _mm256_unpacklo_epi16(b0, b1);
+ a1 = _mm256_unpacklo_epi16(b2, b3);
+ a2 = _mm256_unpackhi_epi16(b0, b1);
+ a3 = _mm256_unpackhi_epi16(b2, b3);
+ a4 = _mm256_unpacklo_epi16(b4, b5);
+ a5 = _mm256_unpacklo_epi16(b6, b7);
+ a6 = _mm256_unpackhi_epi16(b4, b5);
+ a7 = _mm256_unpackhi_epi16(b6, b7);
+
+ // Interleave the resulting 32-bit groups.
+ b0 = _mm256_unpacklo_epi32(a0, a1);
+ b1 = _mm256_unpacklo_epi32(a4, a5);
+ b2 = _mm256_unpackhi_epi32(a0, a1);
+ b3 = _mm256_unpackhi_epi32(a4, a5);
+ b4 = _mm256_unpacklo_epi32(a2, a3);
+ b5 = _mm256_unpacklo_epi32(a6, a7);
+ b6 = _mm256_unpackhi_epi32(a2, a3);
+ b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ // Interleave the resulting 64-bit groups and write back.
+ in[0] = _mm256_unpacklo_epi64(b0, b1);
+ in[1] = _mm256_unpackhi_epi64(b0, b1);
+ in[2] = _mm256_unpacklo_epi64(b2, b3);
+ in[3] = _mm256_unpackhi_epi64(b2, b3);
+ in[4] = _mm256_unpacklo_epi64(b4, b5);
+ in[5] = _mm256_unpackhi_epi64(b4, b5);
+ in[6] = _mm256_unpacklo_epi64(b6, b7);
+ in[7] = _mm256_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm256_add_epi16(a0, a4);
+ in[7] = _mm256_add_epi16(a1, a5);
+ in[3] = _mm256_add_epi16(a2, a6);
+ in[4] = _mm256_add_epi16(a3, a7);
+ in[2] = _mm256_sub_epi16(a0, a4);
+ in[6] = _mm256_sub_epi16(a1, a5);
+ in[1] = _mm256_sub_epi16(a2, a6);
+ in[5] = _mm256_sub_epi16(a3, a7);
+ }
+}
+
+// Transforms an 8-row by 16-column strip of |src_diff| (rows |src_stride|
+// int16 elements apart) and writes 128 int16 results to |coeff|.
+//
+// Each row is loaded as one 256-bit vector, the eight vectors go through
+// hadamard_col8x2_avx2() twice (iter 0, then iter 1), and the results are
+// stored as: the low 128-bit halves of rows (0,1), (2,3), (4,5), (6,7) at
+// coeff[0..63], followed by the high halves of the same pairs at
+// coeff[64..127].
+static void hadamard_8x8x2_avx2(int16_t const *src_diff, int src_stride,
+                                int16_t *coeff) {
+  __m256i rows[8];
+  int r;
+  int pair;
+
+  for (r = 0; r < 8; ++r) {
+    rows[r] = _mm256_loadu_si256((const __m256i *)(src_diff + r * src_stride));
+  }
+
+  hadamard_col8x2_avx2(rows, 0);
+  hadamard_col8x2_avx2(rows, 1);
+
+  // Selector 0x20: low half of the first operand, then low half of the second.
+  for (pair = 0; pair < 4; ++pair) {
+    const __m256i lo_halves =
+        _mm256_permute2x128_si256(rows[2 * pair], rows[2 * pair + 1], 0x20);
+    _mm256_storeu_si256((__m256i *)(coeff + 16 * pair), lo_halves);
+  }
+
+  // Selector 0x31: high half of the first operand, then high half of the
+  // second.
+  for (pair = 0; pair < 4; ++pair) {
+    const __m256i hi_halves =
+        _mm256_permute2x128_si256(rows[2 * pair], rows[2 * pair + 1], 0x31);
+    _mm256_storeu_si256((__m256i *)(coeff + 64 + 16 * pair), hi_halves);
+  }
+}
+
+// AVX2 version of vpx_hadamard_16x16: reads a 16x16 block of int16 residuals
+// from |src_diff| (rows |src_stride| elements apart) and writes 256
+// coefficients to |coeff|.
+//
+// Step 1 runs hadamard_8x8x2_avx2() on the top and bottom 8 rows, producing
+// four groups of 64 intermediate values in t_coeff. Step 2 combines those four
+// groups with add/sub, halving (arithmetic shift right by 1) after the first
+// level.
+void vpx_hadamard_16x16_avx2(int16_t const *src_diff, int src_stride,
+ tran_low_t *coeff) {
+ int idx;
+#if CONFIG_VP9_HIGHBITDEPTH
+ // Intermediate 16-bit results go to a separate aligned buffer; the final
+ // values are written to |coeff| through store_tran_low().
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ // The intermediate results are written directly into |coeff|, so step 2
+ // below operates in place.
+ int16_t *t_coeff = coeff;
+#endif
+
+ // Step 1: each call handles 8 rows and emits 128 values.
+ for (idx = 0; idx < 2; ++idx) {
+ int16_t const *src_ptr = src_diff + idx * 8 * src_stride;
+ hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
+ }
+
+ // Step 2: 16 values per iteration from each of the four 64-value groups.
+ // All four loads happen before any store, which keeps the in-place case
+ // (t_coeff aliasing coeff) correct.
+ for (idx = 0; idx < 64; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ // Halve after the first add/sub level.
+ b0 = _mm256_srai_epi16(b0, 1);
+ b1 = _mm256_srai_epi16(b1, 1);
+ b2 = _mm256_srai_epi16(b2, 1);
+ b3 = _mm256_srai_epi16(b3, 1);
+
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
+
+ coeff += 16;
+ t_coeff += 16;
+ }
+}
diff --git a/vpx_dsp/x86/bitdepth_conversion_avx2.h b/vpx_dsp/x86/bitdepth_conversion_avx2.h
index b9116f049..b8fd1cb58 100644
--- a/vpx_dsp/x86/bitdepth_conversion_avx2.h
+++ b/vpx_dsp/x86/bitdepth_conversion_avx2.h
@@ -27,4 +27,17 @@ static INLINE __m256i load_tran_low(const tran_low_t *a) {
#endif
}
+// Stores the sixteen 16-bit values held in |a| to |b| (unaligned stores).
+//
+// With CONFIG_VP9_HIGHBITDEPTH each value is widened to 32 bits before being
+// written: multiplying by 1 with mulhi yields the upper (sign) word and with
+// mullo the value itself, and the unpacks interleave the two into 32-bit
+// elements, written as two 256-bit stores at b and b + 8.
+// Otherwise |a| is written unchanged with a single 256-bit store.
+//
+// NOTE(review): the unpack intrinsics interleave within each 128-bit half, so
+// the high-bitdepth path writes source elements in the order 0-3, 8-11, 4-7,
+// 12-15 rather than 0-15. Presumably this mirrors load_tran_low() and/or the
+// consumers do not depend on element order -- confirm before relying on it.
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+ const __m256i a_lo = _mm256_mullo_epi16(a, one);
+ const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+ const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+ _mm256_storeu_si256((__m256i *)b, a_1);
+ _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+#else
+ _mm256_storeu_si256((__m256i *)b, a);
+#endif
+}
#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_