summary | refs | log | tree | commit | diff
path: root/vp9/encoder
diff options
context:
space:
mode:
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- vp9/encoder/arm/neon/vp9_dct_neon.c 223
-rw-r--r-- vp9/encoder/arm/neon/vp9_quantize_neon.c 102
-rw-r--r-- vp9/encoder/arm/neon/vp9_sad_neon.c 24
-rw-r--r-- vp9/encoder/arm/neon/vp9_subtract_neon.c 81
-rw-r--r-- vp9/encoder/arm/neon/vp9_variance_neon.c 227
-rw-r--r-- vp9/encoder/vp9_bitstream.c 7
-rw-r--r-- vp9/encoder/vp9_block.h 2
-rw-r--r-- vp9/encoder/vp9_encodeframe.c 262
-rw-r--r-- vp9/encoder/vp9_encodemb.c 9
-rw-r--r-- vp9/encoder/vp9_encodemb.h 2
-rw-r--r-- vp9/encoder/vp9_encoder.c 12
-rw-r--r-- vp9/encoder/vp9_encoder.h 5
-rw-r--r-- vp9/encoder/vp9_firstpass.c 86
-rw-r--r-- vp9/encoder/vp9_firstpass.h 14
-rw-r--r-- vp9/encoder/vp9_pickmode.c 7
-rw-r--r-- vp9/encoder/vp9_ratectrl.c 3
-rw-r--r-- vp9/encoder/vp9_rd.c 5
-rw-r--r-- vp9/encoder/vp9_rdopt.c 168
-rw-r--r-- vp9/encoder/vp9_speed_features.c 18
-rw-r--r-- vp9/encoder/vp9_speed_features.h 2
-rw-r--r-- vp9/encoder/vp9_svc_layercontext.c 21
-rw-r--r-- vp9/encoder/vp9_svc_layercontext.h 1
-rw-r--r-- vp9/encoder/x86/vp9_dct_avx2.c 2566
23 files changed, 1136 insertions, 2711 deletions
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
new file mode 100644
index 000000000..6c66f5d5b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+
+void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {  // Sums all 64 samples of an 8x8 block (rows `stride` elements apart); writes the total to output[0] and 0 to output[1].
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);  // row 0 seeds the eight per-column accumulators
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);  // row r of the block
+ sum = vaddq_s16(sum, input_00);  // per-column add in 16-bit lanes (no saturation)
+ }
+ {
+ const int32x4_t a = vpaddlq_s16(sum);  // pairwise widening add: 8 x s16 -> 4 x s32
+ const int64x2_t b = vpaddlq_s32(a);  // pairwise widening add: 4 x s32 -> 2 x s64
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));  // combine both halves; 32-bit lane 0 holds the total
+ output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);  // only 16-bit lane 0 of the total is stored
+ output[1] = 0;
+ }
+}
+
+void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {  // 8x8 forward DCT: rows are pre-scaled by 4, two butterfly+transpose passes run, results are halved and stored as 8 contiguous rows of 8.
+ int i;
+ // stage 1
+ int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);  // load row and scale by 4 (<< 2); same for rows 1..7
+ int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+ for (i = 0; i < 2; ++i) {  // two identical passes; the transpose at the end of each pass swaps rows and columns
+ int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+ const int16x8_t v_s0 = vaddq_s16(input_0, input_7);  // sums of mirrored rows feed the even outputs
+ const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+ const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+ const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+ const int16x8_t v_s4 = vsubq_s16(input_3, input_4);  // differences of mirrored rows feed the odd outputs
+ const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+ const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+ const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+ // fdct4(step, step);
+ int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+ int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+ int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+ int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+ // even outputs: rows 0/4 use cospi_16_64, rows 2/6 use cospi_24_64 and cospi_8_64
+ int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));  // widen to 32 bits before multiplying
+ int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);  // t2 = x2 * c24 + x3 * c8
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);  // t3 = x3 * c24 - x2 * c8
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);  // t0 = (x0 + x1) * c16
+ v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);  // t1 = (x0 - x1) * c16
+ v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);  // rounding shift right by DCT_CONST_BITS, narrowed to 16 bits
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
+ out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
+ out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
+ out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
+ }
+ // Stage 2
+ v_x0 = vsubq_s16(v_s6, v_s5);  // v_x0..v_x3 are reused from here on for the odd half
+ v_x1 = vaddq_s16(v_s6, v_s5);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x8_t ab = vcombine_s16(a, b);  // rounded (s6 - s5) * c16
+ const int16x8_t cd = vcombine_s16(c, d);  // rounded (s6 + s5) * c16
+ // Stage 3
+ v_x0 = vaddq_s16(v_s4, ab);
+ v_x1 = vsubq_s16(v_s4, ab);
+ v_x2 = vsubq_s16(v_s7, cd);
+ v_x3 = vaddq_s16(v_s7, cd);
+ }
+ // Stage 4
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);  // t0 = x3 * c4 + x0 * c28 -> row 1
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);  // t1 = x1 * c12 + x2 * c20 -> row 5
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);  // t2 = x2 * c12 - x1 * c20 -> row 3
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);  // t3 = x3 * c28 - x0 * c4 -> row 7
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
+ out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
+ out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
+ out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
+ }
+ // transpose 8x8
+ {
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ // 04 05 06 07 44 45 46 47
+ // 14 15 16 17 54 55 56 57
+ // 24 25 26 27 64 65 66 67
+ // 34 35 36 37 74 75 76 77
+ const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
+ vreinterpretq_s32_s16(out_2));  // first interleave pairs of 16-bit values as 32-bit lanes
+ const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
+ vreinterpretq_s32_s16(out_3));
+ const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
+ vreinterpretq_s32_s16(out_6));
+ const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
+ vreinterpretq_s32_s16(out_7));
+ const int16x8x2_t r01_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+ vreinterpretq_s16_s32(r13_s32.val[0]));  // then interleave single 16-bit lanes to finish the transpose
+ const int16x8x2_t r23_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+ vreinterpretq_s16_s32(r13_s32.val[1]));
+ const int16x8x2_t r45_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+ vreinterpretq_s16_s32(r57_s32.val[0]));
+ const int16x8x2_t r67_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+ vreinterpretq_s16_s32(r57_s32.val[1]));
+ input_0 = r01_s16.val[0];  // transposed rows become the input of the next pass
+ input_1 = r01_s16.val[1];
+ input_2 = r23_s16.val[0];
+ input_3 = r23_s16.val[1];
+ input_4 = r45_s16.val[0];
+ input_5 = r45_s16.val[1];
+ input_6 = r67_s16.val[0];
+ input_7 = r67_s16.val[1];
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ } // for
+ {
+ // from vp9_dct_sse2.c
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);  // arithmetic shift: -1 for negative lanes, 0 otherwise
+ const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+ input_0 = vhsubq_s16(input_0, sign_in0);  // halving subtract computes (n - sign) >> 1 in one step
+ input_1 = vhsubq_s16(input_1, sign_in1);
+ input_2 = vhsubq_s16(input_2, sign_in2);
+ input_3 = vhsubq_s16(input_3, sign_in3);
+ input_4 = vhsubq_s16(input_4, sign_in4);
+ input_5 = vhsubq_s16(input_5, sign_in5);
+ input_6 = vhsubq_s16(input_6, sign_in6);
+ input_7 = vhsubq_s16(input_7, sign_in7);
+ // store results
+ vst1q_s16(&final_output[0 * 8], input_0);  // output rows are contiguous (8 elements apart), independent of `stride`
+ vst1q_s16(&final_output[1 * 8], input_1);
+ vst1q_s16(&final_output[2 * 8], input_2);
+ vst1q_s16(&final_output[3 * 8], input_3);
+ vst1q_s16(&final_output[4 * 8], input_4);
+ vst1q_s16(&final_output[5 * 8], input_5);
+ vst1q_s16(&final_output[6 * 8], input_6);
+ vst1q_s16(&final_output[7 * 8], input_7);
+ }
+}
+
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
new file mode 100644
index 000000000..2d5ec79b3
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+
+void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,  // Quantizes `count` coefficients into qcoeff_ptr/dqcoeff_ptr and writes the end-of-block position to *eob_ptr.
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i;
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)zbin_oq_value;
+ (void)scan;
+
+ if (!skip_block) {
+ // Quantization pass: eight coefficients are processed per loop iteration.
+ // Lane 0 of the first iteration uses the index-0 (DC) parameters.
+
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_one = vdupq_n_s16(1);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);  // running per-lane maximum of (iscan + 1) over non-zero outputs
+ int16x8_t v_round = vmovq_n_s16(round_ptr[1]);  // index-1 parameters broadcast to all lanes
+ int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+ int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ // adjust for dc
+ v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+ v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+
+ for (i = 0; i < count; i += 8) {  // NOTE(review): full 8-lane loads/stores, so count is presumably a multiple of 8 — confirm with callers
+ const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // -1 for negative coefficients, 0 otherwise
+ const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round);  // saturating add of the rounding term
+ const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+ vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+ vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+ vshrn_n_s32(v_tmp_hi, 16));  // ((|coeff| + round) * quant) >> 16
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);  // all-ones where the quantized magnitude is zero
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan =
+ vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1);  // iscan + 1 where non-zero, else 0
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);  // xor then subtract the sign mask restores the sign
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+
+ vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+ v_round = vmovq_n_s16(round_ptr[1]);  // after the first group every lane uses the index-1 parameters
+ v_quant = vmovq_n_s16(quant_ptr[1]);
+ v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ }
+ {
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax_76543210),
+ vget_high_s16(v_eobmax_76543210));  // horizontal max: 8 lanes -> 4
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);  // move lanes 2,3 down to lanes 0,1
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);  // move lane 1 down to lane 0
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);  // lane 0 now holds the overall maximum
+ }
+ } else {
+ vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));  // skipped block: all outputs zero, eob 0
+ vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ *eob_ptr = 0;
+ }
+}
diff --git a/vp9/encoder/arm/neon/vp9_sad_neon.c b/vp9/encoder/arm/neon/vp9_sad_neon.c
index fe40b5452..c4cd85680 100644
--- a/vp9/encoder/arm/neon/vp9_sad_neon.c
+++ b/vp9/encoder/arm/neon/vp9_sad_neon.c
@@ -26,9 +26,8 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
vreinterpret_u32_u64(vget_high_u64(b)));
return vget_lane_u32(c, 0);
}
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_lo,
- const uint16x8_t vec_hi) {
- const uint32x4_t a = vpaddlq_u16(vaddq_u16(vec_lo, vec_hi));
+static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
+ const uint32x4_t a = vpaddlq_u16(vec_16x8);
const uint64x2_t b = vpaddlq_u32(a);
const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
vreinterpret_u32_u64(vget_high_u64(b)));
@@ -93,7 +92,7 @@ unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
vget_high_u8(vec_ref_16));
}
- return horizontal_add_16x8(vec_accum_lo, vec_accum_hi);
+ return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
@@ -112,5 +111,20 @@ unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
vget_high_u8(vec_ref));
}
- return horizontal_add_16x8(vec_accum_lo, vec_accum_hi);
+ return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+}
+
+unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,  // Returns the sum of absolute differences between an 8x8 src block and an 8x8 ref block.
+ const uint8_t *ref, int ref_stride) {
+ int i;
+ uint16x8_t vec_accum = vdupq_n_u16(0);  // eight 16-bit per-column accumulators
+
+ for (i = 0; i < 8; ++i) {
+ const uint8x8_t vec_src = vld1_u8(src);  // one 8-pixel row from each block
+ const uint8x8_t vec_ref = vld1_u8(ref);
+ src += src_stride;
+ ref += ref_stride;
+ vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);  // accum += |src - ref|, widened to 16 bits
+ }
+ return horizontal_add_16x8(vec_accum);  // reduce the eight lanes to a scalar
}
diff --git a/vp9/encoder/arm/neon/vp9_subtract_neon.c b/vp9/encoder/arm/neon/vp9_subtract_neon.c
new file mode 100644
index 000000000..b4bf567db
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_subtract_neon.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+void vp9_subtract_block_neon(int rows, int cols,  // Writes diff = src - pred as 16-bit values for a rows x cols block; each buffer has its own stride.
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint8_t *src, ptrdiff_t src_stride,
+ const uint8_t *pred, ptrdiff_t pred_stride) {
+ int r, c;
+
+ if (cols > 16) {  // wide blocks: 32 columns per inner iteration
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; c += 32) {  // NOTE(review): touches 32 columns per step, so cols is presumably a multiple of 32 here — confirm with callers
+ const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+ const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+ const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+ const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00),
+ vget_low_u8(v_pred_00));  // widening subtract; the wrapped u16 result is the signed difference once reinterpreted
+ const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00),
+ vget_high_u8(v_pred_00));
+ const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16),
+ vget_low_u8(v_pred_16));
+ const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16),
+ vget_high_u8(v_pred_16));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
+ vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+ vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+ }
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 8) {  // 9..16 columns: one 16-pixel vector per row
+ for (r = 0; r < rows; ++r) {
+ const uint8x16_t v_src = vld1q_u8(&src[0]);
+ const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+ const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src),
+ vget_low_u8(v_pred));
+ const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src),
+ vget_high_u8(v_pred));
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 4) {  // 5..8 columns: one 8-pixel vector per row
+ for (r = 0; r < rows; ++r) {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint8x8_t v_pred = vld1_u8(&pred[0]);
+ const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else {  // 4 columns or fewer: plain scalar loop
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c)
+ diff[c] = src[c] - pred[c];
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ }
+}
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
new file mode 100644
index 000000000..816fbda1f
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+
+#include "vp9/encoder/vp9_variance.h"
+
+enum { kWidth8 = 8 };  // block dimensions used by the 8x8 variance functions
+enum { kHeight8 = 8 };
+enum { kHeight8PlusOne = 9 };  // row count of the first bilinear pass: one more row than the block height
+enum { kWidth16 = 16 };  // block dimensions used by the 16x16 variance functions
+enum { kHeight16 = 16 };
+enum { kHeight16PlusOne = 17 };
+enum { kWidth32 = 32 };  // block dimensions used by the 32x32 variance functions
+enum { kHeight32 = 32 };
+enum { kHeight32PlusOne = 33 };
+enum { kPixelStepOne = 1 };  // tap spacing for the first (horizontal) bilinear pass
+enum { kAlign16 = 16 };  // alignment passed to DECLARE_ALIGNED_ARRAY for the scratch buffers
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {  // Returns the sum of the eight signed 16-bit lanes.
+ const int32x4_t a = vpaddlq_s16(v_16x8);  // pairwise widening add: 8 x s16 -> 4 x s32
+ const int64x2_t b = vpaddlq_s32(a);  // pairwise widening add: 4 x s32 -> 2 x s64
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));  // add both halves viewed as 32-bit lanes
+ return vget_lane_s32(c, 0);  // 32-bit lane 0 holds the total
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {  // Returns the sum of the four signed 32-bit lanes, truncated to 32 bits.
+ const int64x2_t b = vpaddlq_s32(v_32x4);  // pairwise widening add: 4 x s32 -> 2 x s64
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));  // add both halves viewed as 32-bit lanes
+ return vget_lane_s32(c, 0);  // only 32-bit lane 0 is returned
+}
+
+static void variance_neon_w8(const uint8_t *a, int a_stride,  // Computes the sum of differences (*sum) and sum of squared differences (*sse) of a - b over a w x h block, 8 columns at a time.
+ const uint8_t *b, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+ int16x8_t v_sum = vdupq_n_s16(0);  // NOTE(review): 16-bit lanes accumulate over the whole block — confirm they cannot overflow for the largest size used
+ int32x4_t v_sse_lo = vdupq_n_s32(0);
+ int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {  // full 8-pixel loads; every caller in this file passes w = 8, 16 or 32
+ const uint8x8_t v_a = vld1_u8(&a[j]);
+ const uint8x8_t v_b = vld1_u8(&b[j]);
+ const uint16x8_t v_diff = vsubl_u8(v_a, v_b);  // widening subtract; wrapped u16 equals the signed difference
+ const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+ v_sum = vaddq_s16(v_sum, sv_diff);
+ v_sse_lo = vmlal_s16(v_sse_lo,
+ vget_low_s16(sv_diff),
+ vget_low_s16(sv_diff));  // sse += diff * diff, accumulated in 32 bits
+ v_sse_hi = vmlal_s16(v_sse_hi,
+ vget_high_s16(sv_diff),
+ vget_high_s16(sv_diff));
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(v_sum);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,  // Outputs the sum (*sum) and sum of squares (*sse) of src - ref over an 8x8 block.
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8,
+ kHeight8, sse, sum);
+}
+
+unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,  // Returns the 8x8 variance term sse - sum^2 / 64 and stores the raw sse in *sse.
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
+ return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));  // sum is squared in 64 bits before dividing by the pixel count
+}
+
+void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,  // Outputs the sum (*sum) and sum of squares (*sse) of src - ref over a 16x16 block.
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
+ kHeight16, sse, sum);
+}
+
+unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,  // Returns the 16x16 variance term sse - sum^2 / 256 and stores the raw sse in *sse.
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
+ return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));  // sum is squared in 64 bits before dividing by the pixel count
+}
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,  // Applies a 2-tap filter to 8 pixels per row: out = round((src[x] * f0 + src[x + pixel_step] * f1) >> FILTER_BITS).
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);  // assumes both taps fit in 8 bits unsigned — TODO confirm against the filter table
+ const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+ const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);  // second tap: pixel_step elements further on
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);  // rounding shift right, narrowed back to 8 bits
+ vst1_u8(&output_ptr[0], out);
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;  // output rows are packed output_width apart
+ }
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,  // Same 2-tap filter as the w8 variant, processing each row in 16-pixel chunks across output_width.
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);  // assumes both taps fit in 8 bits unsigned — TODO confirm against the filter table
+ const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
+ unsigned int i, j;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 16) {  // callers in this file pass output_width = 16 or 32
+ const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
+ const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);  // second tap: pixel_step elements further on
+ const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);  // low 8 pixels
+ const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+ const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
+ const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);  // high 8 pixels
+ const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+ const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+ vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,  // Filters src with the 2-tap filters selected by xoffset/yoffset, then returns the 8x8 variance of the result against dst.
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);  // second-pass output (8 rows)
+ DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);  // first-pass output (9 rows)
+
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
+ kHeight8PlusOne, kWidth8,
+ BILINEAR_FILTERS_2TAP(xoffset));  // pass 1: taps one pixel apart, one extra row produced
+ var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
+ kWidth8, BILINEAR_FILTERS_2TAP(yoffset));  // pass 2: taps one row (kWidth8 elements) apart
+ return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
+}
+
+unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,  // Filters src with the 2-tap filters selected by xoffset/yoffset, then returns the 16x16 variance of the result against dst.
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16);  // second-pass output (16 rows)
+ DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);  // first-pass output (17 rows)
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
+ kHeight16PlusOne, kWidth16,
+ BILINEAR_FILTERS_2TAP(xoffset));  // pass 1: taps one pixel apart, one extra row produced
+ var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
+ kWidth16, BILINEAR_FILTERS_2TAP(yoffset));  // pass 2: taps one row (kWidth16 elements) apart
+ return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
+}
+
+void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,  // Outputs the sum (*sum) and sum of squares (*sse) of src - ref over a 32x32 block.
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32,
+ kHeight32, sse, sum);
+}
+
+unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,  // Returns the 32x32 variance term sse - sum^2 / 1024 and stores the raw sse in *sse.
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum);
+ return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32));  // sum is squared in 64 bits before dividing by the pixel count
+}
+
+unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,  // Filters src with the 2-tap filters selected by xoffset/yoffset, then returns the 32x32 variance of the result against dst.
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32);  // second-pass output (32 rows)
+ DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32);  // first-pass output (33 rows)
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
+ kHeight32PlusOne, kWidth32,
+ BILINEAR_FILTERS_2TAP(xoffset));  // pass 1: taps one pixel apart, one extra row produced
+ var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32,
+ kWidth32, BILINEAR_FILTERS_2TAP(yoffset));  // pass 2: taps one row (kWidth32 elements) apart
+ return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);
+}
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index e00c3d517..40379555a 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -889,7 +889,12 @@ static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) {
static int get_refresh_mask(VP9_COMP *cpi) {
if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
- cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) {
+ cpi->rc.is_src_frame_alt_ref &&
+ (!cpi->use_svc || // Add spatial svc base layer case here
+ (cpi->svc.number_temporal_layers == 1 &&
+ cpi->svc.spatial_layer_id == 0 &&
+ cpi->svc.layer_context[0].gold_ref_idx >=0 &&
+ cpi->oxcf.ss_play_alternate[0]))) {
// Preserve the previously existing golden frame and update the frame in
// the alt ref slot instead. This is highly specific to the use of
// alt-ref as a forward reference, and this needs to be generalized as
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index ab7991e05..ed7029ede 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -66,7 +66,7 @@ struct macroblock {
int sadperbit4;
int rddiv;
int rdmult;
- unsigned int mb_energy;
+ int mb_energy;
int mv_best_ref_index[MAX_REF_FRAMES];
unsigned int max_mv_context[MAX_REF_FRAMES];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 584bcb8f5..d7efc5981 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -293,6 +293,7 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
}
default: {
assert(0);
+ break;
}
}
}
@@ -985,6 +986,7 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
break;
default:
assert("Invalid partition type.");
+ break;
}
if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
@@ -1412,6 +1414,7 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
break;
default:
assert("Invalid partition type.");
+ break;
}
if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
@@ -1463,7 +1466,7 @@ static void rd_use_partition(VP9_COMP *cpi,
pc_tree->partitioning = partition;
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- if (bsize == BLOCK_16X16) {
+ if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
set_offsets(cpi, tile, mi_row, mi_col, bsize);
x->mb_energy = vp9_block_energy(cpi, x, bsize);
}
@@ -1590,6 +1593,7 @@ static void rd_use_partition(VP9_COMP *cpi,
break;
default:
assert(0);
+ break;
}
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -1836,7 +1840,7 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
BLOCK_SIZE max_size = BLOCK_8X8;
int bsl = mi_width_log2(BLOCK_64X64);
const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
- get_chessboard_index(cm)) % 2;
+ get_chessboard_index(cm->current_video_frame)) & 0x1;
// Trap case where we do not have a prediction.
if (search_range_ctrl &&
(left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
@@ -1880,6 +1884,60 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
*max_block_size = max_size;
}
+// TODO(jingning) refactor functions setting partition search range
+static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) {
+ int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ MODE_INFO *mi;
+ MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+ BLOCK_SIZE bs, min_size, max_size;
+
+ min_size = BLOCK_64X64;
+ max_size = BLOCK_4X4;
+
+ if (prev_mi) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ mi = prev_mi[idy * cm->mi_stride + idx];
+ bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = MIN(min_size, bs);
+ max_size = MAX(max_size, bs);
+ }
+ }
+ }
+
+ if (xd->left_available) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ mi = xd->mi[idy * cm->mi_stride - 1];
+ bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = MIN(min_size, bs);
+ max_size = MAX(max_size, bs);
+ }
+ }
+
+ if (xd->up_available) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ mi = xd->mi[idx - cm->mi_stride];
+ bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = MIN(min_size, bs);
+ max_size = MAX(max_size, bs);
+ }
+ }
+
+ if (min_size == max_size) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+
+ *min_bs = min_size;
+ *max_bs = max_size;
+}
+
static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
}
@@ -1888,13 +1946,58 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
}
+#if CONFIG_FP_MB_STATS
+const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] =
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4};
+const int num_16x16_blocks_high_lookup[BLOCK_SIZES] =
+ {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4};
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] =
+ {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120};
+const int qindex_split_threshold_lookup[BLOCK_SIZES] =
+ {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120};
+const int complexity_16x16_blocks_threshold[BLOCK_SIZES] =
+ {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6};
+
+typedef enum {
+ MV_ZERO = 0,
+ MV_LEFT = 1,
+ MV_UP = 2,
+ MV_RIGHT = 3,
+ MV_DOWN = 4,
+ MV_INVALID
+} MOTION_DIRECTION;
+
+static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
+ if (fp_byte & FPMB_MOTION_ZERO_MASK) {
+ return MV_ZERO;
+ } else if (fp_byte & FPMB_MOTION_LEFT_MASK) {
+ return MV_LEFT;
+ } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) {
+ return MV_RIGHT;
+ } else if (fp_byte & FPMB_MOTION_UP_MASK) {
+ return MV_UP;
+ } else {
+ return MV_DOWN;
+ }
+}
+
+static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
+ MOTION_DIRECTION that_mv) {
+ if (this_mv == that_mv) {
+ return 0;
+ } else {
+ return abs(this_mv - that_mv) == 2 ? 2 : 1;
+ }
+}
+#endif
+
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row,
int mi_col, BLOCK_SIZE bsize, int *rate,
- int64_t *dist, int do_recon, int64_t best_rd,
+ int64_t *dist, int64_t best_rd,
PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
@@ -1911,12 +2014,21 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
int64_t sum_rd = 0;
int do_split = bsize >= BLOCK_8X8;
int do_rect = 1;
+
// Override skipping rectangular partition operations for edge blocks
const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
const int force_vert_split = (mi_col + mi_step >= cm->mi_cols);
const int xss = x->e_mbd.plane[1].subsampling_x;
const int yss = x->e_mbd.plane[1].subsampling_y;
+ BLOCK_SIZE min_size = cpi->sf.min_partition_size;
+ BLOCK_SIZE max_size = cpi->sf.max_partition_size;
+
+#if CONFIG_FP_MB_STATS
+ unsigned int src_diff_var = UINT_MAX;
+ int none_complexity = 0;
+#endif
+
int partition_none_allowed = !force_horz_split && !force_vert_split;
int partition_horz_allowed = !force_vert_split && yss <= xss &&
bsize >= BLOCK_8X8;
@@ -1927,22 +2039,28 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
assert(num_8x8_blocks_wide_lookup[bsize] ==
num_8x8_blocks_high_lookup[bsize]);
- if (bsize == BLOCK_16X16) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+
+ if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
x->mb_energy = vp9_block_energy(cpi, x, bsize);
+
+ if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+ int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3)
+ + get_chessboard_index(cm->current_video_frame)) & 0x1;
+
+ if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+ set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
}
+
// Determine partition types in search according to the speed features.
// The threshold set here has to be of square block size.
if (cpi->sf.auto_min_max_partition_size) {
- partition_none_allowed &= (bsize <= cpi->sf.max_partition_size &&
- bsize >= cpi->sf.min_partition_size);
- partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size &&
- bsize > cpi->sf.min_partition_size) ||
+ partition_none_allowed &= (bsize <= max_size && bsize >= min_size);
+ partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) ||
force_horz_split);
- partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size &&
- bsize > cpi->sf.min_partition_size) ||
+ partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) ||
force_vert_split);
- do_split &= bsize > cpi->sf.min_partition_size;
+ do_split &= bsize > min_size;
}
if (cpi->sf.use_square_partition_only) {
partition_horz_allowed &= force_horz_split;
@@ -1962,6 +2080,65 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
}
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+ mi_row, mi_col, bsize);
+ }
+#endif
+
+#if CONFIG_FP_MB_STATS
+ // Decide whether we shall split directly and skip searching NONE by using
+ // the first pass block statistics
+ if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split &&
+ partition_none_allowed && src_diff_var > 4 &&
+ cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ // compute a complexity measure, basically measure inconsistency of motion
+ // vectors obtained from the first pass in the current block
+ for (r = mb_row; r < mb_row_end ; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+
+ MOTION_DIRECTION this_mv;
+ MOTION_DIRECTION right_mv;
+ MOTION_DIRECTION bottom_mv;
+
+ this_mv =
+ get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
+
+ // to its right
+ if (c != mb_col_end - 1) {
+ right_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + 1]);
+ none_complexity += get_motion_inconsistency(this_mv, right_mv);
+ }
+
+ // to its bottom
+ if (r != mb_row_end - 1) {
+ bottom_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
+ none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
+ }
+
+ // do not count its left and top neighbors to avoid double counting
+ }
+ }
+
+ if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
+ partition_none_allowed = 0;
+ }
+ }
+#endif
+
// PARTITION_NONE
if (partition_none_allowed) {
rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
@@ -1972,6 +2149,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
this_rate += cpi->partition_cost[pl][PARTITION_NONE];
}
sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
+
if (sum_rd < best_rd) {
int64_t stop_thresh = 4096;
int64_t stop_thresh_rd;
@@ -1993,6 +2171,52 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
do_split = 0;
do_rect = 0;
}
+
+#if CONFIG_FP_MB_STATS
+ // Check whether the first pass statistics of every 16x16 block show
+ // zero motion and a small enough first pass residue.
+ // If that is the case, check the difference variance between the
+ // current frame and the last frame. If the variance is small enough,
+ // stop further splitting in RD optimization.
+ if (cpi->use_fp_mb_stats && do_split != 0 &&
+ cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ int skip = 1;
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+ if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_MOTION_ZERO_MASK) ||
+ !(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_ERROR_SMALL_MASK)) {
+ skip = 0;
+ break;
+ }
+ }
+ if (skip == 0) {
+ break;
+ }
+ }
+ if (skip) {
+ if (src_diff_var == UINT_MAX) {
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(
+ cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize);
+ }
+ if (src_diff_var < 8) {
+ do_split = 0;
+ do_rect = 0;
+ }
+ }
+ }
+#endif
}
}
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2030,8 +2254,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, ctx);
+ pc_tree->split[i]->index = i;
rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
- subsize, &this_rate, &this_dist, i != 3,
+ subsize, &this_rate, &this_dist,
best_rd - sum_rd, pc_tree->split[i]);
if (this_rate == INT_MAX) {
@@ -2048,6 +2273,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+
if (sum_rd < best_rd) {
best_rate = sum_rate;
best_dist = sum_dist;
@@ -2160,6 +2386,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
+
// TODO(jbb): This code added so that we avoid static analysis
// warning related to the fact that best_rd isn't used after this
// point. This code should be refactored so that the duplicate
@@ -2168,7 +2395,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
*rate = best_rate;
*dist = best_dist;
- if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+ if (best_rate < INT_MAX && best_dist < INT64_MAX && pc_tree->index != 3) {
int output_enabled = (bsize == BLOCK_64X64);
// Check the projected output rate for this SB against it's target
@@ -2225,6 +2452,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
}
vp9_zero(cpi->mb.pred_mv);
+ cpi->pc_root->index = 0;
if ((sf->partition_search_type == SEARCH_PARTITION &&
sf->use_lastframe_partitioning) ||
@@ -2278,7 +2506,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
&sf->max_partition_size);
}
rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, INT64_MAX,
+ &dummy_rate, &dummy_dist, INT64_MAX,
cpi->pc_root);
} else {
if (sf->constrain_copy_partition &&
@@ -2300,7 +2528,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
&sf->max_partition_size);
}
rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, INT64_MAX, cpi->pc_root);
+ &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root);
}
}
}
@@ -2832,6 +3060,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
break;
default:
assert("Invalid partition type.");
+ break;
}
if (bsize == BLOCK_64X64 && output_enabled) {
@@ -2908,6 +3137,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
break;
default:
assert(0);
+ break;
}
}
}
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index cd0191e0a..a409d6a17 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -32,7 +32,7 @@ struct optimize_ctx {
struct encode_b_args {
MACROBLOCK *x;
struct optimize_ctx *ctx;
- unsigned char *skip;
+ int8_t *skip;
};
void vp9_subtract_block_c(int rows, int cols,
@@ -348,6 +348,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
break;
default:
assert(0);
+ break;
}
}
@@ -394,6 +395,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
break;
default:
assert(0);
+ break;
}
}
@@ -444,6 +446,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
break;
default:
assert(0);
+ break;
}
}
@@ -521,6 +524,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
break;
default:
assert(0 && "Invalid transform size");
+ break;
}
}
@@ -692,6 +696,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
break;
default:
assert(0);
+ break;
}
if (*eob)
*(args->skip) = 0;
@@ -699,7 +704,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- unsigned char *skip) {
+ int8_t *skip) {
struct encode_b_args arg = {x, NULL, skip};
encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
}
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 0b8c3d2b0..199971865 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -33,7 +33,7 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- unsigned char *skip);
+ int8_t *skip);
void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index f8d26110d..b1c5326f4 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1565,7 +1565,12 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
} else if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
- cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) {
+ cpi->rc.is_src_frame_alt_ref &&
+ (!cpi->use_svc || // Add spatial svc base layer case here
+ (cpi->svc.number_temporal_layers == 1 &&
+ cpi->svc.spatial_layer_id == 0 &&
+ cpi->svc.layer_context[0].gold_ref_idx >=0 &&
+ cpi->oxcf.ss_play_alternate[0]))) {
/* Preserve the previously existing golden frame and update the frame in
* the alt ref slot instead. This is highly specific to the current use of
* alt-ref as a forward reference, and this needs to be generalized as
@@ -1583,6 +1588,11 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
tmp = cpi->alt_fb_idx;
cpi->alt_fb_idx = cpi->gld_fb_idx;
cpi->gld_fb_idx = tmp;
+
+ if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+ cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx;
+ cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx;
+ }
} else { /* For non key/golden frames */
if (cpi->refresh_alt_ref_frame) {
int arf_idx = cpi->alt_fb_idx;
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 4b3f2ad56..a60d47321 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -232,6 +232,7 @@ typedef struct VP9EncoderConfig {
#endif
vp8e_tuning tuning;
+ vp9e_tune_content content;
} VP9EncoderConfig;
static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
@@ -551,8 +552,8 @@ static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
: 0];
}
-static INLINE int get_chessboard_index(const VP9_COMMON *cm) {
- return cm->current_video_frame % 2;
+static INLINE int get_chessboard_index(const int frame_index) {
+ return frame_index & 0x1;
}
#ifdef __cplusplus
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 3c41f5f73..627de47b1 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -225,26 +225,6 @@ static void subtract_stats(FIRSTPASS_STATS *section,
section->duration -= frame->duration;
}
-static void avg_stats(FIRSTPASS_STATS *section) {
- if (section->count < 1.0)
- return;
-
- section->intra_error /= section->count;
- section->coded_error /= section->count;
- section->sr_coded_error /= section->count;
- section->pcnt_inter /= section->count;
- section->pcnt_second_ref /= section->count;
- section->pcnt_neutral /= section->count;
- section->pcnt_motion /= section->count;
- section->MVr /= section->count;
- section->mvr_abs /= section->count;
- section->MVc /= section->count;
- section->mvc_abs /= section->count;
- section->MVrv /= section->count;
- section->MVcv /= section->count;
- section->mv_in_out_count /= section->count;
- section->duration /= section->count;
-}
// Calculate a modified Error used in distributing bits between easier and
// harder frames.
@@ -553,6 +533,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
double error_weight = 1.0;
const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+#if CONFIG_FP_MB_STATS
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
vp9_clear_system_state();
@@ -599,7 +582,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- // TODO(pengchong): store some related block statistics here
+ // initialization
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
}
#endif
@@ -700,6 +684,20 @@ void vp9_first_pass(VP9_COMP *cpi) {
// Start by assuming that intra mode is best.
best_ref_mv.as_int = 0;
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ // intra prediction statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+ if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+ } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+ }
+ }
+#endif
+
if (motion_error <= this_error) {
// Keep a count of cases where the inter and intra were very close
// and very low. This helps with scene cut detection for example in
@@ -730,13 +728,50 @@ void vp9_first_pass(VP9_COMP *cpi) {
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- // TODO(pengchong): save some related block statistics here
+ // inter prediction statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+ if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_LARGE_MASK;
+ } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_SMALL_MASK;
+ }
}
#endif
if (mv.as_int) {
++mvcount;
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] &=
+ ~FPMB_MOTION_ZERO_MASK;
+ // check estimated motion direction
+ if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
+ // right direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_RIGHT_MASK;
+ } else if (mv.as_mv.row < 0 &&
+ abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
+ // up direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_UP_MASK;
+ } else if (mv.as_mv.col < 0 &&
+ abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
+ // left direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_LEFT_MASK;
+ } else {
+ // down direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_DOWN_MASK;
+ }
+ }
+#endif
+
// Non-zero vector, was it different from the last non zero vector?
if (mv.as_int != lastmv_as_int)
++new_mv_count;
@@ -2065,9 +2100,11 @@ void configure_buffer_updates(VP9_COMP *cpi) {
break;
default:
assert(0);
+ break;
}
if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
- cpi->refresh_golden_frame = 0;
+ if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0)
+ cpi->refresh_golden_frame = 0;
if (cpi->alt_ref_source == NULL)
cpi->refresh_alt_ref_frame = 0;
}
@@ -2167,6 +2204,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
if (is_spatial_svc) {
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = (cm->frame_type == KEY_FRAME);
+ if (lc->is_key_frame)
+ cpi->ref_frame_flags &=
+ (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
} else {
cm->frame_type = INTER_FRAME;
lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 33a795f26..bf8c9fd96 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -19,6 +19,20 @@ extern "C" {
#endif
#if CONFIG_FP_MB_STATS
+
+#define FPMB_DCINTRA_MASK 0x01
+
+#define FPMB_MOTION_ZERO_MASK 0x02
+#define FPMB_MOTION_LEFT_MASK 0x04
+#define FPMB_MOTION_RIGHT_MASK 0x08
+#define FPMB_MOTION_UP_MASK 0x10
+#define FPMB_MOTION_DOWN_MASK 0x20
+
+#define FPMB_ERROR_SMALL_MASK 0x40
+#define FPMB_ERROR_LARGE_MASK 0x80
+#define FPMB_ERROR_SMALL_TH 2000
+#define FPMB_ERROR_LARGE_TH 48000
+
typedef struct {
uint8_t *mb_stats_start;
uint8_t *mb_stats_end;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 30a0e9d0d..7a1600155 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -394,7 +394,8 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
INTERP_FILTER filter_ref = cm->interp_filter;
int bsl = mi_width_log2(bsize);
const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
- (((mi_row + mi_col) >> bsl) + get_chessboard_index(cm)) % 2 : 0;
+ (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
int const_motion[MAX_REF_FRAMES] = { 0 };
int bh = num_4x4_blocks_high_lookup[bsize] << 2;
int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
@@ -409,6 +410,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
PRED_BUFFER *this_mode_pred = NULL;
int i;
+ // CTX is used by the temporal denoiser which is currently being developed.
+ // TODO(jbb): when temporal denoiser is finished and in the default build
+ // remove the following line.
+ (void) ctx;
if (cpi->sf.reuse_inter_pred_sby) {
for (i = 0; i < 3; i++) {
tmp[i].data = &pred_buf[pixels_in_block * i];
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 1adbad9cf..1a479f112 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1238,7 +1238,8 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1;
- cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
+ cpi->ref_frame_flags &=
+ (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
}
if (cpi->pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index c149c6136..a9cff1ee0 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -313,8 +313,8 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
int d_q10, r_q10;
const uint64_t xsq_q10_64 =
((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
- const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ?
- MAX_XSQ_Q10 : (int)xsq_q10_64;
+ const int xsq_q10 = xsq_q10_64 > (uint64_t)MAX_XSQ_Q10 ?
+ (int)MAX_XSQ_Q10 : (int)xsq_q10_64;
model_rd_norm(xsq_q10, &r_q10, &d_q10);
*rate = (n * r_q10 + 2) >> 2;
*dist = (var * (int64_t)d_q10 + 512) >> 10;
@@ -357,6 +357,7 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
break;
default:
assert(0 && "Invalid transform size.");
+ break;
}
}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f65ac7b2b..5caafd370 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1988,6 +1988,86 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd,
}
}
+static void rd_encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int *rate2,
+ int64_t *distortion, int64_t *distortion_uv,
+ int *disable_skip) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
+ unsigned int var, sse;
+ // Skipping threshold for ac.
+ unsigned int thresh_ac;
+ // Skipping threshold for dc
+ unsigned int thresh_dc;
+
+ var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, &sse);
+
+ if (x->encode_breakout > 0) {
+ // Set a maximum for threshold to avoid big PSNR loss in low bitrate
+ // case. Use extreme low threshold for static frames to limit skipping.
+ const unsigned int max_thresh = (cpi->allow_encode_breakout ==
+ ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
+ // The encode_breakout input
+ const unsigned int min_thresh =
+ MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+
+ // Calculate threshold according to dequant value.
+ thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+ thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
+
+ // Adjust threshold according to partition size.
+ thresh_ac >>= 8 - (b_width_log2(bsize) +
+ b_height_log2(bsize));
+ thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+ } else {
+ thresh_ac = 0;
+ thresh_dc = 0;
+ }
+
+ // Y skipping condition checking
+ if (sse < thresh_ac || sse == 0) {
+ // dc skipping checking
+ if ((sse - var) < thresh_dc || sse == var) {
+ unsigned int sse_u, sse_v;
+ unsigned int var_u, var_v;
+
+ var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
+ x->plane[1].src.stride,
+ xd->plane[1].dst.buf,
+ xd->plane[1].dst.stride, &sse_u);
+
+ // U skipping condition checking
+ if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
+ (sse_u - var_u < thresh_dc || sse_u == var_u)) {
+ var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
+ x->plane[2].src.stride,
+ xd->plane[2].dst.buf,
+ xd->plane[2].dst.stride, &sse_v);
+
+ // V skipping condition checking
+ if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
+ (sse_v - var_v < thresh_dc || sse_v == var_v)) {
+ x->skip = 1;
+
+ // The cost of skip bit needs to be added.
+ *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+
+ // Scaling factor for SSE from spatial domain to frequency domain
+ // is 16. Adjust distortion accordingly.
+ *distortion_uv = (sse_u + sse_v) << 4;
+ *distortion = (sse << 4) + *distortion_uv;
+
+ *disable_skip = 1;
+ }
+ }
+ }
+ }
+}
+
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize,
int64_t txfm_cache[],
@@ -2025,7 +2105,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int bsl = mi_width_log2_lookup[bsize];
int pred_filter_search = cpi->sf.cb_pred_filter_search ?
- (((mi_row + mi_col) >> bsl)) & 0x01 : 0;
+ (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
if (pred_filter_search) {
INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
@@ -2230,81 +2311,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
*rate2 += vp9_get_switchable_rate(cpi);
if (!is_comp_pred) {
- if (cpi->allow_encode_breakout) {
- const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
- const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
- unsigned int var, sse;
- // Skipping threshold for ac.
- unsigned int thresh_ac;
- // Skipping threshold for dc
- unsigned int thresh_dc;
-
- var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
- xd->plane[0].dst.buf,
- xd->plane[0].dst.stride, &sse);
-
- if (x->encode_breakout > 0) {
- // Set a maximum for threshold to avoid big PSNR loss in low bitrate
- // case. Use extreme low threshold for static frames to limit skipping.
- const unsigned int max_thresh = (cpi->allow_encode_breakout ==
- ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
- // The encode_breakout input
- const unsigned int min_thresh =
- MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
-
- // Calculate threshold according to dequant value.
- thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
- thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
-
- // Adjust threshold according to partition size.
- thresh_ac >>= 8 - (b_width_log2(bsize) +
- b_height_log2(bsize));
- thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
- } else {
- thresh_ac = 0;
- thresh_dc = 0;
- }
-
- // Y skipping condition checking
- if (sse < thresh_ac || sse == 0) {
- // dc skipping checking
- if ((sse - var) < thresh_dc || sse == var) {
- unsigned int sse_u, sse_v;
- unsigned int var_u, var_v;
-
- var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
- x->plane[1].src.stride,
- xd->plane[1].dst.buf,
- xd->plane[1].dst.stride, &sse_u);
-
- // U skipping condition checking
- if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
- (sse_u - var_u < thresh_dc || sse_u == var_u)) {
- var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
- x->plane[2].src.stride,
- xd->plane[2].dst.buf,
- xd->plane[2].dst.stride, &sse_v);
-
- // V skipping condition checking
- if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
- (sse_v - var_v < thresh_dc || sse_v == var_v)) {
- x->skip = 1;
-
- // The cost of skip bit needs to be added.
- *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
-
- // Scaling factor for SSE from spatial domain to frequency domain
- // is 16. Adjust distortion accordingly.
- *distortion_uv = (sse_u + sse_v) << 4;
- *distortion = (sse << 4) + *distortion_uv;
-
- *disable_skip = 1;
- this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
- }
- }
- }
- }
- }
+ if (cpi->allow_encode_breakout)
+ rd_encode_breakout_test(cpi, x, bsize, rate2, distortion, distortion_uv,
+ disable_skip);
}
if (!x->skip) {
@@ -2510,9 +2519,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
x->pred_mv_sad[ref_frame] = INT_MAX;
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, tile,
- ref_frame, bsize, mi_row, mi_col,
- frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+ setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -2618,6 +2626,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
case NONE:
case MAX_REF_FRAMES:
assert(0 && "Invalid Reference frame");
+ break;
}
}
if (mode_skip_mask & (1 << mode_index))
@@ -3215,6 +3224,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
case NONE:
case MAX_REF_FRAMES:
assert(0 && "Invalid Reference frame");
+ break;
}
}
if (mode_skip_mask & (1 << ref_index))
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 7315dd454..e770f33e9 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -110,10 +110,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
if (speed >= 3) {
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
: USE_LARGESTALL;
- if (MIN(cm->width, cm->height) >= 720)
+ if (MIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = DISABLE_ALL_SPLIT;
- else
+ sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1;
+ } else {
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ }
sf->adaptive_pred_interp_filter = 0;
sf->cb_pred_filter_search = 1;
@@ -158,7 +160,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
}
static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
- int speed) {
+ int speed, vp9e_tune_content content) {
VP9_COMMON *const cm = &cpi->common;
const int frames_since_key =
cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key;
@@ -273,6 +275,13 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
}
if (speed >= 6) {
+ if (content == VP9E_CONTENT_SCREEN) {
+ int i;
+ // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ sf->inter_mode_mask[i] = INTER_ALL;
+ }
+
// Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
sf->search_type_check_frequency = 50;
@@ -334,6 +343,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
sf->cb_pred_filter_search = 0;
+ sf->cb_partition_search = 0;
sf->use_quant_fp = 0;
sf->reference_masking = 0;
sf->partition_search_type = SEARCH_PARTITION;
@@ -389,7 +399,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
set_good_speed_feature(cpi, cm, sf, oxcf->speed);
break;
case REALTIME:
- set_rt_speed_feature(cpi, sf, oxcf->speed);
+ set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
break;
}
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 929acaf3e..de731cee1 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -286,6 +286,8 @@ typedef struct SPEED_FEATURES {
// Chessboard pattern prediction filter type search
int cb_pred_filter_search;
+ int cb_partition_search;
+
// Fast quantization process path
int use_quant_fp;
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index f8d1a83c3..0e921be8c 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -35,6 +35,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
RATE_CONTROL *const lrc = &lc->rc;
int i;
lc->current_video_frame_in_layer = 0;
+ lc->layer_size = 0;
lrc->ni_av_qi = oxcf->worst_allowed_q;
lrc->total_actual_bits = 0;
lrc->total_target_vs_actual = 0;
@@ -48,7 +49,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
lrc->rate_correction_factors[i] = 1.0;
}
- lc->layer_size = 0;
if (svc->number_temporal_layers > 1) {
lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
@@ -66,12 +66,17 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
lc->alt_ref_idx = alt_ref_idx++;
else
lc->alt_ref_idx = -1;
+ lc->gold_ref_idx = -1;
}
lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms),
lc->target_bandwidth, 1000);
lrc->bits_off_target = lrc->buffer_level;
}
+
+ // Still have extra buffer for base layer golden frame
+ if (svc->number_spatial_layers > 1 && alt_ref_idx < REF_FRAMES)
+ svc->layer_context[0].gold_ref_idx = alt_ref_idx;
}
// Update the layer context from a change_config() call.
@@ -266,21 +271,25 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) {
layer_param = &buf->svc_params[layer_id];
cpi->svc.spatial_layer_id = layer_param->spatial_layer;
cpi->svc.temporal_layer_id = layer_param->temporal_layer;
+ cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+ lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
if (cpi->svc.spatial_layer_id < 1)
- cpi->gld_fb_idx = cpi->lst_fb_idx;
+ cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ?
+ lc->gold_ref_idx : cpi->lst_fb_idx;
else
cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1;
- lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
-
if (lc->current_video_frame_in_layer == 0) {
- if (cpi->svc.spatial_layer_id >= 2)
+ if (cpi->svc.spatial_layer_id >= 2) {
cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
- else
+ } else {
cpi->alt_fb_idx = cpi->lst_fb_idx;
+ cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
+ }
} else {
if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]) {
cpi->alt_fb_idx = lc->alt_ref_idx;
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 7b533e467..801449b6f 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -31,6 +31,7 @@ typedef struct {
vpx_svc_parameters_t svc_params_received;
struct lookahead_entry *alt_ref_source;
int alt_ref_idx;
+ int gold_ref_idx;
int has_alt_frame;
size_t layer_size;
} LAYER_CONTEXT;
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
index b5269ed03..3a19f5274 100644
--- a/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -12,2572 +12,6 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
-void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- const __m128i kOne = _mm_set1_epi16(1);
- __m128i in0, in1, in2, in3;
- // Load inputs.
- {
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
- // x = x << 4
- in0 = _mm_slli_epi16(in0, 4);
- in1 = _mm_slli_epi16(in1, 4);
- in2 = _mm_slli_epi16(in2, 4);
- in3 = _mm_slli_epi16(in3, 4);
- // if (i == 0 && input[0]) input[0] += 1;
- {
- // The mask will only contain whether the first value is zero, all
- // other comparison will fail as something shifted by 4 (above << 4)
- // can never be equal to one. To increment in the non-zero case, we
- // add the mask and one for the first element:
- // - if zero, mask = -1, v = v - 1 + 1 = v
- // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
- __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
- in0 = _mm_add_epi16(in0, mask);
- in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
- }
- }
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // Transform 1/2: Add/subtract
- const __m128i r0 = _mm_add_epi16(in0, in3);
- const __m128i r1 = _mm_add_epi16(in1, in2);
- const __m128i r2 = _mm_sub_epi16(in1, in2);
- const __m128i r3 = _mm_sub_epi16(in0, in3);
- // Transform 1/2: Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- // Combine and transpose
- const __m128i res0 = _mm_packs_epi32(w0, w2);
- const __m128i res1 = _mm_packs_epi32(w4, w6);
- // 00 01 02 03 20 21 22 23
- // 10 11 12 13 30 31 32 33
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1
- // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3
- if (0 == pass) {
- // Extract values in the high part for second pass as transform code
- // only uses the first four values.
- in1 = _mm_unpackhi_epi64(in0, in0);
- in3 = _mm_unpackhi_epi64(in2, in2);
- } else {
- // Post-condition output and store it (v + 1) >> 2, taking advantage
- // of the fact 1/3 are stored just after 0/2.
- __m128i out01 = _mm_add_epi16(in0, kOne);
- __m128i out23 = _mm_add_epi16(in2, kOne);
- out01 = _mm_srai_epi16(out01, 2);
- out23 = _mm_srai_epi16(out23, 2);
- _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
- _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
- }
- }
-}
-
-static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in,
- int stride) {
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- __m128i mask;
-
- in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
-
- in[0] = _mm_slli_epi16(in[0], 4);
- in[1] = _mm_slli_epi16(in[1], 4);
- in[2] = _mm_slli_epi16(in[2], 4);
- in[3] = _mm_slli_epi16(in[3], 4);
-
- mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
- in[0] = _mm_add_epi16(in[0], mask);
- in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
-}
-
-static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) {
- const __m128i kOne = _mm_set1_epi16(1);
- __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
- __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
- __m128i out01 = _mm_add_epi16(in01, kOne);
- __m128i out23 = _mm_add_epi16(in23, kOne);
- out01 = _mm_srai_epi16(out01, 2);
- out23 = _mm_srai_epi16(out23, 2);
- _mm_store_si128((__m128i *)(output + 0 * 8), out01);
- _mm_store_si128((__m128i *)(output + 1 * 8), out23);
-}
-
-static INLINE void transpose_4x4_avx2(__m128i *res) {
- // Combine and transpose
- // 00 01 02 03 20 21 22 23
- // 10 11 12 13 30 31 32 33
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
- res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
-
- // 00 10 20 30 01 11 21 31
- // 02 12 22 32 03 13 23 33
- // only use the first 4 16-bit integers
- res[1] = _mm_unpackhi_epi64(res[0], res[0]);
- res[3] = _mm_unpackhi_epi64(res[2], res[2]);
-}
-
-void fdct4_avx2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u[4], v[4];
- u[0]=_mm_unpacklo_epi16(in[0], in[1]);
- u[1]=_mm_unpacklo_epi16(in[3], in[2]);
-
- v[0] = _mm_add_epi16(u[0], u[1]);
- v[1] = _mm_sub_epi16(u[0], u[1]);
-
- u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
- u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
- u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
- u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[1] = _mm_packs_epi32(u[2], u[3]);
- transpose_4x4_avx2(in);
-}
-
-void fadst4_avx2(__m128i *in) {
- const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
- const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
- const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
- const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
- __m128i in7 = _mm_add_epi16(in[0], in[1]);
-
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpacklo_epi16(in[2], in[3]);
- u[2] = _mm_unpacklo_epi16(in7, kZero);
- u[3] = _mm_unpacklo_epi16(in[2], kZero);
- u[4] = _mm_unpacklo_epi16(in[3], kZero);
-
- v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
- v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
- v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
- v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
- v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
- v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
- v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
-
- u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = _mm_sub_epi32(v[2], v[6]);
- u[2] = _mm_add_epi32(v[3], v[4]);
- u[3] = _mm_sub_epi32(u[2], u[0]);
- u[4] = _mm_slli_epi32(v[5], 2);
- u[5] = _mm_sub_epi32(u[4], v[5]);
- u[6] = _mm_add_epi32(u[3], u[5]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[2]);
- in[1] = _mm_packs_epi32(u[1], u[3]);
- transpose_4x4_avx2(in);
-}
-
-void vp9_fht4x4_avx2(const int16_t *input, int16_t *output,
- int stride, int tx_type) {
- __m128i in[4];
-
- switch (tx_type) {
- case DCT_DCT:
- vp9_fdct4x4_avx2(input, output, stride);
- break;
- case ADST_DCT:
- load_buffer_4x4_avx2(input, in, stride);
- fadst4_avx2(in);
- fdct4_avx2(in);
- write_buffer_4x4_avx2(output, in);
- break;
- case DCT_ADST:
- load_buffer_4x4_avx2(input, in, stride);
- fdct4_avx2(in);
- fadst4_avx2(in);
- write_buffer_4x4_avx2(output, in);
- break;
- case ADST_ADST:
- load_buffer_4x4_avx2(input, in, stride);
- fadst4_avx2(in);
- fadst4_avx2(in);
- write_buffer_4x4_avx2(output, in);
- break;
- default:
- assert(0);
- break;
- }
-}
-
-void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
- int pass;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- // Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
- // Pre-condition input (shift by two)
- in0 = _mm_slli_epi16(in0, 2);
- in1 = _mm_slli_epi16(in1, 2);
- in2 = _mm_slli_epi16(in2, 2);
- in3 = _mm_slli_epi16(in3, 2);
- in4 = _mm_slli_epi16(in4, 2);
- in5 = _mm_slli_epi16(in5, 2);
- in6 = _mm_slli_epi16(in6, 2);
- in7 = _mm_slli_epi16(in7, 2);
-
- // We do two passes, first the columns, then the rows. The results of the
- // first pass are transposed so that the same column code can be reused. The
- // results of the second pass are also transposed so that the rows (processed
- // as columns) are put back in row positions.
- for (pass = 0; pass < 2; pass++) {
- // To store results of each pass before the transpose.
- __m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/subtract
- const __m128i q0 = _mm_add_epi16(in0, in7);
- const __m128i q1 = _mm_add_epi16(in1, in6);
- const __m128i q2 = _mm_add_epi16(in2, in5);
- const __m128i q3 = _mm_add_epi16(in3, in4);
- const __m128i q4 = _mm_sub_epi16(in3, in4);
- const __m128i q5 = _mm_sub_epi16(in2, in5);
- const __m128i q6 = _mm_sub_epi16(in1, in6);
- const __m128i q7 = _mm_sub_epi16(in0, in7);
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = _mm_add_epi16(q0, q3);
- const __m128i r1 = _mm_add_epi16(q1, q2);
- const __m128i r2 = _mm_sub_epi16(q1, q2);
- const __m128i r3 = _mm_sub_epi16(q0, q3);
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res0 = _mm_packs_epi32(w0, w1);
- res4 = _mm_packs_epi32(w2, w3);
- res2 = _mm_packs_epi32(w4, w5);
- res6 = _mm_packs_epi32(w6, w7);
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/subtract
- const __m128i x0 = _mm_add_epi16(q4, r0);
- const __m128i x1 = _mm_sub_epi16(q4, r0);
- const __m128i x2 = _mm_sub_epi16(q7, r1);
- const __m128i x3 = _mm_add_epi16(q7, r1);
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res1 = _mm_packs_epi32(w0, w1);
- res7 = _mm_packs_epi32(w2, w3);
- res5 = _mm_packs_epi32(w4, w5);
- res3 = _mm_packs_epi32(w6, w7);
- }
- // Transpose the 8x8.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 54 54 55 55 56 56 57 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 21 36
- // 44 54 64 74 45 55 61 76
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
- }
- // Post-condition output and store it
- {
- // Post-condition (division by two)
- // division of two 16 bits signed numbers using shifts
- // n / 2 = (n - (n >> 15)) >> 1
- const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
- const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
- const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
- const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
- const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
- const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
- const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
- const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
- in0 = _mm_sub_epi16(in0, sign_in0);
- in1 = _mm_sub_epi16(in1, sign_in1);
- in2 = _mm_sub_epi16(in2, sign_in2);
- in3 = _mm_sub_epi16(in3, sign_in3);
- in4 = _mm_sub_epi16(in4, sign_in4);
- in5 = _mm_sub_epi16(in5, sign_in5);
- in6 = _mm_sub_epi16(in6, sign_in6);
- in7 = _mm_sub_epi16(in7, sign_in7);
- in0 = _mm_srai_epi16(in0, 1);
- in1 = _mm_srai_epi16(in1, 1);
- in2 = _mm_srai_epi16(in2, 1);
- in3 = _mm_srai_epi16(in3, 1);
- in4 = _mm_srai_epi16(in4, 1);
- in5 = _mm_srai_epi16(in5, 1);
- in6 = _mm_srai_epi16(in6, 1);
- in7 = _mm_srai_epi16(in7, 1);
- // store results
- _mm_store_si128((__m128i *)(output + 0 * 8), in0);
- _mm_store_si128((__m128i *)(output + 1 * 8), in1);
- _mm_store_si128((__m128i *)(output + 2 * 8), in2);
- _mm_store_si128((__m128i *)(output + 3 * 8), in3);
- _mm_store_si128((__m128i *)(output + 4 * 8), in4);
- _mm_store_si128((__m128i *)(output + 5 * 8), in5);
- _mm_store_si128((__m128i *)(output + 6 * 8), in6);
- _mm_store_si128((__m128i *)(output + 7 * 8), in7);
- }
-}
-
-// load 8x8 array
-static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
- int stride) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- in[0] = _mm_slli_epi16(in[0], 2);
- in[1] = _mm_slli_epi16(in[1], 2);
- in[2] = _mm_slli_epi16(in[2], 2);
- in[3] = _mm_slli_epi16(in[3], 2);
- in[4] = _mm_slli_epi16(in[4], 2);
- in[5] = _mm_slli_epi16(in[5], 2);
- in[6] = _mm_slli_epi16(in[6], 2);
- in[7] = _mm_slli_epi16(in[7], 2);
-}
-
-// right shift and rounding
-static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
- const __m128i kOne = _mm_set1_epi16(1);
- const int bit_m02 = bit - 2;
- __m128i sign0 = _mm_srai_epi16(res[0], 15);
- __m128i sign1 = _mm_srai_epi16(res[1], 15);
- __m128i sign2 = _mm_srai_epi16(res[2], 15);
- __m128i sign3 = _mm_srai_epi16(res[3], 15);
- __m128i sign4 = _mm_srai_epi16(res[4], 15);
- __m128i sign5 = _mm_srai_epi16(res[5], 15);
- __m128i sign6 = _mm_srai_epi16(res[6], 15);
- __m128i sign7 = _mm_srai_epi16(res[7], 15);
-
- if (bit_m02 >= 0) {
- __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
- res[0] = _mm_add_epi16(res[0], k_const_rounding);
- res[1] = _mm_add_epi16(res[1], k_const_rounding);
- res[2] = _mm_add_epi16(res[2], k_const_rounding);
- res[3] = _mm_add_epi16(res[3], k_const_rounding);
- res[4] = _mm_add_epi16(res[4], k_const_rounding);
- res[5] = _mm_add_epi16(res[5], k_const_rounding);
- res[6] = _mm_add_epi16(res[6], k_const_rounding);
- res[7] = _mm_add_epi16(res[7], k_const_rounding);
- }
-
- res[0] = _mm_sub_epi16(res[0], sign0);
- res[1] = _mm_sub_epi16(res[1], sign1);
- res[2] = _mm_sub_epi16(res[2], sign2);
- res[3] = _mm_sub_epi16(res[3], sign3);
- res[4] = _mm_sub_epi16(res[4], sign4);
- res[5] = _mm_sub_epi16(res[5], sign5);
- res[6] = _mm_sub_epi16(res[6], sign6);
- res[7] = _mm_sub_epi16(res[7], sign7);
-
- res[0] = _mm_srai_epi16(res[0], bit);
- res[1] = _mm_srai_epi16(res[1], bit);
- res[2] = _mm_srai_epi16(res[2], bit);
- res[3] = _mm_srai_epi16(res[3], bit);
- res[4] = _mm_srai_epi16(res[4], bit);
- res[5] = _mm_srai_epi16(res[5], bit);
- res[6] = _mm_srai_epi16(res[6], bit);
- res[7] = _mm_srai_epi16(res[7], bit);
-}
-
-// write 8x8 array
-static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res, int stride) {
- _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
- _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
- _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
- _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
- _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
- _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
- _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
- _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
-}
-
-// perform in-place transpose
-static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
-}
-
-void fdct8_avx2(__m128i *in) {
- // constants
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u0, u1, u2, u3, u4, u5, u6, u7;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-
- // stage 1
- s0 = _mm_add_epi16(in[0], in[7]);
- s1 = _mm_add_epi16(in[1], in[6]);
- s2 = _mm_add_epi16(in[2], in[5]);
- s3 = _mm_add_epi16(in[3], in[4]);
- s4 = _mm_sub_epi16(in[3], in[4]);
- s5 = _mm_sub_epi16(in[2], in[5]);
- s6 = _mm_sub_epi16(in[1], in[6]);
- s7 = _mm_sub_epi16(in[0], in[7]);
-
- u0 = _mm_add_epi16(s0, s3);
- u1 = _mm_add_epi16(s1, s2);
- u2 = _mm_sub_epi16(s1, s2);
- u3 = _mm_sub_epi16(s0, s3);
- // interleave and perform butterfly multiplication/addition
- v0 = _mm_unpacklo_epi16(u0, u1);
- v1 = _mm_unpackhi_epi16(u0, u1);
- v2 = _mm_unpacklo_epi16(u2, u3);
- v3 = _mm_unpackhi_epi16(u2, u3);
-
- u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
- u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
- u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
- u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
- u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
- u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
- u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
- u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
-
- // shift and rounding
- v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u0, u1);
- in[2] = _mm_packs_epi32(u4, u5);
- in[4] = _mm_packs_epi32(u2, u3);
- in[6] = _mm_packs_epi32(u6, u7);
-
- // stage 2
- // interleave and perform butterfly multiplication/addition
- u0 = _mm_unpacklo_epi16(s6, s5);
- u1 = _mm_unpackhi_epi16(s6, s5);
- v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-
- // shift and rounding
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-
- u0 = _mm_packs_epi32(v0, v1);
- u1 = _mm_packs_epi32(v2, v3);
-
- // stage 3
- s0 = _mm_add_epi16(s4, u0);
- s1 = _mm_sub_epi16(s4, u0);
- s2 = _mm_sub_epi16(s7, u1);
- s3 = _mm_add_epi16(s7, u1);
-
- // stage 4
- u0 = _mm_unpacklo_epi16(s0, s3);
- u1 = _mm_unpackhi_epi16(s0, s3);
- u2 = _mm_unpacklo_epi16(s1, s2);
- u3 = _mm_unpackhi_epi16(s1, s2);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
- v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
- v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
- v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
- v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
- v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
- v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
- v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
-
- // shift and rounding
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- in[1] = _mm_packs_epi32(v0, v1);
- in[3] = _mm_packs_epi32(v4, v5);
- in[5] = _mm_packs_epi32(v2, v3);
- in[7] = _mm_packs_epi32(v6, v7);
-
- // transpose
- array_transpose_8x8_avx2(in, in);
-}
-
-void fadst8_avx2(__m128i *in) {
- // Constants
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
- // properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
-
- // column transformation
- // stage 1
- // interleave and multiply/add into 32-bit integer
- s0 = _mm_unpacklo_epi16(in0, in1);
- s1 = _mm_unpackhi_epi16(in0, in1);
- s2 = _mm_unpacklo_epi16(in2, in3);
- s3 = _mm_unpackhi_epi16(in2, in3);
- s4 = _mm_unpacklo_epi16(in4, in5);
- s5 = _mm_unpackhi_epi16(in4, in5);
- s6 = _mm_unpacklo_epi16(in6, in7);
- s7 = _mm_unpackhi_epi16(in6, in7);
-
- u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
- u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
- u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
- u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
- u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
- u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
- u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
- u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
- u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
- u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
- u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
- u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
- u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
- u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
- u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
- u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
- // addition
- w0 = _mm_add_epi32(u0, u8);
- w1 = _mm_add_epi32(u1, u9);
- w2 = _mm_add_epi32(u2, u10);
- w3 = _mm_add_epi32(u3, u11);
- w4 = _mm_add_epi32(u4, u12);
- w5 = _mm_add_epi32(u5, u13);
- w6 = _mm_add_epi32(u6, u14);
- w7 = _mm_add_epi32(u7, u15);
- w8 = _mm_sub_epi32(u0, u8);
- w9 = _mm_sub_epi32(u1, u9);
- w10 = _mm_sub_epi32(u2, u10);
- w11 = _mm_sub_epi32(u3, u11);
- w12 = _mm_sub_epi32(u4, u12);
- w13 = _mm_sub_epi32(u5, u13);
- w14 = _mm_sub_epi32(u6, u14);
- w15 = _mm_sub_epi32(u7, u15);
-
- // shift and rounding
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
- v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
- v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
- v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
- v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
- v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
- v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
- v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
- v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
- u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
- u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
- u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
- u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
- u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
- u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
- u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
- // back to 16-bit and pack 8 integers into __m128i
- in[0] = _mm_packs_epi32(u0, u1);
- in[1] = _mm_packs_epi32(u2, u3);
- in[2] = _mm_packs_epi32(u4, u5);
- in[3] = _mm_packs_epi32(u6, u7);
- in[4] = _mm_packs_epi32(u8, u9);
- in[5] = _mm_packs_epi32(u10, u11);
- in[6] = _mm_packs_epi32(u12, u13);
- in[7] = _mm_packs_epi32(u14, u15);
-
- // stage 2
- s0 = _mm_add_epi16(in[0], in[2]);
- s1 = _mm_add_epi16(in[1], in[3]);
- s2 = _mm_sub_epi16(in[0], in[2]);
- s3 = _mm_sub_epi16(in[1], in[3]);
- u0 = _mm_unpacklo_epi16(in[4], in[5]);
- u1 = _mm_unpackhi_epi16(in[4], in[5]);
- u2 = _mm_unpacklo_epi16(in[6], in[7]);
- u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
- v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
- v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
- v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
- v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
- v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
- v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
- v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
- w0 = _mm_add_epi32(v0, v4);
- w1 = _mm_add_epi32(v1, v5);
- w2 = _mm_add_epi32(v2, v6);
- w3 = _mm_add_epi32(v3, v7);
- w4 = _mm_sub_epi32(v0, v4);
- w5 = _mm_sub_epi32(v1, v5);
- w6 = _mm_sub_epi32(v2, v6);
- w7 = _mm_sub_epi32(v3, v7);
-
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- // back to 16-bit intergers
- s4 = _mm_packs_epi32(u0, u1);
- s5 = _mm_packs_epi32(u2, u3);
- s6 = _mm_packs_epi32(u4, u5);
- s7 = _mm_packs_epi32(u6, u7);
-
- // stage 3
- u0 = _mm_unpacklo_epi16(s2, s3);
- u1 = _mm_unpackhi_epi16(s2, s3);
- u2 = _mm_unpacklo_epi16(s6, s7);
- u3 = _mm_unpackhi_epi16(s6, s7);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
-
- // FIXME(jingning): do subtract using bit inversion?
- in[0] = s0;
- in[1] = _mm_sub_epi16(k__const_0, s4);
- in[2] = s6;
- in[3] = _mm_sub_epi16(k__const_0, s2);
- in[4] = s3;
- in[5] = _mm_sub_epi16(k__const_0, s7);
- in[6] = s5;
- in[7] = _mm_sub_epi16(k__const_0, s1);
-
- // transpose
- array_transpose_8x8_avx2(in, in);
-}
-
-void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,
- int stride, int tx_type) {
- __m128i in[8];
-
- switch (tx_type) {
- case DCT_DCT:
- vp9_fdct8x8_avx2(input, output, stride);
- break;
- case ADST_DCT:
- load_buffer_8x8_avx2(input, in, stride);
- fadst8_avx2(in);
- fdct8_avx2(in);
- right_shift_8x8_avx2(in, 1);
- write_buffer_8x8_avx2(output, in, 8);
- break;
- case DCT_ADST:
- load_buffer_8x8_avx2(input, in, stride);
- fdct8_avx2(in);
- fadst8_avx2(in);
- right_shift_8x8_avx2(in, 1);
- write_buffer_8x8_avx2(output, in, 8);
- break;
- case ADST_ADST:
- load_buffer_8x8_avx2(input, in, stride);
- fadst8_avx2(in);
- fadst8_avx2(in);
- right_shift_8x8_avx2(in, 1);
- write_buffer_8x8_avx2(output, in, 8);
- break;
- default:
- assert(0);
- break;
- }
-}
-
-void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
- const int16_t *in = input;
- int16_t *out = intermediate;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
- for (column_start = 0; column_start < 16; column_start += 8) {
- __m128i in00, in01, in02, in03, in04, in05, in06, in07;
- __m128i in08, in09, in10, in11, in12, in13, in14, in15;
- __m128i input0, input1, input2, input3, input4, input5, input6, input7;
- __m128i step1_0, step1_1, step1_2, step1_3;
- __m128i step1_4, step1_5, step1_6, step1_7;
- __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- __m128i step3_0, step3_1, step3_2, step3_3;
- __m128i step3_4, step3_5, step3_6, step3_7;
- __m128i res00, res01, res02, res03, res04, res05, res06, res07;
- __m128i res08, res09, res10, res11, res12, res13, res14, res15;
- // Load and pre-condition input.
- if (0 == pass) {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
- // x = x << 2
- in00 = _mm_slli_epi16(in00, 2);
- in01 = _mm_slli_epi16(in01, 2);
- in02 = _mm_slli_epi16(in02, 2);
- in03 = _mm_slli_epi16(in03, 2);
- in04 = _mm_slli_epi16(in04, 2);
- in05 = _mm_slli_epi16(in05, 2);
- in06 = _mm_slli_epi16(in06, 2);
- in07 = _mm_slli_epi16(in07, 2);
- in08 = _mm_slli_epi16(in08, 2);
- in09 = _mm_slli_epi16(in09, 2);
- in10 = _mm_slli_epi16(in10, 2);
- in11 = _mm_slli_epi16(in11, 2);
- in12 = _mm_slli_epi16(in12, 2);
- in13 = _mm_slli_epi16(in13, 2);
- in14 = _mm_slli_epi16(in14, 2);
- in15 = _mm_slli_epi16(in15, 2);
- } else {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
- // x = (x + 1) >> 2
- in00 = _mm_add_epi16(in00, kOne);
- in01 = _mm_add_epi16(in01, kOne);
- in02 = _mm_add_epi16(in02, kOne);
- in03 = _mm_add_epi16(in03, kOne);
- in04 = _mm_add_epi16(in04, kOne);
- in05 = _mm_add_epi16(in05, kOne);
- in06 = _mm_add_epi16(in06, kOne);
- in07 = _mm_add_epi16(in07, kOne);
- in08 = _mm_add_epi16(in08, kOne);
- in09 = _mm_add_epi16(in09, kOne);
- in10 = _mm_add_epi16(in10, kOne);
- in11 = _mm_add_epi16(in11, kOne);
- in12 = _mm_add_epi16(in12, kOne);
- in13 = _mm_add_epi16(in13, kOne);
- in14 = _mm_add_epi16(in14, kOne);
- in15 = _mm_add_epi16(in15, kOne);
- in00 = _mm_srai_epi16(in00, 2);
- in01 = _mm_srai_epi16(in01, 2);
- in02 = _mm_srai_epi16(in02, 2);
- in03 = _mm_srai_epi16(in03, 2);
- in04 = _mm_srai_epi16(in04, 2);
- in05 = _mm_srai_epi16(in05, 2);
- in06 = _mm_srai_epi16(in06, 2);
- in07 = _mm_srai_epi16(in07, 2);
- in08 = _mm_srai_epi16(in08, 2);
- in09 = _mm_srai_epi16(in09, 2);
- in10 = _mm_srai_epi16(in10, 2);
- in11 = _mm_srai_epi16(in11, 2);
- in12 = _mm_srai_epi16(in12, 2);
- in13 = _mm_srai_epi16(in13, 2);
- in14 = _mm_srai_epi16(in14, 2);
- in15 = _mm_srai_epi16(in15, 2);
- }
- in += 8;
- // Calculate input for the first 8 results.
- {
- input0 = _mm_add_epi16(in00, in15);
- input1 = _mm_add_epi16(in01, in14);
- input2 = _mm_add_epi16(in02, in13);
- input3 = _mm_add_epi16(in03, in12);
- input4 = _mm_add_epi16(in04, in11);
- input5 = _mm_add_epi16(in05, in10);
- input6 = _mm_add_epi16(in06, in09);
- input7 = _mm_add_epi16(in07, in08);
- }
- // Calculate input for the next 8 results.
- {
- step1_0 = _mm_sub_epi16(in07, in08);
- step1_1 = _mm_sub_epi16(in06, in09);
- step1_2 = _mm_sub_epi16(in05, in10);
- step1_3 = _mm_sub_epi16(in04, in11);
- step1_4 = _mm_sub_epi16(in03, in12);
- step1_5 = _mm_sub_epi16(in02, in13);
- step1_6 = _mm_sub_epi16(in01, in14);
- step1_7 = _mm_sub_epi16(in00, in15);
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- // Add/subtract
- const __m128i q0 = _mm_add_epi16(input0, input7);
- const __m128i q1 = _mm_add_epi16(input1, input6);
- const __m128i q2 = _mm_add_epi16(input2, input5);
- const __m128i q3 = _mm_add_epi16(input3, input4);
- const __m128i q4 = _mm_sub_epi16(input3, input4);
- const __m128i q5 = _mm_sub_epi16(input2, input5);
- const __m128i q6 = _mm_sub_epi16(input1, input6);
- const __m128i q7 = _mm_sub_epi16(input0, input7);
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = _mm_add_epi16(q0, q3);
- const __m128i r1 = _mm_add_epi16(q1, q2);
- const __m128i r2 = _mm_sub_epi16(q1, q2);
- const __m128i r3 = _mm_sub_epi16(q0, q3);
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res00 = _mm_packs_epi32(w0, w1);
- res08 = _mm_packs_epi32(w2, w3);
- res04 = _mm_packs_epi32(w4, w5);
- res12 = _mm_packs_epi32(w6, w7);
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/subtract
- const __m128i x0 = _mm_add_epi16(q4, r0);
- const __m128i x1 = _mm_sub_epi16(q4, r0);
- const __m128i x2 = _mm_sub_epi16(q7, r1);
- const __m128i x3 = _mm_add_epi16(q7, r1);
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res02 = _mm_packs_epi32(w0, w1);
- res14 = _mm_packs_epi32(w2, w3);
- res10 = _mm_packs_epi32(w4, w5);
- res06 = _mm_packs_epi32(w6, w7);
- }
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_2 = _mm_packs_epi32(w0, w1);
- step2_3 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_5 = _mm_packs_epi32(w0, w1);
- step2_4 = _mm_packs_epi32(w2, w3);
- }
- // step 3
- {
- step3_0 = _mm_add_epi16(step1_0, step2_3);
- step3_1 = _mm_add_epi16(step1_1, step2_2);
- step3_2 = _mm_sub_epi16(step1_1, step2_2);
- step3_3 = _mm_sub_epi16(step1_0, step2_3);
- step3_4 = _mm_sub_epi16(step1_7, step2_4);
- step3_5 = _mm_sub_epi16(step1_6, step2_5);
- step3_6 = _mm_add_epi16(step1_6, step2_5);
- step3_7 = _mm_add_epi16(step1_7, step2_4);
- }
- // step 4
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_1 = _mm_packs_epi32(w0, w1);
- step2_2 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_6 = _mm_packs_epi32(w0, w1);
- step2_5 = _mm_packs_epi32(w2, w3);
- }
- // step 5
- {
- step1_0 = _mm_add_epi16(step3_0, step2_1);
- step1_1 = _mm_sub_epi16(step3_0, step2_1);
- step1_2 = _mm_sub_epi16(step3_3, step2_2);
- step1_3 = _mm_add_epi16(step3_3, step2_2);
- step1_4 = _mm_add_epi16(step3_4, step2_5);
- step1_5 = _mm_sub_epi16(step3_4, step2_5);
- step1_6 = _mm_sub_epi16(step3_7, step2_6);
- step1_7 = _mm_add_epi16(step3_7, step2_6);
- }
- // step 6
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res01 = _mm_packs_epi32(w0, w1);
- res09 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res05 = _mm_packs_epi32(w0, w1);
- res13 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res11 = _mm_packs_epi32(w0, w1);
- res03 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res15 = _mm_packs_epi32(w0, w1);
- res07 = _mm_packs_epi32(w2, w3);
- }
- }
- // Transpose the results, do it as two 8x8 transposes.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
- _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
- _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
- _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
- _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
- _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
- _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
- _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
- }
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- // Store results
- _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
- _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
- _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
- _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
- _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
- _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
- _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
- _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
- }
- out += 8*16;
- }
- // Setup in/out for next pass.
- in = intermediate;
- out = output;
- }
-}
- /* Loads a 16x16 block through four calls to load_buffer_8x8_avx2.
-  * in0[0..7] and in0[8..15] are filled from input and input + 8 * stride;
-  * in1[0..7] and in1[8..15] from the same two positions advanced by 8 elements.
-  * NOTE(review): load_buffer_8x8_avx2 is defined outside this view, so any
-  * scaling it applies while loading cannot be confirmed from here.
-  */
- static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0,
- __m128i *in1, int stride) {
- // load first 8 columns: one call at input, one at input + 8 * stride
- load_buffer_8x8_avx2(input, in0, stride);
- load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride);
-
- input += 8;
- // load second 8 columns: same two calls, 8 elements further along, into in1
- load_buffer_8x8_avx2(input, in1, stride);
- load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride);
-}
- /* Writes a 16x16 block through four calls to write_buffer_8x8_avx2.
-  * in0[0..7] and in0[8..15] go to output and output + 8 * stride;
-  * in1[0..7] and in1[8..15] go to the same two positions advanced by 8
-  * elements. This mirrors the layout used by load_buffer_16x16_avx2.
-  */
- static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0,
- __m128i *in1, int stride) {
- // write first 8 columns: one call at output, one at output + 8 * stride
- write_buffer_8x8_avx2(output, in0, stride);
- write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride);
- // write second 8 columns: same two calls, 8 elements further along, from in1
- output += 8;
- write_buffer_8x8_avx2(output, in1, stride);
- write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride);
-}
- /* Transposes 16x16 data held as two 16-vector arrays by running
-  * array_transpose_8x8_avx2 on each 8-vector group and exchanging the two
-  * off-diagonal groups: res1[0..7] ends up (transposed) in res0[8..15], and
-  * res0[8..15] ends up (transposed) in res1[0..7].
-  * The call order matters: res1[0..7] is saved into tbuf before the third
-  * call overwrites it, and res0[8..15] is consumed by that third call before
-  * the copies from tbuf overwrite it.
-  * NOTE(review): assumes array_transpose_8x8_avx2 takes (source, destination)
-  * and tolerates source == destination - confirm against its definition.
-  */
- static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8_avx2(res0, res0);
- array_transpose_8x8_avx2(res1, tbuf);
- array_transpose_8x8_avx2(res0 + 8, res1);
- array_transpose_8x8_avx2(res1 + 8, res1 + 8);
- // move the saved group into res0[8..15]
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
- /* Calls right_shift_8x8_avx2 with a second argument of 2 on each of the four
-  * 8-vector groups: res0[0..7], res0[8..15], res1[0..7], res1[8..15].
-  * NOTE(review): the rounding rule lives in right_shift_8x8_avx2, which is
-  * outside this view.
-  */
- static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
- // perform rounding operations
- right_shift_8x8_avx2(res0, 2);
- right_shift_8x8_avx2(res0 + 8, 2);
- right_shift_8x8_avx2(res1, 2);
- right_shift_8x8_avx2(res1 + 8, 2);
-}
- /* In-place 16-point 1-D forward DCT over in[0..15]. Each __m128i is handled
-  * as eight 16-bit lanes, so eight columns are transformed at once.
-  * Sums of mirrored inputs produce the even outputs in[0], in[2], ..., in[14];
-  * differences of mirrored inputs produce the odd outputs in[1], ..., in[15].
-  * Every rotation interleaves two vectors, multiplies with _mm_madd_epi16
-  * against a pair of cosine constants, adds DCT_CONST_ROUNDING, shifts right
-  * arithmetically by DCT_CONST_BITS and packs back to 16 bits with
-  * _mm_packs_epi32 (signed saturation).
-  */
- void fdct16_8col_avx2(__m128i *in) {
- // perform 16x16 1-D DCT for 8 columns
- __m128i i[8], s[8], p[8], t[8], u[16], v[16];
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- // stage 1: i[k] = in[k] + in[15 - k], the sums that feed the even outputs
- i[0] = _mm_add_epi16(in[0], in[15]);
- i[1] = _mm_add_epi16(in[1], in[14]);
- i[2] = _mm_add_epi16(in[2], in[13]);
- i[3] = _mm_add_epi16(in[3], in[12]);
- i[4] = _mm_add_epi16(in[4], in[11]);
- i[5] = _mm_add_epi16(in[5], in[10]);
- i[6] = _mm_add_epi16(in[6], in[9]);
- i[7] = _mm_add_epi16(in[7], in[8]);
- // s[k] = in[7 - k] - in[8 + k], the differences that feed the odd outputs
- s[0] = _mm_sub_epi16(in[7], in[8]);
- s[1] = _mm_sub_epi16(in[6], in[9]);
- s[2] = _mm_sub_epi16(in[5], in[10]);
- s[3] = _mm_sub_epi16(in[4], in[11]);
- s[4] = _mm_sub_epi16(in[3], in[12]);
- s[5] = _mm_sub_epi16(in[2], in[13]);
- s[6] = _mm_sub_epi16(in[1], in[14]);
- s[7] = _mm_sub_epi16(in[0], in[15]);
- // 8-point butterfly on the sums: p[0..3] are sums, p[4..7] are differences
- p[0] = _mm_add_epi16(i[0], i[7]);
- p[1] = _mm_add_epi16(i[1], i[6]);
- p[2] = _mm_add_epi16(i[2], i[5]);
- p[3] = _mm_add_epi16(i[3], i[4]);
- p[4] = _mm_sub_epi16(i[3], i[4]);
- p[5] = _mm_sub_epi16(i[2], i[5]);
- p[6] = _mm_sub_epi16(i[1], i[6]);
- p[7] = _mm_sub_epi16(i[0], i[7]);
- // 4-point butterfly on p[0..3]
- u[0] = _mm_add_epi16(p[0], p[3]);
- u[1] = _mm_add_epi16(p[1], p[2]);
- u[2] = _mm_sub_epi16(p[1], p[2]);
- u[3] = _mm_sub_epi16(p[0], p[3]);
- // interleave 16-bit lanes so _mm_madd_epi16 forms a * c0 + b * c1 per 32-bit lane
- v[0] = _mm_unpacklo_epi16(u[0], u[1]);
- v[1] = _mm_unpackhi_epi16(u[0], u[1]);
- v[2] = _mm_unpacklo_epi16(u[2], u[3]);
- v[3] = _mm_unpackhi_epi16(u[2], u[3]);
-
- u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
- u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
- u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
- u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
- u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
- u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
- u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
- u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
- // dct_const_round_shift: add the rounding constant, then shift right
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- // pack back to 16 bits: outputs 0, 4, 8 and 12
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[4] = _mm_packs_epi32(u[4], u[5]);
- in[8] = _mm_packs_epi32(u[2], u[3]);
- in[12] = _mm_packs_epi32(u[6], u[7]);
- // rotate (p[5], p[6]) by cospi_16_64: difference into u[0], sum into u[1]
- u[0] = _mm_unpacklo_epi16(p[5], p[6]);
- u[1] = _mm_unpackhi_epi16(p[5], p[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[2], v[3]);
- // butterfly the rotated pair with p[4] and p[7]
- t[0] = _mm_add_epi16(p[4], u[0]);
- t[1] = _mm_sub_epi16(p[4], u[0]);
- t[2] = _mm_sub_epi16(p[7], u[1]);
- t[3] = _mm_add_epi16(p[7], u[1]);
-
- u[0] = _mm_unpacklo_epi16(t[0], t[3]);
- u[1] = _mm_unpackhi_epi16(t[0], t[3]);
- u[2] = _mm_unpacklo_epi16(t[1], t[2]);
- u[3] = _mm_unpackhi_epi16(t[1], t[2]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
- v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
- v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- // outputs 2 and 14 come from (t[0], t[3]); outputs 10 and 6 from (t[1], t[2])
- in[2] = _mm_packs_epi32(v[0], v[1]);
- in[6] = _mm_packs_epi32(v[4], v[5]);
- in[10] = _mm_packs_epi32(v[2], v[3]);
- in[14] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 2: rotate (s[2], s[5]) and (s[3], s[4]) by cospi_16_64
- u[0] = _mm_unpacklo_epi16(s[2], s[5]);
- u[1] = _mm_unpackhi_epi16(s[2], s[5]);
- u[2] = _mm_unpacklo_epi16(s[3], s[4]);
- u[3] = _mm_unpackhi_epi16(s[3], s[4]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[2] = _mm_packs_epi32(v[0], v[1]);
- t[3] = _mm_packs_epi32(v[2], v[3]);
- t[4] = _mm_packs_epi32(v[4], v[5]);
- t[5] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 3: butterflies between s[0], s[1], s[6], s[7] and t[2..5]
- p[0] = _mm_add_epi16(s[0], t[3]);
- p[1] = _mm_add_epi16(s[1], t[2]);
- p[2] = _mm_sub_epi16(s[1], t[2]);
- p[3] = _mm_sub_epi16(s[0], t[3]);
- p[4] = _mm_sub_epi16(s[7], t[4]);
- p[5] = _mm_sub_epi16(s[6], t[5]);
- p[6] = _mm_add_epi16(s[6], t[5]);
- p[7] = _mm_add_epi16(s[7], t[4]);
-
- // stage 4: rotate (p[1], p[6]) and (p[2], p[5]) by cospi_8_64 / cospi_24_64
- u[0] = _mm_unpacklo_epi16(p[1], p[6]);
- u[1] = _mm_unpackhi_epi16(p[1], p[6]);
- u[2] = _mm_unpacklo_epi16(p[2], p[5]);
- u[3] = _mm_unpackhi_epi16(p[2], p[5]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
- v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
- v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
- v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
- v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[1] = _mm_packs_epi32(v[0], v[1]);
- t[2] = _mm_packs_epi32(v[2], v[3]);
- t[5] = _mm_packs_epi32(v[4], v[5]);
- t[6] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 5: butterflies between p[0], p[3], p[4], p[7] and t[1], t[2], t[5], t[6]
- s[0] = _mm_add_epi16(p[0], t[1]);
- s[1] = _mm_sub_epi16(p[0], t[1]);
- s[2] = _mm_sub_epi16(p[3], t[2]);
- s[3] = _mm_add_epi16(p[3], t[2]);
- s[4] = _mm_add_epi16(p[4], t[5]);
- s[5] = _mm_sub_epi16(p[4], t[5]);
- s[6] = _mm_sub_epi16(p[7], t[6]);
- s[7] = _mm_add_epi16(p[7], t[6]);
-
- // stage 6: each (s[k], s[7 - k]) pair is rotated twice, giving two odd outputs
- u[0] = _mm_unpacklo_epi16(s[0], s[7]);
- u[1] = _mm_unpackhi_epi16(s[0], s[7]);
- u[2] = _mm_unpacklo_epi16(s[1], s[6]);
- u[3] = _mm_unpackhi_epi16(s[1], s[6]);
- u[4] = _mm_unpacklo_epi16(s[2], s[5]);
- u[5] = _mm_unpackhi_epi16(s[2], s[5]);
- u[6] = _mm_unpacklo_epi16(s[3], s[4]);
- u[7] = _mm_unpackhi_epi16(s[3], s[4]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
- v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
- v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
- v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
- v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
- v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
- v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
- v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
- v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
- v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
- v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
- // odd outputs: 1/15 from (s[0], s[7]), 9/7 from (s[1], s[6]), 5/11 from (s[2], s[5]), 13/3 from (s[3], s[4])
- in[1] = _mm_packs_epi32(v[0], v[1]);
- in[9] = _mm_packs_epi32(v[2], v[3]);
- in[5] = _mm_packs_epi32(v[4], v[5]);
- in[13] = _mm_packs_epi32(v[6], v[7]);
- in[3] = _mm_packs_epi32(v[8], v[9]);
- in[11] = _mm_packs_epi32(v[10], v[11]);
- in[7] = _mm_packs_epi32(v[12], v[13]);
- in[15] = _mm_packs_epi32(v[14], v[15]);
-}
-
-void fadst16_8col_avx2(__m128i *in) {
- // perform 16x16 1-D ADST for 8 columns
- __m128i s[16], x[16], u[32], v[32];
- const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
- const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
-
- u[0] = _mm_unpacklo_epi16(in[15], in[0]);
- u[1] = _mm_unpackhi_epi16(in[15], in[0]);
- u[2] = _mm_unpacklo_epi16(in[13], in[2]);
- u[3] = _mm_unpackhi_epi16(in[13], in[2]);
- u[4] = _mm_unpacklo_epi16(in[11], in[4]);
- u[5] = _mm_unpackhi_epi16(in[11], in[4]);
- u[6] = _mm_unpacklo_epi16(in[9], in[6]);
- u[7] = _mm_unpackhi_epi16(in[9], in[6]);
- u[8] = _mm_unpacklo_epi16(in[7], in[8]);
- u[9] = _mm_unpackhi_epi16(in[7], in[8]);
- u[10] = _mm_unpacklo_epi16(in[5], in[10]);
- u[11] = _mm_unpackhi_epi16(in[5], in[10]);
- u[12] = _mm_unpacklo_epi16(in[3], in[12]);
- u[13] = _mm_unpackhi_epi16(in[3], in[12]);
- u[14] = _mm_unpacklo_epi16(in[1], in[14]);
- u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
- v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
- v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
- v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
- v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
- v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
- v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
- v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
- v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
- v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
- v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
- v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
- v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
- v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
- v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
- v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
- v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
- u[0] = _mm_add_epi32(v[0], v[16]);
- u[1] = _mm_add_epi32(v[1], v[17]);
- u[2] = _mm_add_epi32(v[2], v[18]);
- u[3] = _mm_add_epi32(v[3], v[19]);
- u[4] = _mm_add_epi32(v[4], v[20]);
- u[5] = _mm_add_epi32(v[5], v[21]);
- u[6] = _mm_add_epi32(v[6], v[22]);
- u[7] = _mm_add_epi32(v[7], v[23]);
- u[8] = _mm_add_epi32(v[8], v[24]);
- u[9] = _mm_add_epi32(v[9], v[25]);
- u[10] = _mm_add_epi32(v[10], v[26]);
- u[11] = _mm_add_epi32(v[11], v[27]);
- u[12] = _mm_add_epi32(v[12], v[28]);
- u[13] = _mm_add_epi32(v[13], v[29]);
- u[14] = _mm_add_epi32(v[14], v[30]);
- u[15] = _mm_add_epi32(v[15], v[31]);
- u[16] = _mm_sub_epi32(v[0], v[16]);
- u[17] = _mm_sub_epi32(v[1], v[17]);
- u[18] = _mm_sub_epi32(v[2], v[18]);
- u[19] = _mm_sub_epi32(v[3], v[19]);
- u[20] = _mm_sub_epi32(v[4], v[20]);
- u[21] = _mm_sub_epi32(v[5], v[21]);
- u[22] = _mm_sub_epi32(v[6], v[22]);
- u[23] = _mm_sub_epi32(v[7], v[23]);
- u[24] = _mm_sub_epi32(v[8], v[24]);
- u[25] = _mm_sub_epi32(v[9], v[25]);
- u[26] = _mm_sub_epi32(v[10], v[26]);
- u[27] = _mm_sub_epi32(v[11], v[27]);
- u[28] = _mm_sub_epi32(v[12], v[28]);
- u[29] = _mm_sub_epi32(v[13], v[29]);
- u[30] = _mm_sub_epi32(v[14], v[30]);
- u[31] = _mm_sub_epi32(v[15], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
- v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
- v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
- v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
- v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
- v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
- v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
- v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
- v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
- v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
- v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
- v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
- v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
- v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
- v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
- v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
- u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
- u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
- u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
- u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
- u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
- u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
- u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
- u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
- u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
- u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
- u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
- u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
- u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
- u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
- u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_packs_epi32(u[8], u[9]);
- s[5] = _mm_packs_epi32(u[10], u[11]);
- s[6] = _mm_packs_epi32(u[12], u[13]);
- s[7] = _mm_packs_epi32(u[14], u[15]);
- s[8] = _mm_packs_epi32(u[16], u[17]);
- s[9] = _mm_packs_epi32(u[18], u[19]);
- s[10] = _mm_packs_epi32(u[20], u[21]);
- s[11] = _mm_packs_epi32(u[22], u[23]);
- s[12] = _mm_packs_epi32(u[24], u[25]);
- s[13] = _mm_packs_epi32(u[26], u[27]);
- s[14] = _mm_packs_epi32(u[28], u[29]);
- s[15] = _mm_packs_epi32(u[30], u[31]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[9]);
- u[1] = _mm_unpackhi_epi16(s[8], s[9]);
- u[2] = _mm_unpacklo_epi16(s[10], s[11]);
- u[3] = _mm_unpackhi_epi16(s[10], s[11]);
- u[4] = _mm_unpacklo_epi16(s[12], s[13]);
- u[5] = _mm_unpackhi_epi16(s[12], s[13]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- x[0] = _mm_add_epi16(s[0], s[4]);
- x[1] = _mm_add_epi16(s[1], s[5]);
- x[2] = _mm_add_epi16(s[2], s[6]);
- x[3] = _mm_add_epi16(s[3], s[7]);
- x[4] = _mm_sub_epi16(s[0], s[4]);
- x[5] = _mm_sub_epi16(s[1], s[5]);
- x[6] = _mm_sub_epi16(s[2], s[6]);
- x[7] = _mm_sub_epi16(s[3], s[7]);
- x[8] = _mm_packs_epi32(u[0], u[1]);
- x[9] = _mm_packs_epi32(u[2], u[3]);
- x[10] = _mm_packs_epi32(u[4], u[5]);
- x[11] = _mm_packs_epi32(u[6], u[7]);
- x[12] = _mm_packs_epi32(u[8], u[9]);
- x[13] = _mm_packs_epi32(u[10], u[11]);
- x[14] = _mm_packs_epi32(u[12], u[13]);
- x[15] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- u[0] = _mm_unpacklo_epi16(x[4], x[5]);
- u[1] = _mm_unpackhi_epi16(x[4], x[5]);
- u[2] = _mm_unpacklo_epi16(x[6], x[7]);
- u[3] = _mm_unpackhi_epi16(x[6], x[7]);
- u[4] = _mm_unpacklo_epi16(x[12], x[13]);
- u[5] = _mm_unpackhi_epi16(x[12], x[13]);
- u[6] = _mm_unpacklo_epi16(x[14], x[15]);
- u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], v[4]);
- u[1] = _mm_add_epi32(v[1], v[5]);
- u[2] = _mm_add_epi32(v[2], v[6]);
- u[3] = _mm_add_epi32(v[3], v[7]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
- u[7] = _mm_sub_epi32(v[3], v[7]);
- u[8] = _mm_add_epi32(v[8], v[12]);
- u[9] = _mm_add_epi32(v[9], v[13]);
- u[10] = _mm_add_epi32(v[10], v[14]);
- u[11] = _mm_add_epi32(v[11], v[15]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
- u[15] = _mm_sub_epi32(v[11], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_add_epi16(x[0], x[2]);
- s[1] = _mm_add_epi16(x[1], x[3]);
- s[2] = _mm_sub_epi16(x[0], x[2]);
- s[3] = _mm_sub_epi16(x[1], x[3]);
- s[4] = _mm_packs_epi32(v[0], v[1]);
- s[5] = _mm_packs_epi32(v[2], v[3]);
- s[6] = _mm_packs_epi32(v[4], v[5]);
- s[7] = _mm_packs_epi32(v[6], v[7]);
- s[8] = _mm_add_epi16(x[8], x[10]);
- s[9] = _mm_add_epi16(x[9], x[11]);
- s[10] = _mm_sub_epi16(x[8], x[10]);
- s[11] = _mm_sub_epi16(x[9], x[11]);
- s[12] = _mm_packs_epi32(v[8], v[9]);
- s[13] = _mm_packs_epi32(v[10], v[11]);
- s[14] = _mm_packs_epi32(v[12], v[13]);
- s[15] = _mm_packs_epi32(v[14], v[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(s[2], s[3]);
- u[1] = _mm_unpackhi_epi16(s[2], s[3]);
- u[2] = _mm_unpacklo_epi16(s[6], s[7]);
- u[3] = _mm_unpackhi_epi16(s[6], s[7]);
- u[4] = _mm_unpacklo_epi16(s[10], s[11]);
- u[5] = _mm_unpackhi_epi16(s[10], s[11]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[0] = s[0];
- in[1] = _mm_sub_epi16(kZero, s[8]);
- in[2] = s[12];
- in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
- in[12] = s[5];
- in[13] = _mm_sub_epi16(kZero, s[13]);
- in[14] = s[9];
- in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-void fdct16_avx2(__m128i *in0, __m128i *in1) {
- fdct16_8col_avx2(in0);
- fdct16_8col_avx2(in1);
- array_transpose_16x16_avx2(in0, in1);
-}
-
-void fadst16_avx2(__m128i *in0, __m128i *in1) {
- fadst16_8col_avx2(in0);
- fadst16_8col_avx2(in1);
- array_transpose_16x16_avx2(in0, in1);
-}
-
-void vp9_fht16x16_avx2(const int16_t *input, int16_t *output,
- int stride, int tx_type) {
- __m128i in0[16], in1[16];
-
- switch (tx_type) {
- case DCT_DCT:
- vp9_fdct16x16_avx2(input, output, stride);
- break;
- case ADST_DCT:
- load_buffer_16x16_avx2(input, in0, in1, stride);
- fadst16_avx2(in0, in1);
- right_shift_16x16_avx2(in0, in1);
- fdct16_avx2(in0, in1);
- write_buffer_16x16_avx2(output, in0, in1, 16);
- break;
- case DCT_ADST:
- load_buffer_16x16_avx2(input, in0, in1, stride);
- fdct16_avx2(in0, in1);
- right_shift_16x16_avx2(in0, in1);
- fadst16_avx2(in0, in1);
- write_buffer_16x16_avx2(output, in0, in1, 16);
- break;
- case ADST_ADST:
- load_buffer_16x16_avx2(input, in0, in1, stride);
- fadst16_avx2(in0, in1);
- right_shift_16x16_avx2(in0, in1);
- fadst16_avx2(in0, in1);
- write_buffer_16x16_avx2(output, in0, in1, 16);
- break;
- default:
- assert(0);
- break;
- }
-}
#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
#define FDCT32x32_HIGH_PRECISION 0