-rw-r--r--  vp9/encoder/arm/neon/vp9_dct_neon.c  |   17
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.c         |   18
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.h         |  297
-rw-r--r--  vpx_dsp/arm/fdct32x32_neon.c         | 1158
-rw-r--r--  vpx_dsp/arm/fdct32x32_neon.h         | 1105
-rw-r--r--  vpx_dsp/arm/fdct4x4_neon.c           |   13
-rw-r--r--  vpx_dsp/arm/fdct4x4_neon.h           |  105
-rw-r--r--  vpx_dsp/arm/fdct8x8_neon.c           |   47
-rw-r--r--  vpx_dsp/arm/fdct8x8_neon.h           |  381
-rw-r--r--  vpx_dsp/arm/fdct_neon.h              |  757
-rw-r--r--  vpx_dsp/arm/transpose_neon.h         |   45
11 files changed, 2112 insertions, 1831 deletions
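
A note for reading the hunks below: the change renames the shared butterfly helpers and corrects their comment formulas to match the argument order actually used. As a reference, here is a scalar sketch of what butterfly_one_coeff and butterfly_two_coeff compute, reconstructed from the formula comments in this diff (fdct_round_shift and DCT_CONST_BITS = 14 as in vpx_dsp/txfm_common.h; the _ref names are illustrative only):

```c
#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* fdct_round_shift(x) = (x + 2^13) >> 14, computed in 32 bits. */
static int32_t fdct_round_shift(int32_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

/* butterfly_one_coeff(a, b, c, &add, &sub), e.g. with c = cospi_16_64:
 *   add = fdct_round_shift((a + b) * c)
 *   sub = fdct_round_shift((a - b) * c) */
static void butterfly_one_coeff_ref(int32_t a, int32_t b, int32_t c,
                                    int32_t *add, int32_t *sub) {
  *add = fdct_round_shift((a + b) * c);
  *sub = fdct_round_shift((a - b) * c);
}

/* butterfly_two_coeff(a, b, c0, c1, &add, &sub):
 *   add = fdct_round_shift(a * c0 + b * c1)
 *   sub = fdct_round_shift(a * c1 - b * c0)
 * The corrected comments below follow this order: the first constant
 * multiplies the first input in the "add" output. */
static void butterfly_two_coeff_ref(int32_t a, int32_t b, int32_t c0,
                                    int32_t c1, int32_t *add, int32_t *sub) {
  *add = fdct_round_shift(a * c0 + b * c1);
  *sub = fdct_round_shift(a * c1 - b * c0);
}
```

For example, butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]) produces out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64) and out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64), which is exactly what the updated comments document.
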
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index a07a1608d..b8286a8dd 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -18,6 +18,8 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, int stride) { @@ -130,12 +132,14 @@ void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, case ADST_DCT: load_buffer_4x4(input, in, stride); fadst4x4_neon(in); - vpx_fdct4x4_pass1_neon((int16x4_t *)in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); write_buffer_4x4(output, in); break; case DCT_ADST: load_buffer_4x4(input, in, stride); - vpx_fdct4x4_pass1_neon((int16x4_t *)in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); fadst4x4_neon(in); write_buffer_4x4(output, in); break; @@ -488,13 +492,15 @@ void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, case ADST_DCT: load_buffer_8x8(input, in, stride); fadst8x8_neon(in); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; case DCT_ADST: load_buffer_8x8(input, in, stride); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); fadst8x8_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); @@ -559,7 +565,8 @@ static void fdct16_8col(int16x8_t *in) { i[6] = vaddq_s16(in[6], in[9]); i[7] = vaddq_s16(in[7], in[8]); - vpx_fdct8x8_pass1_neon(i); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(i); transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]); // step 2 diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index d0c07d429..a458ecaa4 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -37,20 +37,21 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Left half. load_cross(input, stride, temp0); scale_input(temp0, temp1); - vpx_fdct16x16_body(temp1, temp0); + vpx_fdct8x16_body(temp1, temp0); // Right half. load_cross(input + 8, stride, temp1); scale_input(temp1, temp2); - vpx_fdct16x16_body(temp2, temp1); + vpx_fdct8x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to // process to the top half. + transpose_s16_8x8_new(&temp0[0], &temp2[0]); transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); - vpx_fdct16x16_body(temp3, temp2); + vpx_fdct8x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], @@ -62,11 +63,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. 
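
The repeated "pass1 variant is not accurate enough" comments above deserve a gloss: the pass-1 butterflies use a faster 16-bit rounding (the vqrdmulh idiom with a pre-doubled constant), while pass 2 widens to 32 bits and applies the exact fdct_round_shift. A per-lane sketch of the two roundings, under the assumption that the fast path follows the standard pre-doubled-constant trick (the slight error comes from rounding the doubled constant and staying in 16 bits):

```c
#include <stdint.h>

/* Exact rounding, as in the pass-2 helpers: widen, multiply, round. */
static int16_t butterfly_mul_exact(int16_t a, int16_t c /* cospi_x_64 */) {
  return (int16_t)(((int32_t)a * c + (1 << 13)) >> 14);
}

/* vqrdmulh-style rounding (assumed shape of the *_fast pass-1 helpers):
 * c2 is a pre-doubled cospi constant, itself rounded to fit int16_t. */
static int16_t butterfly_mul_fast(int16_t a, int16_t c2 /* ~2*cospi_x_64 */) {
  return (int16_t)((2 * (int32_t)a * c2 + (1 << 15)) >> 16);
}
```

In the hybrid transforms above (vp9_fht4x4_neon, vp9_fht8x8_neon), an ADST stage feeds the DCT, so the off-by-one roundings matter; hence the switch to the pass-2 variant even though it is slower.
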
transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); cross_input(temp1, temp0); - vpx_fdct16x16_body(temp0, temp1); + vpx_fdct8x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], @@ -86,12 +88,12 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, // Left half. load_cross(input, stride, temp0); highbd_scale_input(temp0, left1, right1); - vpx_highbd_fdct16x16_body(left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); // right half. load_cross(input + 8, stride, temp0); highbd_scale_input(temp0, left2, right2); - vpx_highbd_fdct16x16_body(left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); // Transpose top left and top right quarters into one contiguous location to // process to the top half. @@ -103,14 +105,14 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, highbd_partial_round_shift(left3, right3); highbd_cross_input(left3, right3, left1, right1); - vpx_highbd_fdct16x16_body(left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. highbd_partial_round_shift(left4, right4); highbd_cross_input(left4, right4, left2, right2); - vpx_highbd_fdct16x16_body(left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); transpose_s32_8x8_2(left1, right1, left3, right3); transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index d99870903..43d820b6b 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -160,8 +160,8 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { } // Main body of fdct16x16. 
-static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, - int16x8_t *out /*[16]*/) { +static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { int16x8_t s[8]; int16x8_t x[4]; int16x8_t step[8]; @@ -186,16 +186,17 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); // Stage 2 // Re-using source s5/s6 // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); // Stage 3 x[0] = vaddq_s16(s[4], s[5]); @@ -204,12 +205,12 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, x[3] = vaddq_s16(s[7], s[6]); // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); // step 2 // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" @@ -221,8 +222,8 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); // step 3 s[0] = vaddq_s16(in[8], s[3]); @@ -235,13 +236,15 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, s[7] = vaddq_s16(in[15], s[4]); // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); // step2[2] = 
fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); // step 5 step[0] = vaddq_s16(s[0], s[1]); @@ -254,22 +257,23 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, step[7] = vaddq_s16(s[7], s[6]); // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) - // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], &out[7]); - butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], &out[15]); - butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], &out[3]); - butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], &out[11]); } @@ -279,36 +283,37 @@ static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, int32x4_t *left /*[16]*/, int32x4_t *right /* [16] */) { left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); - right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); - right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); - right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); - right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); - right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); - right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); - right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); - right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); - right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); - right[9] = 
vshll_n_s16(vget_high_s16(a[9]), 2); left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); - right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); - right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); - right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); - right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); - right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); } @@ -357,81 +362,38 @@ static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/, int32x4_t *right /* [16] */) { const int32x4_t one = vdupq_n_s32(1); left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2); - right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2); - right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2); - right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2); - right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2); - right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2); - right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2); - right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2); - right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2); - right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2); - right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2); - right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2); - right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2); - right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2); - right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2); - right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2); - right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); -} -static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, - int32x4_t *right /*[8]*/, - int32x4_t *out_left /*[8]*/, - int32x4_t *out_right /*[8]*/) { - int32x4x2_t out[8]; - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - 
out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - out_left[0] = out[0].val[0]; - out_left[1] = out[1].val[0]; - out_left[2] = out[2].val[0]; - out_left[3] = out[3].val[0]; - out_left[4] = out[4].val[0]; - out_left[5] = out[5].val[0]; - out_left[6] = out[6].val[0]; - out_left[7] = out[7].val[0]; - out_right[0] = out[0].val[1]; - out_right[1] = out[1].val[1]; - out_right[2] = out[2].val[1]; - out_right[3] = out[3].val[1]; - out_right[4] = out[4].val[1]; - out_right[5] = out[5].val[1]; - out_right[6] = out[6].val[1]; - out_right[7] = out[7].val[1]; + right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); + right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); + right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); + right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); + right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); + right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); + right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); + right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); + right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); + right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); + right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); + right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); + right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); + right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); + right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); + right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); } // Store 16 32x4 vectors, assuming stride == 16. @@ -469,9 +431,9 @@ static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { vst1q_s32(a, b[15]); } -// Main body of fdct16x16. 
-static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, - int32x4_t *right /* [16] */) { +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { int32x4_t sl[8]; int32x4_t sr[8]; int32x4_t xl[4]; @@ -531,22 +493,21 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[8]); - highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], - &right[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, - &left[4], &left[12]); - highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, - &right[4], &right[12]); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); // Stage 2 // Re-using source s5/s6 // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &sl[6], &sl[5]); - highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &sr[6], &sr[5]); + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); // Stage 3 xl[0] = vaddq_s32(sl[4], sl[5]); @@ -559,18 +520,16 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, xr[3] = vaddq_s32(sr[7], sr[6]); // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, - &left[2], &left[14]); - highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, - &right[2], &right[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, - &left[10], &left[6]); - highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, - &right[10], &right[6]); + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); // step 2 // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" @@ -582,10 +541,10 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - highbd_butterfly_one_coeff_s32(inl[5], inl[2], cospi_16_64, &sl[5], &sl[2]); - 
highbd_butterfly_one_coeff_s32(inr[5], inr[2], cospi_16_64, &sr[5], &sr[2]); - highbd_butterfly_one_coeff_s32(inl[4], inl[3], cospi_16_64, &sl[4], &sl[3]); - highbd_butterfly_one_coeff_s32(inr[4], inr[3], cospi_16_64, &sr[4], &sr[3]); + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); // step 3 sl[0] = vaddq_s32(inl[0], sl[3]); @@ -606,19 +565,18 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, sr[7] = vaddq_s32(inr[7], sr[4]); // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - highbd_butterfly_two_coeff_s32(sl[6], sl[1], cospi_8_64, cospi_24_64, &sl[6], - &sl[1]); - highbd_butterfly_two_coeff_s32(sr[6], sr[1], cospi_8_64, cospi_24_64, &sr[6], - &sr[1]); - + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - highbd_butterfly_two_coeff_s32(xl[0], xl[3], cospi_24_64, cospi_8_64, &sl[2], - &sl[5]); - highbd_butterfly_two_coeff_s32(xr[0], xr[3], cospi_24_64, cospi_8_64, &sr[2], - &sr[5]); + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); // step 5 stepl[0] = vaddq_s32(sl[0], sl[1]); @@ -639,31 +597,26 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, stepr[7] = vaddq_s32(sr[7], sr[6]); // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * - // cospi_26_64) out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * - // cospi_6_64) - highbd_butterfly_two_coeff_s32(stepl[7], stepl[0], cospi_2_64, cospi_30_64, - &left[1], &left[15]); - highbd_butterfly_two_coeff_s32(stepr[7], stepr[0], cospi_2_64, cospi_30_64, - &right[1], &right[15]); - highbd_butterfly_two_coeff_s32(stepl[6], stepl[1], cospi_18_64, cospi_14_64, - &left[9], &left[7]); - highbd_butterfly_two_coeff_s32(stepr[6], stepr[1], cospi_18_64, cospi_14_64, - &right[9], &right[7]); - highbd_butterfly_two_coeff_s32(stepl[5], stepl[2], cospi_10_64, cospi_22_64, - &left[5], &left[11]); - highbd_butterfly_two_coeff_s32(stepr[5], stepr[2], cospi_10_64, cospi_22_64, - &right[5], &right[11]); - highbd_butterfly_two_coeff_s32(stepl[4], stepl[3], cospi_26_64, cospi_6_64, - &left[13], &left[3]); - highbd_butterfly_two_coeff_s32(stepr[4], stepr[3], cospi_26_64, cospi_6_64, - &right[13], &right[3]); + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * 
cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index 51d81bd08..e2bf16760 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -16,6 +16,7 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct32x32_neon.h" // Most gcc 4.9 distributions outside of Android do not generate correct code // for this function. @@ -33,1123 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, #else -#define LOAD_INCREMENT(src, stride, dest, index) \ - do { \ - dest[index] = vld1q_s16(src); \ - src += stride; \ - } while (0) - -#define ADD_S16(src, index0, index1, dest, index3) \ - do { \ - dest[index3] = vaddq_s16(src[index0], src[index1]); \ - } while (0) - -#define ADD_SHIFT_S16(src, index0, index1) \ - do { \ - src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ - } while (0) - -// Load, cross, and multiply by 4. Load the first 8 and last 8, then the -// middle -// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? 
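
The removed load() helper below folded stage 1 of the 32-point transform (the in[k]/in[31-k] cross) into the *4 input scaling, as its comment describes. Per column, the ADD_S16 / ADD_SHIFT_S16 macros above reduce to this sketch (illustrative name; the NEON version processes eight columns per int16x8_t):

```c
#include <stdint.h>

/* Scalar sketch of load()'s cross-and-scale:
 *   b[k]      = (in[k] + in[31 - k]) * 4   (ADD_S16, then << 2)
 *   b[31 - k] = (in[k] - in[31 - k]) * 4   (ADD_SHIFT_S16) */
static void load_cross_scale_ref(const int16_t in[32], int16_t b[32]) {
  int k;
  for (k = 0; k < 16; ++k) {
    b[k] = (int16_t)((in[k] + in[31 - k]) * 4);
    b[31 - k] = (int16_t)((in[k] - in[31 - k]) * 4);
  }
}
```

The replacement code at the bottom of this file keeps the same math but splits it into load_cross() and scale_input(), the helpers already shared with the 16x16 transform.
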
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { - const int16_t *a_end = a + 24 * stride; - int16x8_t c[8]; - - LOAD_INCREMENT(a, stride, b, 0); - LOAD_INCREMENT(a, stride, b, 1); - LOAD_INCREMENT(a, stride, b, 2); - LOAD_INCREMENT(a, stride, b, 3); - LOAD_INCREMENT(a, stride, b, 4); - LOAD_INCREMENT(a, stride, b, 5); - LOAD_INCREMENT(a, stride, b, 6); - LOAD_INCREMENT(a, stride, b, 7); - - LOAD_INCREMENT(a_end, stride, b, 24); - LOAD_INCREMENT(a_end, stride, b, 25); - LOAD_INCREMENT(a_end, stride, b, 26); - LOAD_INCREMENT(a_end, stride, b, 27); - LOAD_INCREMENT(a_end, stride, b, 28); - LOAD_INCREMENT(a_end, stride, b, 29); - LOAD_INCREMENT(a_end, stride, b, 30); - LOAD_INCREMENT(a_end, stride, b, 31); - - ADD_S16(b, 0, 31, c, 0); - ADD_S16(b, 1, 30, c, 1); - ADD_S16(b, 2, 29, c, 2); - ADD_S16(b, 3, 28, c, 3); - ADD_S16(b, 4, 27, c, 4); - ADD_S16(b, 5, 26, c, 5); - ADD_S16(b, 6, 25, c, 6); - ADD_S16(b, 7, 24, c, 7); - - ADD_SHIFT_S16(b, 7, 24); - ADD_SHIFT_S16(b, 6, 25); - ADD_SHIFT_S16(b, 5, 26); - ADD_SHIFT_S16(b, 4, 27); - ADD_SHIFT_S16(b, 3, 28); - ADD_SHIFT_S16(b, 2, 29); - ADD_SHIFT_S16(b, 1, 30); - ADD_SHIFT_S16(b, 0, 31); - - b[0] = vshlq_n_s16(c[0], 2); - b[1] = vshlq_n_s16(c[1], 2); - b[2] = vshlq_n_s16(c[2], 2); - b[3] = vshlq_n_s16(c[3], 2); - b[4] = vshlq_n_s16(c[4], 2); - b[5] = vshlq_n_s16(c[5], 2); - b[6] = vshlq_n_s16(c[6], 2); - b[7] = vshlq_n_s16(c[7], 2); - - LOAD_INCREMENT(a, stride, b, 8); - LOAD_INCREMENT(a, stride, b, 9); - LOAD_INCREMENT(a, stride, b, 10); - LOAD_INCREMENT(a, stride, b, 11); - LOAD_INCREMENT(a, stride, b, 12); - LOAD_INCREMENT(a, stride, b, 13); - LOAD_INCREMENT(a, stride, b, 14); - LOAD_INCREMENT(a, stride, b, 15); - LOAD_INCREMENT(a, stride, b, 16); - LOAD_INCREMENT(a, stride, b, 17); - LOAD_INCREMENT(a, stride, b, 18); - LOAD_INCREMENT(a, stride, b, 19); - LOAD_INCREMENT(a, stride, b, 20); - LOAD_INCREMENT(a, stride, b, 21); - LOAD_INCREMENT(a, stride, b, 22); - LOAD_INCREMENT(a, stride, b, 23); - - ADD_S16(b, 8, 23, c, 0); - ADD_S16(b, 9, 22, c, 1); - ADD_S16(b, 10, 21, c, 2); - ADD_S16(b, 11, 20, c, 3); - ADD_S16(b, 12, 19, c, 4); - ADD_S16(b, 13, 18, c, 5); - ADD_S16(b, 14, 17, c, 6); - ADD_S16(b, 15, 16, c, 7); - - ADD_SHIFT_S16(b, 15, 16); - ADD_SHIFT_S16(b, 14, 17); - ADD_SHIFT_S16(b, 13, 18); - ADD_SHIFT_S16(b, 12, 19); - ADD_SHIFT_S16(b, 11, 20); - ADD_SHIFT_S16(b, 10, 21); - ADD_SHIFT_S16(b, 9, 22); - ADD_SHIFT_S16(b, 8, 23); - - b[8] = vshlq_n_s16(c[0], 2); - b[9] = vshlq_n_s16(c[1], 2); - b[10] = vshlq_n_s16(c[2], 2); - b[11] = vshlq_n_s16(c[3], 2); - b[12] = vshlq_n_s16(c[4], 2); - b[13] = vshlq_n_s16(c[5], 2); - b[14] = vshlq_n_s16(c[6], 2); - b[15] = vshlq_n_s16(c[7], 2); -} - -#undef LOAD_INCREMENT -#undef ADD_S16 -#undef ADD_SHIFT_S16 - -#define STORE_S16(src, index, dest) \ - do { \ - store_s16q_to_tran_low(dest, src[index]); \ - dest += 8; \ - } while (0) - -// Store 32 16x8 values, assuming stride == 32. -// Slight twist: store horizontally in blocks of 8. 
-static INLINE void store(tran_low_t *a, const int16x8_t *b) { - STORE_S16(b, 0, a); - STORE_S16(b, 8, a); - STORE_S16(b, 16, a); - STORE_S16(b, 24, a); - STORE_S16(b, 1, a); - STORE_S16(b, 9, a); - STORE_S16(b, 17, a); - STORE_S16(b, 25, a); - STORE_S16(b, 2, a); - STORE_S16(b, 10, a); - STORE_S16(b, 18, a); - STORE_S16(b, 26, a); - STORE_S16(b, 3, a); - STORE_S16(b, 11, a); - STORE_S16(b, 19, a); - STORE_S16(b, 27, a); - STORE_S16(b, 4, a); - STORE_S16(b, 12, a); - STORE_S16(b, 20, a); - STORE_S16(b, 28, a); - STORE_S16(b, 5, a); - STORE_S16(b, 13, a); - STORE_S16(b, 21, a); - STORE_S16(b, 29, a); - STORE_S16(b, 6, a); - STORE_S16(b, 14, a); - STORE_S16(b, 22, a); - STORE_S16(b, 30, a); - STORE_S16(b, 7, a); - STORE_S16(b, 15, a); - STORE_S16(b, 23, a); - STORE_S16(b, 31, a); -} - -#undef STORE_S16 - -static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1: Done as part of the load. - - // Stage 2. - // Mini cross. X the first 16 values and the middle 8 of the second half. - a[0] = vaddq_s16(in[0], in[15]); - a[1] = vaddq_s16(in[1], in[14]); - a[2] = vaddq_s16(in[2], in[13]); - a[3] = vaddq_s16(in[3], in[12]); - a[4] = vaddq_s16(in[4], in[11]); - a[5] = vaddq_s16(in[5], in[10]); - a[6] = vaddq_s16(in[6], in[9]); - a[7] = vaddq_s16(in[7], in[8]); - - a[8] = vsubq_s16(in[7], in[8]); - a[9] = vsubq_s16(in[6], in[9]); - a[10] = vsubq_s16(in[5], in[10]); - a[11] = vsubq_s16(in[4], in[11]); - a[12] = vsubq_s16(in[3], in[12]); - a[13] = vsubq_s16(in[2], in[13]); - a[14] = vsubq_s16(in[1], in[14]); - a[15] = vsubq_s16(in[0], in[15]); - - a[16] = in[16]; - a[17] = in[17]; - a[18] = in[18]; - a[19] = in[19]; - - butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); - butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); - butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); - butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); - - a[28] = in[28]; - a[29] = in[29]; - a[30] = in[30]; - a[31] = in[31]; - - // Stage 3. - b[0] = vaddq_s16(a[0], a[7]); - b[1] = vaddq_s16(a[1], a[6]); - b[2] = vaddq_s16(a[2], a[5]); - b[3] = vaddq_s16(a[3], a[4]); - - b[4] = vsubq_s16(a[3], a[4]); - b[5] = vsubq_s16(a[2], a[5]); - b[6] = vsubq_s16(a[1], a[6]); - b[7] = vsubq_s16(a[0], a[7]); - - b[8] = a[8]; - b[9] = a[9]; - - butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); - butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); - - b[14] = a[14]; - b[15] = a[15]; - - b[16] = vaddq_s16(in[16], a[23]); - b[17] = vaddq_s16(in[17], a[22]); - b[18] = vaddq_s16(in[18], a[21]); - b[19] = vaddq_s16(in[19], a[20]); - - b[20] = vsubq_s16(in[19], a[20]); - b[21] = vsubq_s16(in[18], a[21]); - b[22] = vsubq_s16(in[17], a[22]); - b[23] = vsubq_s16(in[16], a[23]); - - b[24] = vsubq_s16(in[31], a[24]); - b[25] = vsubq_s16(in[30], a[25]); - b[26] = vsubq_s16(in[29], a[26]); - b[27] = vsubq_s16(in[28], a[27]); - - b[28] = vaddq_s16(in[28], a[27]); - b[29] = vaddq_s16(in[29], a[26]); - b[30] = vaddq_s16(in[30], a[25]); - b[31] = vaddq_s16(in[31], a[24]); - - // Stage 4. 
- a[0] = vaddq_s16(b[0], b[3]); - a[1] = vaddq_s16(b[1], b[2]); - a[2] = vsubq_s16(b[1], b[2]); - a[3] = vsubq_s16(b[0], b[3]); - - a[4] = b[4]; - - butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); - - a[7] = b[7]; - - a[8] = vaddq_s16(b[8], b[11]); - a[9] = vaddq_s16(b[9], b[10]); - a[10] = vsubq_s16(b[9], b[10]); - a[11] = vsubq_s16(b[8], b[11]); - a[12] = vsubq_s16(b[15], b[12]); - a[13] = vsubq_s16(b[14], b[13]); - a[14] = vaddq_s16(b[14], b[13]); - a[15] = vaddq_s16(b[15], b[12]); - - a[16] = b[16]; - a[17] = b[17]; - - butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); - butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); - butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); - butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); - - a[22] = b[22]; - a[23] = b[23]; - a[24] = b[24]; - a[25] = b[25]; - - a[30] = b[30]; - a[31] = b[31]; - - // Stage 5. - butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); - butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); - - b[4] = vaddq_s16(a[4], a[5]); - b[5] = vsubq_s16(a[4], a[5]); - b[6] = vsubq_s16(a[7], a[6]); - b[7] = vaddq_s16(a[7], a[6]); - - b[8] = a[8]; - - butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); - butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); - - b[11] = a[11]; - b[12] = a[12]; - - b[15] = a[15]; - - b[16] = vaddq_s16(a[19], a[16]); - b[17] = vaddq_s16(a[18], a[17]); - b[18] = vsubq_s16(a[17], a[18]); - b[19] = vsubq_s16(a[16], a[19]); - b[20] = vsubq_s16(a[23], a[20]); - b[21] = vsubq_s16(a[22], a[21]); - b[22] = vaddq_s16(a[21], a[22]); - b[23] = vaddq_s16(a[20], a[23]); - b[24] = vaddq_s16(a[27], a[24]); - b[25] = vaddq_s16(a[26], a[25]); - b[26] = vsubq_s16(a[25], a[26]); - b[27] = vsubq_s16(a[24], a[27]); - b[28] = vsubq_s16(a[31], a[28]); - b[29] = vsubq_s16(a[30], a[29]); - b[30] = vaddq_s16(a[29], a[30]); - b[31] = vaddq_s16(a[28], a[31]); - - // Stage 6. - a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - - butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); - butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); - - a[8] = vaddq_s16(b[8], b[9]); - a[9] = vsubq_s16(b[8], b[9]); - a[10] = vsubq_s16(b[11], b[10]); - a[11] = vaddq_s16(b[11], b[10]); - a[12] = vaddq_s16(b[12], b[13]); - a[13] = vsubq_s16(b[12], b[13]); - a[14] = vsubq_s16(b[15], b[14]); - a[15] = vaddq_s16(b[15], b[14]); - - a[16] = b[16]; - a[19] = b[19]; - a[20] = b[20]; - a[23] = b[23]; - a[24] = b[24]; - a[27] = b[27]; - a[28] = b[28]; - a[31] = b[31]; - - butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); - butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); - - butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); - butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); - - // Stage 7. 
- b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - b[4] = a[4]; - b[5] = a[5]; - b[6] = a[6]; - b[7] = a[7]; - - butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); - butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); - butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); - butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); - - b[16] = vaddq_s16(a[16], a[17]); - b[17] = vsubq_s16(a[16], a[17]); - b[18] = vsubq_s16(a[19], a[18]); - b[19] = vaddq_s16(a[19], a[18]); - b[20] = vaddq_s16(a[20], a[21]); - b[21] = vsubq_s16(a[20], a[21]); - b[22] = vsubq_s16(a[23], a[22]); - b[23] = vaddq_s16(a[23], a[22]); - b[24] = vaddq_s16(a[24], a[25]); - b[25] = vsubq_s16(a[24], a[25]); - b[26] = vsubq_s16(a[27], a[26]); - b[27] = vaddq_s16(a[27], a[26]); - b[28] = vaddq_s16(a[28], a[29]); - b[29] = vsubq_s16(a[28], a[29]); - b[30] = vsubq_s16(a[31], a[30]); - b[31] = vaddq_s16(a[31], a[30]); - - // Final stage. - // Also compute partial rounding shift: - // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - out[0] = sub_round_shift(b[0]); - out[16] = sub_round_shift(b[1]); - out[8] = sub_round_shift(b[2]); - out[24] = sub_round_shift(b[3]); - out[4] = sub_round_shift(b[4]); - out[20] = sub_round_shift(b[5]); - out[12] = sub_round_shift(b[6]); - out[28] = sub_round_shift(b[7]); - out[2] = sub_round_shift(b[8]); - out[18] = sub_round_shift(b[9]); - out[10] = sub_round_shift(b[10]); - out[26] = sub_round_shift(b[11]); - out[6] = sub_round_shift(b[12]); - out[22] = sub_round_shift(b[13]); - out[14] = sub_round_shift(b[14]); - out[30] = sub_round_shift(b[15]); - - butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); - out[1] = sub_round_shift(a[1]); - out[31] = sub_round_shift(a[31]); - - butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); - out[17] = sub_round_shift(a[17]); - out[15] = sub_round_shift(a[15]); - - butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); - out[9] = sub_round_shift(a[9]); - out[23] = sub_round_shift(a[23]); - - butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); - out[25] = sub_round_shift(a[25]); - out[7] = sub_round_shift(a[7]); - - butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); - out[5] = sub_round_shift(a[5]); - out[27] = sub_round_shift(a[27]); - - butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); - out[21] = sub_round_shift(a[21]); - out[11] = sub_round_shift(a[11]); - - butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); - out[13] = sub_round_shift(a[13]); - out[19] = sub_round_shift(a[19]); - - butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); - out[29] = sub_round_shift(a[29]); - out[3] = sub_round_shift(a[3]); -} - -#define PASS_THROUGH(src, dst, element) \ - do { \ - dst##_lo[element] = src##_lo[element]; \ - dst##_hi[element] = src##_hi[element]; \ - } while (0) - -#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = 
vsubl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ - do { \ - c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ - c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ - } while (0) - -#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ - do { \ - temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ - temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ - c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ - c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ - } while (0) - -#define ADD_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -#define SUB_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ - add_index, sub_index) \ - do { \ - butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ - &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ - sub_index) \ - do { \ - butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - constant, &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ - right_constant, b, add_index, sub_index) \ - do { \ - butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - left_constant, right_constant, &b##_lo[add_index], \ - &b##_hi[add_index], &b##_lo[sub_index], \ - &b##_hi[sub_index]); \ - } while (0) - -static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - int32x4_t c_lo[32]; - int32x4_t c_hi[32]; - int32x4_t d_lo[32]; - int32x4_t d_hi[32]; - - // Stage 1. Done as part of the load for the first pass. 
- a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); - - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - - b[16] = a[16]; - b[17] = a[17]; - b[18] = a[18]; - b[19] = a[19]; - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - - b[28] = a[28]; - b[29] = a[29]; - b[30] = a[30]; - b[31] = a[31]; - - // Stage 3. With extreme values for input this calculation rolls over int16_t. - // The sources for b[0] get added multiple times and, through testing, have - // been shown to overflow starting here. - ADD_S16_S32(b, 0, 7, c, 0); - ADD_S16_S32(b, 1, 6, c, 1); - ADD_S16_S32(b, 2, 5, c, 2); - ADD_S16_S32(b, 3, 4, c, 3); - SUB_S16_S32(b, 3, 4, c, 4); - SUB_S16_S32(b, 2, 5, c, 5); - SUB_S16_S32(b, 1, 6, c, 6); - SUB_S16_S32(b, 0, 7, c, 7); - - a[8] = b[8]; - a[9] = b[9]; - - BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); - BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); - - a[14] = b[14]; - a[15] = b[15]; - - ADD_S16_S32(b, 16, 23, c, 16); - ADD_S16_S32(b, 17, 22, c, 17); - ADD_S16_S32(b, 18, 21, c, 18); - ADD_S16_S32(b, 19, 20, c, 19); - SUB_S16_S32(b, 19, 20, c, 20); - SUB_S16_S32(b, 18, 21, c, 21); - SUB_S16_S32(b, 17, 22, c, 22); - SUB_S16_S32(b, 16, 23, c, 23); - SUB_S16_S32(b, 31, 24, c, 24); - SUB_S16_S32(b, 30, 25, c, 25); - SUB_S16_S32(b, 29, 26, c, 26); - SUB_S16_S32(b, 28, 27, c, 27); - ADD_S16_S32(b, 28, 27, c, 28); - ADD_S16_S32(b, 29, 26, c, 29); - ADD_S16_S32(b, 30, 25, c, 30); - ADD_S16_S32(b, 31, 24, c, 31); - - // Stage 4. 
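
The stage 3 comment above is the key to this function's structure: with worst-case inputs the accumulated sums no longer fit in int16_t, so from this point on the second pass carries separate low/high int32x4_t halves. Per lane, the widening macros reduce to this sketch (illustrative name):

```c
#include <stdint.h>

/* Per-lane effect of ADD_S16_S32 / SUB_S16_S32: promote to 32 bits
 * before adding, instead of wrapping in 16-bit arithmetic.
 * (vaddl_s16 / vsubl_s16 do this four lanes at a time.) */
static void widen_add_sub_ref(int16_t a, int16_t b, int32_t *sum,
                              int32_t *diff) {
  *sum = (int32_t)a + (int32_t)b;
  *diff = (int32_t)a - (int32_t)b;
}
```
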
- ADD_S32(c, 0, 3, d, 0); - ADD_S32(c, 1, 2, d, 1); - SUB_S32(c, 1, 2, d, 2); - SUB_S32(c, 0, 3, d, 3); - - PASS_THROUGH(c, d, 4); - - BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); - - PASS_THROUGH(c, d, 7); - - ADDW_S16_S32(c, 11, a, 8, d, 8); - ADDW_S16_S32(c, 10, a, 9, d, 9); - SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); - SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); - SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); - SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); - ADDW_S16_S32(c, 13, b, 14, d, 14); - ADDW_S16_S32(c, 12, b, 15, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 17); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); - BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); - BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); - - PASS_THROUGH(c, d, 22); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 25); - - PASS_THROUGH(c, d, 30); - PASS_THROUGH(c, d, 31); - - // Stage 5. - BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); - BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); - - ADD_S32(d, 4, 5, c, 4); - SUB_S32(d, 4, 5, c, 5); - SUB_S32(d, 7, 6, c, 6); - ADD_S32(d, 7, 6, c, 7); - - PASS_THROUGH(d, c, 8); - - BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); - BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); - - PASS_THROUGH(d, c, 11); - PASS_THROUGH(d, c, 12); - PASS_THROUGH(d, c, 15); - - ADD_S32(d, 16, 19, c, 16); - ADD_S32(d, 17, 18, c, 17); - SUB_S32(d, 17, 18, c, 18); - SUB_S32(d, 16, 19, c, 19); - SUB_S32(d, 23, 20, c, 20); - SUB_S32(d, 22, 21, c, 21); - ADD_S32(d, 22, 21, c, 22); - ADD_S32(d, 23, 20, c, 23); - ADD_S32(d, 24, 27, c, 24); - ADD_S32(d, 25, 26, c, 25); - SUB_S32(d, 25, 26, c, 26); - SUB_S32(d, 24, 27, c, 27); - SUB_S32(d, 31, 28, c, 28); - SUB_S32(d, 30, 29, c, 29); - ADD_S32(d, 30, 29, c, 30); - ADD_S32(d, 31, 28, c, 31); - - // Stage 6. - PASS_THROUGH(c, d, 0); - PASS_THROUGH(c, d, 1); - PASS_THROUGH(c, d, 2); - PASS_THROUGH(c, d, 3); - - BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); - BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); - - ADD_S32(c, 8, 9, d, 8); - SUB_S32(c, 8, 9, d, 9); - SUB_S32(c, 11, 10, d, 10); - ADD_S32(c, 11, 10, d, 11); - ADD_S32(c, 12, 13, d, 12); - SUB_S32(c, 12, 13, d, 13); - SUB_S32(c, 15, 14, d, 14); - ADD_S32(c, 15, 14, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 19); - PASS_THROUGH(c, d, 20); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 27); - PASS_THROUGH(c, d, 28); - PASS_THROUGH(c, d, 31); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); - BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); - BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); - - // Stage 7. 
- PASS_THROUGH(d, c, 0); - PASS_THROUGH(d, c, 1); - PASS_THROUGH(d, c, 2); - PASS_THROUGH(d, c, 3); - PASS_THROUGH(d, c, 4); - PASS_THROUGH(d, c, 5); - PASS_THROUGH(d, c, 6); - PASS_THROUGH(d, c, 7); - - BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); - BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); - BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); - BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); - - ADD_S32(d, 16, 17, c, 16); - SUB_S32(d, 16, 17, c, 17); - SUB_S32(d, 19, 18, c, 18); - ADD_S32(d, 19, 18, c, 19); - ADD_S32(d, 20, 21, c, 20); - SUB_S32(d, 20, 21, c, 21); - SUB_S32(d, 23, 22, c, 22); - ADD_S32(d, 23, 22, c, 23); - ADD_S32(d, 24, 25, c, 24); - SUB_S32(d, 24, 25, c, 25); - SUB_S32(d, 27, 26, c, 26); - ADD_S32(d, 27, 26, c, 27); - ADD_S32(d, 28, 29, c, 28); - SUB_S32(d, 28, 29, c, 29); - SUB_S32(d, 31, 30, c, 30); - ADD_S32(d, 31, 30, c, 31); - - // Final stage. - // Roll rounding into this function so we can pass back int16x8. - - out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); - out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); - - out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); - out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); - out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); - out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); - out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); - - out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); - out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); - out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); - out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); - - out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); - out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); - out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); - out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); - out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); - - BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); - out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); - out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); - out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); - out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); - out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); - out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); - - BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); - out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); - out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); - - BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); - out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); - out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); - - BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); - out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); - out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); - - BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); - out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); - out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); - - BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); - out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); - out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); -} - -static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1. Done as part of the load for the first pass. 
- a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. - // For the "rd" version, all the values are rounded down after stage 2 to keep - // the values in 16 bits. - b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); - b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); - b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); - b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); - b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); - b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); - b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); - b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); - - b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); - b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); - b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); - b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); - b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); - b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); - b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); - b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); - - b[16] = add_round_shift_s16(a[16]); - b[17] = add_round_shift_s16(a[17]); - b[18] = add_round_shift_s16(a[18]); - b[19] = add_round_shift_s16(a[19]); - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - b[20] = add_round_shift_s16(b[20]); - b[21] = add_round_shift_s16(b[21]); - b[22] = add_round_shift_s16(b[22]); - b[23] = add_round_shift_s16(b[23]); - b[24] = add_round_shift_s16(b[24]); - b[25] = add_round_shift_s16(b[25]); - b[26] = add_round_shift_s16(b[26]); - b[27] = add_round_shift_s16(b[27]); - - b[28] = add_round_shift_s16(a[28]); - b[29] = add_round_shift_s16(a[29]); - b[30] = add_round_shift_s16(a[30]); - b[31] = add_round_shift_s16(a[31]); - - // Stage 3. 
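
The "rounded down after stage 2" step in this rd variant is what keeps the whole second pass in 16 bits. A scalar sketch of add_round_shift_s16, assuming it mirrors half_round_shift() in the scalar vpx_fdct32 reference (vpx_dsp/fwd_txfm.c), which is worth verifying against that file:

```c
#include <stdint.h>

/* Assumed scalar equivalent of add_round_shift_s16:
 * (a + 1 + (a < 0)) >> 2, applied to every value after stage 2. */
static int16_t add_round_shift_ref(int16_t a) {
  return (int16_t)((a + 1 + (a < 0)) >> 2);
}
```
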
- a[0] = vaddq_s16(b[0], b[7]); - a[1] = vaddq_s16(b[1], b[6]); - a[2] = vaddq_s16(b[2], b[5]); - a[3] = vaddq_s16(b[3], b[4]); - - a[4] = vsubq_s16(b[3], b[4]); - a[5] = vsubq_s16(b[2], b[5]); - a[6] = vsubq_s16(b[1], b[6]); - a[7] = vsubq_s16(b[0], b[7]); - - a[8] = b[8]; - a[9] = b[9]; - - butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); - butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); - - a[14] = b[14]; - a[15] = b[15]; - - a[16] = vaddq_s16(b[16], b[23]); - a[17] = vaddq_s16(b[17], b[22]); - a[18] = vaddq_s16(b[18], b[21]); - a[19] = vaddq_s16(b[19], b[20]); - - a[20] = vsubq_s16(b[19], b[20]); - a[21] = vsubq_s16(b[18], b[21]); - a[22] = vsubq_s16(b[17], b[22]); - a[23] = vsubq_s16(b[16], b[23]); - - a[24] = vsubq_s16(b[31], b[24]); - a[25] = vsubq_s16(b[30], b[25]); - a[26] = vsubq_s16(b[29], b[26]); - a[27] = vsubq_s16(b[28], b[27]); - - a[28] = vaddq_s16(b[28], b[27]); - a[29] = vaddq_s16(b[29], b[26]); - a[30] = vaddq_s16(b[30], b[25]); - a[31] = vaddq_s16(b[31], b[24]); - - // Stage 4. - b[0] = vaddq_s16(a[0], a[3]); - b[1] = vaddq_s16(a[1], a[2]); - b[2] = vsubq_s16(a[1], a[2]); - b[3] = vsubq_s16(a[0], a[3]); - - b[4] = a[4]; - - butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); - - b[7] = a[7]; - - b[8] = vaddq_s16(a[8], a[11]); - b[9] = vaddq_s16(a[9], a[10]); - b[10] = vsubq_s16(a[9], a[10]); - b[11] = vsubq_s16(a[8], a[11]); - b[12] = vsubq_s16(a[15], a[12]); - b[13] = vsubq_s16(a[14], a[13]); - b[14] = vaddq_s16(a[14], a[13]); - b[15] = vaddq_s16(a[15], a[12]); - - b[16] = a[16]; - b[17] = a[17]; - - butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); - butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); - butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); - butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); - - b[22] = a[22]; - b[23] = a[23]; - b[24] = a[24]; - b[25] = a[25]; - - b[30] = a[30]; - b[31] = a[31]; - - // Stage 5. - butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); - butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); - - a[4] = vaddq_s16(b[4], b[5]); - a[5] = vsubq_s16(b[4], b[5]); - a[6] = vsubq_s16(b[7], b[6]); - a[7] = vaddq_s16(b[7], b[6]); - - a[8] = b[8]; - - butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); - butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); - - a[11] = b[11]; - a[12] = b[12]; - - a[15] = b[15]; - - a[16] = vaddq_s16(b[19], b[16]); - a[17] = vaddq_s16(b[18], b[17]); - a[18] = vsubq_s16(b[17], b[18]); - a[19] = vsubq_s16(b[16], b[19]); - a[20] = vsubq_s16(b[23], b[20]); - a[21] = vsubq_s16(b[22], b[21]); - a[22] = vaddq_s16(b[21], b[22]); - a[23] = vaddq_s16(b[20], b[23]); - a[24] = vaddq_s16(b[27], b[24]); - a[25] = vaddq_s16(b[26], b[25]); - a[26] = vsubq_s16(b[25], b[26]); - a[27] = vsubq_s16(b[24], b[27]); - a[28] = vsubq_s16(b[31], b[28]); - a[29] = vsubq_s16(b[30], b[29]); - a[30] = vaddq_s16(b[29], b[30]); - a[31] = vaddq_s16(b[28], b[31]); - - // Stage 6. 
- b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - - butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); - butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); - - b[8] = vaddq_s16(a[8], a[9]); - b[9] = vsubq_s16(a[8], a[9]); - b[10] = vsubq_s16(a[11], a[10]); - b[11] = vaddq_s16(a[11], a[10]); - b[12] = vaddq_s16(a[12], a[13]); - b[13] = vsubq_s16(a[12], a[13]); - b[14] = vsubq_s16(a[15], a[14]); - b[15] = vaddq_s16(a[15], a[14]); - - b[16] = a[16]; - b[19] = a[19]; - b[20] = a[20]; - b[23] = a[23]; - b[24] = a[24]; - b[27] = a[27]; - b[28] = a[28]; - b[31] = a[31]; - - butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); - butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); - - butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); - butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); - - // Stage 7. - a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - a[4] = b[4]; - a[5] = b[5]; - a[6] = b[6]; - a[7] = b[7]; - - butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); - butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); - butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); - butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); - - a[16] = vaddq_s16(b[16], b[17]); - a[17] = vsubq_s16(b[16], b[17]); - a[18] = vsubq_s16(b[19], b[18]); - a[19] = vaddq_s16(b[19], b[18]); - a[20] = vaddq_s16(b[20], b[21]); - a[21] = vsubq_s16(b[20], b[21]); - a[22] = vsubq_s16(b[23], b[22]); - a[23] = vaddq_s16(b[23], b[22]); - a[24] = vaddq_s16(b[24], b[25]); - a[25] = vsubq_s16(b[24], b[25]); - a[26] = vsubq_s16(b[27], b[26]); - a[27] = vaddq_s16(b[27], b[26]); - a[28] = vaddq_s16(b[28], b[29]); - a[29] = vsubq_s16(b[28], b[29]); - a[30] = vsubq_s16(b[31], b[30]); - a[31] = vaddq_s16(b[31], b[30]); - - // Final stage. - out[0] = a[0]; - out[16] = a[1]; - out[8] = a[2]; - out[24] = a[3]; - out[4] = a[4]; - out[20] = a[5]; - out[12] = a[6]; - out[28] = a[7]; - out[2] = a[8]; - out[18] = a[9]; - out[10] = a[10]; - out[26] = a[11]; - out[6] = a[12]; - out[22] = a[13]; - out[14] = a[14]; - out[30] = a[15]; - - butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); - butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], - &out[15]); - butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); - butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); - butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); - butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], - &out[11]); - butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], - &out[19]); - butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); -} - -#undef PASS_THROUGH -#undef ADD_S16_S32 -#undef SUB_S16_S32 -#undef ADDW_S16_S32 -#undef SUBW_S16_S32 -#undef ADD_S32 -#undef SUB_S32 -#undef BUTTERFLY_ONE_S16_S32 -#undef BUTTERFLY_ONE_S32 -#undef BUTTERFLY_TWO_S32 - void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[32]; int16x8_t temp1[32]; @@ -1159,17 +43,21 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp5[32]; // Process in 8x32 columns. 
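+  // Stage 1 (the input cross) now happens in load_cross and the *4 input
+  // scaling in scale_input, so dct_body_first_pass picks up at stage 2.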
- load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. transpose_s16_8x8_new(&temp1[0], &temp0[0]); @@ -1254,17 +142,21 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int16x8_t temp5[32]; // Process in 8x32 columns. - load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. transpose_s16_8x8_new(&temp1[0], &temp0[0]); diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h new file mode 100644 index 000000000..dd647918b --- /dev/null +++ b/vpx_dsp/arm/fdct32x32_neon.h @@ -0,0 +1,1105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" + +// Load & cross the first 8 and last 8, then the middle +static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + + b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + + b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); + b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + + b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); +} + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0) + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. 
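+// That is, row r (0..7) of the 8x32 strip is written as b[r], b[r + 8],
+// b[r + 16], b[r + 24], so consecutive rows are contiguous in the output.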
+static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +static INLINE void scale_input(const int16x8_t *in /*32*/, + int16x8_t *out /*32*/) { + out[0] = vshlq_n_s16(in[0], 2); + out[1] = vshlq_n_s16(in[1], 2); + out[2] = vshlq_n_s16(in[2], 2); + out[3] = vshlq_n_s16(in[3], 2); + out[4] = vshlq_n_s16(in[4], 2); + out[5] = vshlq_n_s16(in[5], 2); + out[6] = vshlq_n_s16(in[6], 2); + out[7] = vshlq_n_s16(in[7], 2); + + out[8] = vshlq_n_s16(in[8], 2); + out[9] = vshlq_n_s16(in[9], 2); + out[10] = vshlq_n_s16(in[10], 2); + out[11] = vshlq_n_s16(in[11], 2); + out[12] = vshlq_n_s16(in[12], 2); + out[13] = vshlq_n_s16(in[13], 2); + out[14] = vshlq_n_s16(in[14], 2); + out[15] = vshlq_n_s16(in[15], 2); + + out[16] = vshlq_n_s16(in[16], 2); + out[17] = vshlq_n_s16(in[17], 2); + out[18] = vshlq_n_s16(in[18], 2); + out[19] = vshlq_n_s16(in[19], 2); + out[20] = vshlq_n_s16(in[20], 2); + out[21] = vshlq_n_s16(in[21], 2); + out[22] = vshlq_n_s16(in[22], 2); + out[23] = vshlq_n_s16(in[23], 2); + + out[24] = vshlq_n_s16(in[24], 2); + out[25] = vshlq_n_s16(in[25], 2); + out[26] = vshlq_n_s16(in[26], 2); + out[27] = vshlq_n_s16(in[27], 2); + out[28] = vshlq_n_s16(in[28], 2); + out[29] = vshlq_n_s16(in[29], 2); + out[30] = vshlq_n_s16(in[30], 2); + out[31] = vshlq_n_s16(in[31], 2); +} + +static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27], + &a[20]); + butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26], + &a[21]); + butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25], + &a[22]); + butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24], + &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. 
+ b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. + butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. 
+ a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. 
+ // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift_s16(b[0]); + out[16] = sub_round_shift_s16(b[1]); + out[8] = sub_round_shift_s16(b[2]); + out[24] = sub_round_shift_s16(b[3]); + out[4] = sub_round_shift_s16(b[4]); + out[20] = sub_round_shift_s16(b[5]); + out[12] = sub_round_shift_s16(b[6]); + out[28] = sub_round_shift_s16(b[7]); + out[2] = sub_round_shift_s16(b[8]); + out[18] = sub_round_shift_s16(b[9]); + out[10] = sub_round_shift_s16(b[10]); + out[26] = sub_round_shift_s16(b[11]); + out[6] = sub_round_shift_s16(b[12]); + out[22] = sub_round_shift_s16(b[13]); + out[14] = sub_round_shift_s16(b[14]); + out[30] = sub_round_shift_s16(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]); + out[1] = sub_round_shift_s16(a[1]); + out[31] = sub_round_shift_s16(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]); + out[17] = sub_round_shift_s16(a[17]); + out[15] = sub_round_shift_s16(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]); + out[9] = sub_round_shift_s16(a[9]); + out[23] = sub_round_shift_s16(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]); + out[25] = sub_round_shift_s16(a[25]); + out[7] = sub_round_shift_s16(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]); + out[5] = sub_round_shift_s16(a[5]); + out[27] = sub_round_shift_s16(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]); + out[21] = sub_round_shift_s16(a[21]); + out[11] = sub_round_shift_s16(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]); + out[13] = sub_round_shift_s16(a[13]); + out[19] = sub_round_shift_s16(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]); + out[29] = sub_round_shift_s16(a[29]); + out[3] = sub_round_shift_s16(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], 
a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32_fast( \ + a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \ + a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. 
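+  // Stages 1 and 2 still fit in int16_t. From stage 3 onward the macros
+  // above widen to 32 bits; the _lo/_hi arrays hold the widened low and
+  // high halves of each 16x8 vector.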
+ b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. + ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. 
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22); + + // Stage 7. + PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. 
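+  // i.e. roughly out = (v + 1 + (v < 0)) >> 2, matching the second-pass
+  // rounding of the scalar vpx_fdct32x32_c().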
+ + out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31); + out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15); + out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23); + out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7); + out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27); + out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11); + out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19); + out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3); + out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]); +} + +static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. 
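+  // Here the input is the transposed first-pass output rather than a fresh
+  // load, so the stage 1 cross has to be applied explicitly.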
+ a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. + b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. 
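+  // After the stage 2 round-down everything fits in int16_t again, so the
+  // remaining stages can stay in 16 bits. add_round_shift_s16() appears to
+  // mirror the scalar half_round_shift(): (a + 1 + (a < 0)) >> 2.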
+ a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. + b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. 
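+  // Calls passing a negated constant (e.g. cospi_28_64, -cospi_4_64) fold
+  // the -cospi term of the scalar butterfly into the helper rather than
+  // negating the result afterwards.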
+ b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]); + + // Stage 7. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. 
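+  // No extra shift here: the rd path already rounded down after stage 2,
+  // so results go straight out (odd indices through one last butterfly).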
+ out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c index 11df7292d..3b9196fae 100644 --- a/vpx_dsp/arm/fdct4x4_neon.c +++ b/vpx_dsp/arm/fdct4x4_neon.c @@ -18,10 +18,10 @@ #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; // input[M * stride] * 16 int16x4_t in[4]; in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4); @@ -34,9 +34,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); in[0] = vadd_s16(in[0], one); } - for (i = 0; i < 2; ++i) { - vpx_fdct4x4_pass1_neon(in); - } + vpx_fdct4x4_pass1_neon(in); + vpx_fdct4x4_pass2_neon(in); { // Not quite a rounding shift. Only add 1 despite shifting by 2. const int16x8_t one = vdupq_n_s16(1); @@ -53,7 +52,6 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; static const int32x4_t const_1000 = { 1, 0, 0, 0 }; const int32x4_t const_one = vdupq_n_s32(1); @@ -69,9 +67,8 @@ void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, in[0] = vaddq_s32(in[0], const_1000); } - for (i = 0; i < 2; ++i) { - vpx_highbd_fdct4x4_pass1_neon(in); - } + vpx_highbd_fdct4x4_pass1_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); { // Not quite a rounding shift. Only add 1 despite shifting by 2. in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2); diff --git a/vpx_dsp/arm/fdct4x4_neon.h b/vpx_dsp/arm/fdct4x4_neon.h new file mode 100644 index 000000000..de3db9774 --- /dev/null +++ b/vpx_dsp/arm/fdct4x4_neon.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ + +#include <arm_neon.h> + +static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0], + &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { + int32x4_t out[4]; + // in_0 +/- in_3, in_1 +/- in_2 + const int32x4_t s_0 = vaddq_s32(in[0], in[3]); + const int32x4_t s_1 = vaddq_s32(in[1], in[2]); + const int32x4_t s_2 = vsubq_s32(in[1], in[2]); + const int32x4_t s_3 = vsubq_s32(in[0], in[3]); + + butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 + // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64, + &out[1], &out[3]); + + transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c index 3fb15cc17..75ee6f223 100644 --- a/vpx_dsp/arm/fdct8x8_neon.c +++ b/vpx_dsp/arm/fdct8x8_neon.c @@ -17,10 +17,10 @@ #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; // stage 1 int16x8_t in[8]; in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); @@ -31,9 +31,9 @@ void vpx_fdct8x8_neon(const int16_t *input, 
tran_low_t *final_output, in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); - for (i = 0; i < 2; ++i) { - vpx_fdct8x8_pass1_neon(in); - } // for + + vpx_fdct8x8_pass1_neon(in); + vpx_fdct8x8_pass2_neon(in); { // from vpx_dct_sse2.c // Post-condition (division by two) @@ -71,8 +71,6 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; - // input[M * stride] * 16 int32x4_t left[8], right[8]; int16x8_t in[8]; @@ -102,26 +100,25 @@ void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); - for (i = 0; i < 2; ++i) { - vpx_highbd_fdct8x8_pass1_neon(left, right); - } + vpx_highbd_fdct8x8_pass1_neon(left, right); + vpx_highbd_fdct8x8_pass2_neon(left, right); { - left[0] = highbd_add_round_shift_s32(left[0]); - left[1] = highbd_add_round_shift_s32(left[1]); - left[2] = highbd_add_round_shift_s32(left[2]); - left[3] = highbd_add_round_shift_s32(left[3]); - left[4] = highbd_add_round_shift_s32(left[4]); - left[5] = highbd_add_round_shift_s32(left[5]); - left[6] = highbd_add_round_shift_s32(left[6]); - left[7] = highbd_add_round_shift_s32(left[7]); - right[0] = highbd_add_round_shift_s32(right[0]); - right[1] = highbd_add_round_shift_s32(right[1]); - right[2] = highbd_add_round_shift_s32(right[2]); - right[3] = highbd_add_round_shift_s32(right[3]); - right[4] = highbd_add_round_shift_s32(right[4]); - right[5] = highbd_add_round_shift_s32(right[5]); - right[6] = highbd_add_round_shift_s32(right[6]); - right[7] = highbd_add_round_shift_s32(right[7]); + left[0] = add_round_shift_half_s32(left[0]); + left[1] = add_round_shift_half_s32(left[1]); + left[2] = add_round_shift_half_s32(left[2]); + left[3] = add_round_shift_half_s32(left[3]); + left[4] = add_round_shift_half_s32(left[4]); + left[5] = add_round_shift_half_s32(left[5]); + left[6] = add_round_shift_half_s32(left[6]); + left[7] = add_round_shift_half_s32(left[7]); + right[0] = add_round_shift_half_s32(right[0]); + right[1] = add_round_shift_half_s32(right[1]); + right[2] = add_round_shift_half_s32(right[2]); + right[3] = add_round_shift_half_s32(right[3]); + right[4] = add_round_shift_half_s32(right[4]); + right[5] = add_round_shift_half_s32(right[5]); + right[6] = add_round_shift_half_s32(right[6]); + right[7] = add_round_shift_half_s32(right[7]); // store results vst1q_s32(final_output, left[0]); diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h new file mode 100644 index 000000000..d8fa60044 --- /dev/null +++ b/vpx_dsp/arm/fdct8x8_neon.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ + +#include <arm_neon.h> + +static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1], + &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = 
(tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass1_notranspose_neon(in, out);
+  // transpose 8x8
+  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+  in[4] = out[4];
+  in[5] = out[5];
+  in[6] = out[6];
+  in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass2_notranspose_neon(in, out);
+  // transpose 8x8
+  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+  in[4] = out[4];
+  in[5] = out[5];
+  in[6] = out[6];
+  in[7] = out[7];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+                                                             int32x4_t *right) {
+  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // fdct4(step, step);
+  // x0 = s0 + s3;
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  // x1 = s1 + s2;
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  // x2 = s1 - s2;
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  // x3 = s0 - s3;
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[4], &right[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+                          &left[2], &right[2], &left[6], &right[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+                               &tr[1], &tl[0], &tr[0]);
+
+  // Stage 3
+  xl[0] = vaddq_s32(sl[4], tl[0]);
+  xr[0] = vaddq_s32(sr[4], tr[0]);
+  xl[1] = vsubq_s32(sl[4], tl[0]);
+  xr[1] = vsubq_s32(sr[4], tr[0]);
+  xl[2] = vsubq_s32(sl[7], tl[1]);
+  xr[2] = vsubq_s32(sr[7], tr[1]);
+  xl[3] = vaddq_s32(sl[7], tl[1]);
+  xr[3] = vaddq_s32(sr[7], tr[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+                          &left[1], &right[1], &left[7],
+                          &right[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+                          &left[5], &right[5], &left[3], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+                                                             int32x4_t *right) {
+  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // fdct4(step, step);
+  // x0 = s0 + s3;
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  // x1 = s1 + s2;
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  // x2 = s1 - s2;
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  // x3 = s0 - s3;
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[4], &right[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+                                     cospi_24_64, &left[2], &right[2], &left[6],
+                                     &right[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+                               &tr[1], &tl[0], &tr[0]);
+
+  // Stage 3
+  xl[0] = vaddq_s32(sl[4], tl[0]);
+  xr[0] = vaddq_s32(sr[4], tr[0]);
+  xl[1] = vsubq_s32(sl[4], tl[0]);
+  xr[1] = vsubq_s32(sr[4], tr[0]);
+  xl[2] = vsubq_s32(sl[7], tl[1]);
+  xr[2] = vsubq_s32(sr[7], tr[1]);
+  xl[3] = vaddq_s32(sl[7], tl[1]);
+  xr[3] = vaddq_s32(sr[7], tr[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+                                     cospi_28_64, &left[1], &right[1], &left[7],
+                                     &right[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+                                     cospi_12_64, &left[5], &right[5], &left[3],
+                                     &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+                                                 int32x4_t *right) {
+  int32x4x2_t out[8];
+  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  left[0] = out[0].val[0];
+  right[0] = out[0].val[1];
+  left[1] = out[1].val[0];
+  right[1] = out[1].val[1];
+  left[2] = out[2].val[0];
+  right[2] = out[2].val[1];
+  left[3] = out[3].val[0];
+  right[3] = out[3].val[1];
+  left[4] = out[4].val[0];
+  right[4] = out[4].val[1];
+  left[5] = out[5].val[0];
+  right[5] = out[5].val[1];
+  left[6] = out[6].val[0];
+  right[6] = out[6].val[1];
+  left[7] = out[7].val[0];
+  right[7] = out[7].val[1];
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+                                                 int32x4_t *right) {
+  int32x4x2_t out[8];
+  vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  left[0] = out[0].val[0];
+  right[0] = out[0].val[1];
+  left[1] = out[1].val[0];
+  right[1] = out[1].val[1];
+  left[2] = out[2].val[0];
+  right[2] = out[2].val[1];
+  left[3] = out[3].val[0];
+  right[3] = out[3].val[1];
+  left[4] = out[4].val[0];
+  right[4] = out[4].val[1];
+  left[5] = out[5].val[0];
+  right[5] = out[5].val[1];
+  left[6] = out[6].val[0];
+  right[6] = out[6].val[1];
+  left[7] = out[7].val[0];
+  right[7] = out[7].val[1];
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
index ce669061d..1ea948b3f 100644
--- a/vpx_dsp/arm/fdct_neon.h
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -14,56 +14,94 @@
 #include <arm_neon.h>
 
 // fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_high_t constant,
-                                       int16x8_t *add, int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
-  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
-  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
+// Variant that performs the fast vqrdmulh_s16 operation on a half vector;
+// can be slightly less accurate, adequate for pass1.
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+                                                     const int16x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int16x4_t *add,
+                                                     int16x4_t *sub) {
+  int16x4_t c = vdup_n_s16(2 * constant);
+  *add = vqrdmulh_s16(vadd_s16(a, b), c);
+  *sub = vqrdmulh_s16(vsub_s16(a, b), c);
 }
 
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_coef_t constant0,
-                                       const tran_coef_t constant1,
-                                       int16x8_t *add, int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
-  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
-  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
-  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
-  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs the fast vqrdmulhq_s16 operation on a full vector;
+// can be slightly less accurate, adequate for pass1.
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+                                                const int16x8_t b,
+                                                const tran_coef_t constant,
+                                                int16x8_t *add,
+                                                int16x8_t *sub) {
+  int16x8_t c = vdupq_n_s16(2 * constant);
+  *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+  *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
 }
 
-// Add 2 if positive, 1 if negative, and shift by 2.
-// In practice, subtract the sign bit, then shift with rounding.
-static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
-  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
-  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
-  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
-  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs the fast vqrdmulhq_s32 operation on a full vector;
+// more accurate: does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low.
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+    int32x4_t *sub_hi) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  const int16x4_t a_lo = vget_low_s16(a);
+  const int16x4_t a_hi = vget_high_s16(a);
+  const int16x4_t b_lo = vget_low_s16(b);
+  const int16x4_t b_hi = vget_high_s16(b);
+  *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs the fast vqrdmulhq_s32 operation on a full vector;
+// more accurate: does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values.
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+  butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+                                   &sub_hi);
+  *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+  *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
 }
 
-// Like butterfly_one_coeff, but don't narrow results.
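[Editor's note] The fast variants above hinge on one identity: VQRDMULH computes (2*a*b + (1 << 15)) >> 16 on 16-bit lanes and (2*a*b + (1 << 31)) >> 32 on 32-bit lanes, so passing 2 * constant (16-bit) or constant << 17 (32-bit) reproduces fdct_round_shift's (x * c + (1 << 13)) >> 14. The sketch below is a minimal scalar model of the 32-bit case, not the NEON code itself; it assumes DCT_CONST_BITS = 14 and cospi_16_64 = 11585 as defined in vpx_dsp/txfm_common.h, and omits the intrinsic's saturation.

#include <assert.h>
#include <stdint.h>

#define DCT_CONST_BITS 14
#define COSPI_16_64 11585 /* cospi_16_64 from vpx_dsp/txfm_common.h */

/* Reference behaviour of fdct_round_shift(): round, then shift by 14. */
static int32_t fdct_round_shift_ref(int64_t x) {
  return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* Scalar model of vqrdmulhq_s32 (saturation omitted for brevity). */
static int32_t qrdmulh_s32_model(int32_t a, int32_t b) {
  return (int32_t)((2 * (int64_t)a * b + (1LL << 31)) >> 32);
}

int main(void) {
  int32_t x;
  for (x = -(1 << 20); x <= (1 << 20); x += 997) {
    /* 2 * x * (c << 17) >> 32 reduces to (x * c + (1 << 13)) >> 14. */
    const int32_t fast = qrdmulh_s32_model(x, COSPI_16_64 << 17);
    const int32_t ref = fdct_round_shift_ref((int64_t)x * COSPI_16_64);
    assert(fast == ref);
  }
  return 0;
}

The shifted constant stays in range because every cospi constant is below 1 << 14, so constant << 17 still fits a signed 32-bit lane; the "slightly less accurate" caveat on the s16 variants comes from wrap-around in the non-saturating adds and from VQRDMULH saturating, not from the rounding itself.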
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs the fast vqrdmulhq_s32 operation on a half vector;
+// more accurate: does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values.
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int32x4_t *add, int32x4_t *sub) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+  *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs the fast vqrdmulhq_s32 operation on a half vector;
+// more accurate: does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values.
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int16x4_t *add, int16x4_t *sub) {
+  int32x4_t add32, sub32;
+  butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+  *add = vmovn_s32(add32);
+  *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original variant that performs the normal implementation on a full vector;
+// fully accurate: does 32-bit processing, takes 16-bit values.
 static INLINE void butterfly_one_coeff_s16_s32(
-    const int16x8_t a, const int16x8_t b, const tran_high_t constant,
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
     int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
     int32x4_t *sub_hi) {
   const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
@@ -78,37 +116,182 @@ static INLINE void butterfly_one_coeff_s16_s32(
   *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
 }
 
-// Like butterfly_one_coeff, but with s32.
-static INLINE void butterfly_one_coeff_s32(
+// fdct_round_shift((a +/- b) * c)
+// Original variant that performs the normal implementation on a full vector;
+// fully accurate: does 32-bit processing, takes 16-bit values,
+// returns narrowed down 16-bit values.
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+  butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+                              &sub32_hi);
+  *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+  *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs the fast vqrdmulhq_s32 operation on a half vector;
+// more accurate: does 32-bit processing, takes and returns 32-bit values.
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+                                                     const int32x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int32x4_t *add,
+                                                     int32x4_t *sub) {
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+  *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs the fast vqrdmulhq_s32 operation on a full vector;
+// more accurate: does 32-bit processing, takes and returns 32-bit values,
+// high/low.
+static INLINE void butterfly_one_coeff_s32_fast(
     const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
-    const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
     int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
-  const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
-  const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
-  const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
-  const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
-  const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
-  const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
-  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
-  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
-  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
-  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs the normal implementation on a half vector;
+// more accurate: does 64-bit processing, takes 32-bit values,
+// returns narrowed 32-bit results.
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+    const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+  const int32x2_t a_lo = vget_low_s32(a);
+  const int32x2_t a_hi = vget_high_s32(a);
+  const int32x2_t b_lo = vget_low_s32(b);
+  const int32x2_t b_hi = vget_high_s32(b);
+
+  const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+  const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+  const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+  const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+  const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+  const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+  const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+  const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+  *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
 }
 
-// Like butterfly_two_coeff, but with s32.
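[Editor's note] Why the _s64 variants exist: by the high-bitdepth second pass the coefficients have grown well past 16 bits, and an 18-bit value times a 14-bit cospi constant no longer fits a signed 32-bit accumulator, so the products go through vmull_n_s32/vmlal_n_s32 into 64-bit lanes and come back via the rounding narrow vrshrn_n_s64. A scalar sketch of one lane (the magnitudes are illustrative assumptions, not exact bounds; cospi values taken from vpx_dsp/txfm_common.h):

#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14

/* One lane of butterfly_two_coeff_s32_s64_narrow:
   fdct_round_shift(a * c1 + b * c2), accumulated in 64 bits. */
static int32_t butterfly_sum_lane(int32_t a, int32_t b, int32_t c1,
                                  int32_t c2) {
  const int64_t sum = (int64_t)a * c1 + (int64_t)b * c2; /* vmull/vmlal_n_s32 */
  /* vrshrn_n_s64: round, shift right, narrow back to 32 bits. */
  return (int32_t)((sum + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

int main(void) {
  const int32_t cospi_8_64 = 15137, cospi_24_64 = 6270; /* txfm_common.h */
  const int32_t a = 1 << 18; /* plausible pass2 magnitude for 12-bit input */
  const int32_t b = 1 << 18;
  /* (1 << 18) * 15137 is ~4e9, already past INT32_MAX: hence the s64 path. */
  printf("%d\n", butterfly_sum_lane(a, b, cospi_24_64, cospi_8_64));
  return 0;
}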
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs the normal implementation on a full vector;
+// more accurate: does 64-bit processing, takes 32-bit values,
+// returns narrowed 32-bit results.
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // ac1/ac2 hold the following values:
+  // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+  //      vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+  // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+  //      vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+  int64x2_t ac1[4];
+  int64x2_t ac2[4];
+  int64x2_t sum[4];
+  int64x2_t diff[4];
+
+  ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+  ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+  ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+  ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+  ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+  ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+  ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+  ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+  sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+  sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+  sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+  sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+  *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+  *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+  diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+  diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+  diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+  diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+  *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+  *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs the normal implementation on a half vector;
+// fully accurate: does 32-bit processing, takes and returns narrowed
+// 16-bit values.
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+                                            const int16x4_t b,
+                                            const tran_coef_t constant1,
+                                            const tran_coef_t constant2,
+                                            int16x4_t *add, int16x4_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(a, constant1);
+  const int32x4_t a2 = vmull_n_s16(a, constant2);
+  const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+  const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+  *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+  *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original variant that performs the normal implementation on a full vector;
+// fully accurate: does 32-bit processing, takes and returns narrowed
+// 16-bit values.
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_coef_t constant1,
+                                       const tran_coef_t constant2,
+                                       int16x8_t *add, int16x8_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+  const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+  const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+  const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+  const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+  const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+  const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+  const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original variant that performs the normal implementation on a full vector;
+// fully accurate: does 32-bit processing, takes and returns 32-bit values.
 static INLINE void butterfly_two_coeff_s32(
     const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
-    const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
-    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
-    int32x4_t *sub_hi) {
-  const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
-  const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
-  const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
-  const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
-  const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
-  const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
-  const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
-  const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+  const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+  const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+  const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+  const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
   *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
   *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
   *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
   *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
@@ -126,9 +309,10 @@ static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
 }
 
 // Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
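[Editor's note] The sign-bit rounding helpers below read cryptically, but each comment pair is mechanically checkable: for sub_round_shift, vrshr #2 adds 2 before shifting, so subtracting the sign bit first turns that into +2 for non-negative inputs and +1 for negative ones. A plain-C model of the vector operations (not the NEON code itself) confirming the claim:

#include <assert.h>
#include <stdint.h>

/* Scalar model of vrshrq_n_s16(x, 2): rounding shift adds 1 << (2 - 1). */
static int16_t rshr2(int16_t x) { return (int16_t)((x + 2) >> 2); }

int main(void) {
  int a;
  for (a = -32000; a <= 32000; ++a) {
    const int sign = (a < 0); /* sign bit, as extracted by vshrq_n_u16(.., 15) */
    /* sub_round_shift: subtract the sign bit, then shift with rounding... */
    const int16_t got = rshr2((int16_t)(a - sign));
    /* ...which equals "add 2 if positive, 1 if negative, and shift by 2". */
    const int16_t want = (int16_t)((a + (a < 0 ? 1 : 2)) >> 2);
    assert(got == want);
  }
  return 0;
}

The add_round_shift helpers work the same way in the other direction: adding the sign bit before a non-rounding shift supplies the extra +1 that negative values need.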
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
-                                            const int32x4_t a_hi) {
+// In practice, add 1, then add the sign bit, then shift without rounding;
+// return narrowed results.
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+                                                   const int32x4_t a_hi) {
   const int32x4_t one = vdupq_n_s32(1);
   const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
   const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
@@ -143,419 +327,32 @@ static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
   return vcombine_s16(b_lo, b_hi);
 }
 
-static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
-  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
-  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
-
-  // in_0 +/- in_3, in_1 +/- in_2
-  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
-  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
-  // step_0 +/- step_1, step_2 +/- step_3
-  const int16x4_t s_0 = vget_low_s16(s_01);
-  const int16x4_t s_1 = vget_high_s16(s_01);
-  const int16x4_t s_2 = vget_high_s16(s_32);
-  const int16x4_t s_3 = vget_low_s16(s_32);
-
-  // (s_0 +/- s_1) * cospi_16_64
-  // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
-  const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
-  const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
-  const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
-  const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
-  // fdct_round_shift
-  int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
-  int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
-  // s_3 * cospi_8_64 + s_2 * cospi_24_64
-  // s_3 * cospi_24_64 - s_2 * cospi_8_64
-  const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
-  const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
-  const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
-  const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
-  // fdct_round_shift
-  int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
-  int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
-  transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
-  in[0] = out_0;
-  in[1] = out_1;
-  in[2] = out_2;
-  in[3] = out_3;
-}
-
-static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
-                                                      int16x8_t *out) {
-  const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
-  const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
-  const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
-  const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
-  const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
-  const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
-  const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
-  const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
-  // fdct4(step, step);
-  int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-  int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-  int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-  int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-  // fdct4(step, step);
-  int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-  int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-  int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-  int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-  int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
-  int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
-  int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
-  int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
-  v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
-  v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
-  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
-  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
-  v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
-  v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
-  v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
-  v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
-  {
-    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-    out[0] = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-    out[2] = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-    out[4] = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-    out[6] = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-  }
-  // Stage 2
-  v_x0 = vsubq_s16(v_s6, v_s5);
-  v_x1 = vaddq_s16(v_s6, v_s5);
-  v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
-  v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
-  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
-  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
-  {
-    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-    const int16x8_t ab = vcombine_s16(a, b);
-    const int16x8_t cd = vcombine_s16(c, d);
-    // Stage 3
-    v_x0 = vaddq_s16(v_s4, ab);
-    v_x1 = vsubq_s16(v_s4, ab);
-    v_x2 = vsubq_s16(v_s7, cd);
-    v_x3 = vaddq_s16(v_s7, cd);
-  }
-  // Stage 4
-  v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
-  v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
-  v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
-  v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
-  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
-  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
-  v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
-  v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
-  v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
-  v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
-  v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
-  v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
-  v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
-  v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
-  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
-  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
-  {
-    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-    out[1] = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-    out[3] = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-    out[5] = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-    out[7] = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-  }
-}
-
-static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
-  int16x8_t out[8];
-  vpx_fdct8x8_pass1_notranspose_neon(in, out);
-  // transpose 8x8
-  // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
-  // columns.
-  {
-    // 00 01 02 03 40 41 42 43
-    // 10 11 12 13 50 51 52 53
-    // 20 21 22 23 60 61 62 63
-    // 30 31 32 33 70 71 72 73
-    // 04 05 06 07 44 45 46 47
-    // 14 15 16 17 54 55 56 57
-    // 24 25 26 27 64 65 66 67
-    // 34 35 36 37 74 75 76 77
-    const int32x4x2_t r02_s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
-    const int32x4x2_t r13_s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
-    const int32x4x2_t r46_s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
-    const int32x4x2_t r57_s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
-    const int16x8x2_t r01_s16 =
-        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                  vreinterpretq_s16_s32(r13_s32.val[0]));
-    const int16x8x2_t r23_s16 =
-        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                  vreinterpretq_s16_s32(r13_s32.val[1]));
-    const int16x8x2_t r45_s16 =
-        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                  vreinterpretq_s16_s32(r57_s32.val[0]));
-    const int16x8x2_t r67_s16 =
-        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                  vreinterpretq_s16_s32(r57_s32.val[1]));
-    in[0] = r01_s16.val[0];
-    in[1] = r01_s16.val[1];
-    in[2] = r23_s16.val[0];
-    in[3] = r23_s16.val[1];
-    in[4] = r45_s16.val[0];
-    in[5] = r45_s16.val[1];
-    in[6] = r67_s16.val[0];
-    in[7] = r67_s16.val[1];
-    // 00 10 20 30 40 50 60 70
-    // 01 11 21 31 41 51 61 71
-    // 02 12 22 32 42 52 62 72
-    // 03 13 23 33 43 53 63 73
-    // 04 14 24 34 44 54 64 74
-    // 05 15 25 35 45 55 65 75
-    // 06 16 26 36 46 56 66 76
-    // 07 17 27 37 47 57 67 77
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) {
-  const int32x2_t x_lo = vget_low_s32(x);
-  const int32x2_t x_hi = vget_high_s32(x);
-  const int64x2_t x64_lo = vmovl_s32(x_lo);
-  const int64x2_t x64_hi = vmovl_s32(x_hi);
-
-  const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63);
-  const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63);
-
-  const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo);
-  const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi);
-  return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1));
-}
-
-static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a,
-                                                  const int32x4_t b,
-                                                  const tran_coef_t c,
-                                                  int32x4_t *add,
-                                                  int32x4_t *sub) {
-  const int32x2_t a_lo = vget_low_s32(a);
-  const int32x2_t a_hi = vget_high_s32(a);
-  const int32x2_t b_lo = vget_low_s32(b);
-  const int32x2_t b_hi = vget_high_s32(b);
-
-  const int64x2_t a64_lo = vmull_n_s32(a_lo, c);
-  const int64x2_t a64_hi = vmull_n_s32(a_hi, c);
-
-  const int64x2_t sum_lo = vmlal_n_s32(a64_lo, b_lo, c);
-  const int64x2_t sum_hi = vmlal_n_s32(a64_hi, b_hi, c);
-  const int64x2_t diff_lo = vmlsl_n_s32(a64_lo, b_lo, c);
-  const int64x2_t diff_hi = vmlsl_n_s32(a64_hi, b_hi, c);
-
-  *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
-                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
-  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
-                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
-}
-
-static INLINE void highbd_butterfly_two_coeff_s32(
-    const int32x4_t a, const int32x4_t b, const tran_coef_t c0,
-    const tran_coef_t c1, int32x4_t *add, int32x4_t *sub) {
-  const int32x2_t a_lo = vget_low_s32(a);
-  const int32x2_t a_hi = vget_high_s32(a);
-  const int32x2_t b_lo = vget_low_s32(b);
-  const int32x2_t b_hi = vget_high_s32(b);
-
-  const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, c0);
-  const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, c0);
-  const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, c1);
-  const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, c1);
-
-  const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, c1);
-  const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, c1);
-  const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, c0);
-  const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, c0);
-
-  *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
-                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
-  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
-                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
-}
-
-static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
-  int32x4_t out[4];
-  // in_0 +/- in_3, in_1 +/- in_2
-  const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
-  const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
-  const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
-  const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
-
-  highbd_butterfly_one_coeff_s32(s_0, s_1, cospi_16_64, &out[0], &out[2]);
-
-  // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
-  // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
-  highbd_butterfly_two_coeff_s32(s_3, s_2, cospi_8_64, cospi_24_64, &out[1],
-                                 &out[3]);
-
-  transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
-
-  in[0] = out[0];
-  in[1] = out[1];
-  in[2] = out[2];
-  in[3] = out[3];
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
 }
 
-static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
-                                                             int32x4_t *right) {
-  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
-
-  sl[0] = vaddq_s32(left[0], left[7]);
-  sl[1] = vaddq_s32(left[1], left[6]);
-  sl[2] = vaddq_s32(left[2], left[5]);
-  sl[3] = vaddq_s32(left[3], left[4]);
-  sl[4] = vsubq_s32(left[3], left[4]);
-  sl[5] = vsubq_s32(left[2], left[5]);
-  sl[6] = vsubq_s32(left[1], left[6]);
-  sl[7] = vsubq_s32(left[0], left[7]);
-  sr[0] = vaddq_s32(right[0], right[7]);
-  sr[1] = vaddq_s32(right[1], right[6]);
-  sr[2] = vaddq_s32(right[2], right[5]);
-  sr[3] = vaddq_s32(right[3], right[4]);
-  sr[4] = vsubq_s32(right[3], right[4]);
-  sr[5] = vsubq_s32(right[2], right[5]);
-  sr[6] = vsubq_s32(right[1], right[6]);
-  sr[7] = vsubq_s32(right[0], right[7]);
-
-  // fdct4(step, step);
-  // x0 = s0 + s3;
-  xl[0] = vaddq_s32(sl[0], sl[3]);
-  xr[0] = vaddq_s32(sr[0], sr[3]);
-  // x1 = s1 + s2;
-  xl[1] = vaddq_s32(sl[1], sl[2]);
-  xr[1] = vaddq_s32(sr[1], sr[2]);
-  // x2 = s1 - s2;
-  xl[2] = vsubq_s32(sl[1], sl[2]);
-  xr[2] = vsubq_s32(sr[1], sr[2]);
-  // x3 = s0 - s3;
-  xl[3] = vsubq_s32(sl[0], sl[3]);
-  xr[3] = vsubq_s32(sr[0], sr[3]);
-
-  // fdct4(step, step);
-  // t0 = (x0 + x1) * cospi_16_64;
-  // t1 = (x0 - x1) * cospi_16_64;
-  // out[0] = (tran_low_t)fdct_round_shift(t0);
-  // out[4] = (tran_low_t)fdct_round_shift(t1);
-  highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]);
-  highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0],
-                                 &right[4]);
-
-  // t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
-  // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-  // out[2] = (tran_low_t)fdct_round_shift(t2);
-  // out[6] = (tran_low_t)fdct_round_shift(t3);
-  highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64,
-                                 &left[2], &left[6]);
-  highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64,
-                                 &right[2], &right[6]);
-
-  // Stage 2
-  // t0 = (s6 - s5) * cospi_16_64;
-  highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]);
-  highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]);
-
-  // Stage 3
-  xl[0] = vaddq_s32(sl[4], tl[0]);
-  xr[0] = vaddq_s32(sr[4], tr[0]);
-  xl[1] = vsubq_s32(sl[4], tl[0]);
-  xr[1] = vsubq_s32(sr[4], tr[0]);
-  xl[2] = vsubq_s32(sl[7], tl[1]);
-  xr[2] = vsubq_s32(sr[7], tr[1]);
-  xl[3] = vaddq_s32(sl[7], tl[1]);
-  xr[3] = vaddq_s32(sr[7], tr[1]);
-
-  // Stage 4
-  // t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-  // out[1] = (tran_low_t)fdct_round_shift(t0);
-  // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-  // out[7] = (tran_low_t)fdct_round_shift(t3);
-  highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64,
-                                 &left[1], &left[7]);
-  highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64,
-                                 &right[1], &right[7]);
-
-  // t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-  // out[5] = (tran_low_t)fdct_round_shift(t1);
-  // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-  // out[3] = (tran_low_t)fdct_round_shift(t2);
-  highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64,
-                                 &left[5], &left[3]);
-  highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64,
-                                 &right[5], &right[3]);
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+  const int32x4_t one = vdupq_n_s32(1);
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
 }
 
-static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
-                                                 int32x4_t *right) {
-  int32x4x2_t out[8];
-  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
-
-  out[0].val[0] = left[0];
-  out[0].val[1] = right[0];
-  out[1].val[0] = left[1];
-  out[1].val[1] = right[1];
-  out[2].val[0] = left[2];
-  out[2].val[1] = right[2];
-  out[3].val[0] = left[3];
-  out[3].val[1] = right[3];
-  out[4].val[0] = left[4];
-  out[4].val[1] = right[4];
-  out[5].val[0] = left[5];
-  out[5].val[1] = right[5];
-  out[6].val[0] = left[6];
-  out[6].val[1] = right[6];
-  out[7].val[0] = left[7];
-  out[7].val[1] = right[7];
-
-  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
-                    &out[6], &out[7]);
-
-  left[0] = out[0].val[0];
-  right[0] = out[0].val[1];
-  left[1] = out[1].val[0];
-  right[1] = out[1].val[1];
-  left[2] = out[2].val[0];
-  right[2] = out[2].val[1];
-  left[3] = out[3].val[0];
-  right[3] = out[3].val[1];
-  left[4] = out[4].val[0];
-  right[4] = out[4].val[1];
-  left[5] = out[5].val[0];
-  right[5] = out[5].val[1];
-  left[6] = out[6].val[0];
-  right[6] = out[6].val[1];
-  left[7] = out[7].val[0];
-  right[7] = out[7].val[1];
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
 }
 
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index bf06d6abe..41d44f2b1 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -821,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
   a7->val[1] = c7.val[1];
 }
 
+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+                                       int32x4_t *right /*[8]*/,
+                                       int32x4_t *out_left /*[8]*/,
+                                       int32x4_t *out_right /*[8]*/) {
+  int32x4x2_t out[8];
+
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  out_left[0] = out[0].val[0];
+  out_left[1] = out[1].val[0];
+  out_left[2] = out[2].val[0];
+  out_left[3] = out[3].val[0];
+  out_left[4] = out[4].val[0];
+  out_left[5] = out[5].val[0];
+  out_left[6] = out[6].val[0];
+  out_left[7] = out[7].val[0];
+  out_right[0] = out[0].val[1];
+  out_right[1] = out[1].val[1];
+  out_right[2] = out[2].val[1];
+  out_right[3] = out[3].val[1];
+  out_right[4] = out[4].val[1];
+  out_right[5] = out[5].val[1];
+  out_right[6] = out[6].val[1];
+  out_right[7] = out[7].val[1];
+}
+
 static INLINE void transpose_u8_16x8(
     const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
     const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
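[Editor's note] The new transpose_s32_8x8_2() exists to collapse exactly the copy-in/copy-out boilerplate seen in the vpx_highbd_fdct8x8_pass1/pass2_neon wrappers above. Because the helper copies left/right into a local int32x4x2_t array before transposing, the outputs may alias the inputs, so a wrapper reduces to one butterfly pass plus one call. A sketch of that usage under those assumptions (illustrative only, not necessarily the committed form):

/* Assumes arm_neon.h plus the helpers from fdct8x8_neon.h/transpose_neon.h. */
static INLINE void highbd_fdct8x8_pass1(int32x4_t *left /*[8]*/,
                                        int32x4_t *right /*[8]*/) {
  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
  /* In-place is safe: the helper buffers its inputs before writing out. */
  transpose_s32_8x8_2(left, right, left, right);
}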