-rw-r--r--  vp9/encoder/arm/neon/vp9_dct_neon.c  |   17
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.c         |   18
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.h         |  297
-rw-r--r--  vpx_dsp/arm/fdct32x32_neon.c         | 1158
-rw-r--r--  vpx_dsp/arm/fdct32x32_neon.h         | 1105
-rw-r--r--  vpx_dsp/arm/fdct4x4_neon.c           |   13
-rw-r--r--  vpx_dsp/arm/fdct4x4_neon.h           |  105
-rw-r--r--  vpx_dsp/arm/fdct8x8_neon.c           |   47
-rw-r--r--  vpx_dsp/arm/fdct8x8_neon.h           |  381
-rw-r--r--  vpx_dsp/arm/fdct_neon.h              |  757
-rw-r--r--  vpx_dsp/arm/transpose_neon.h         |   45
11 files changed, 2112 insertions, 1831 deletions
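
Reader's note (not part of the patch): most hunks below re-order the coefficient arguments of the butterfly helpers and swap in _fast/_s32 variants, so it helps to keep the helper convention in mind. The following is a scalar reference sketch of that convention as described by the formula comments in this diff; the _ref names are hypothetical, and DCT_CONST_BITS plus the example cospi_16_64 value are assumed to follow vpx_dsp/txfm_common.h.

#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14

static int32_t fdct_round_shift(int64_t x) {
  /* ROUND_POWER_OF_TWO(x, DCT_CONST_BITS): round to nearest, then shift. */
  return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* butterfly_one_coeff(a, b, c, &add, &sub):
 *   add = fdct_round_shift((a + b) * c)
 *   sub = fdct_round_shift((a - b) * c)                                    */
static void butterfly_one_coeff_ref(int32_t a, int32_t b, int32_t c,
                                    int32_t *add, int32_t *sub) {
  *add = fdct_round_shift((int64_t)(a + b) * c);
  *sub = fdct_round_shift((int64_t)(a - b) * c);
}

/* butterfly_two_coeff(a, b, c0, c1, &add, &sub):
 *   add = fdct_round_shift(a * c0 + b * c1)
 *   sub = fdct_round_shift(a * c1 - b * c0)                                */
static void butterfly_two_coeff_ref(int32_t a, int32_t b, int32_t c0,
                                    int32_t c1, int32_t *add, int32_t *sub) {
  *add = fdct_round_shift((int64_t)a * c0 + (int64_t)b * c1);
  *sub = fdct_round_shift((int64_t)a * c1 - (int64_t)b * c0);
}

int main(void) {
  int32_t add, sub;
  /* cospi_16_64 == 11585 in vpx_dsp/txfm_common.h (assumed value). */
  butterfly_one_coeff_ref(100, 36, 11585, &add, &sub);
  printf("sum term %d, diff term %d\n", add, sub);
  (void)butterfly_two_coeff_ref;
  return 0;
}

Under this convention the corrected call butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]) produces exactly what its comment states, out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64) and out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64), which is why the coefficient arguments are swapped throughout these hunks.
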
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index a07a1608d..b8286a8dd 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -18,6 +18,8 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
int stride) {
@@ -130,12 +132,14 @@ void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
case ADST_DCT:
load_buffer_4x4(input, in, stride);
fadst4x4_neon(in);
- vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+ // pass1 variant is not accurate enough
+ vpx_fdct4x4_pass2_neon((int16x4_t *)in);
write_buffer_4x4(output, in);
break;
case DCT_ADST:
load_buffer_4x4(input, in, stride);
- vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+ // pass1 variant is not accurate enough
+ vpx_fdct4x4_pass2_neon((int16x4_t *)in);
fadst4x4_neon(in);
write_buffer_4x4(output, in);
break;
@@ -488,13 +492,15 @@ void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride,
case ADST_DCT:
load_buffer_8x8(input, in, stride);
fadst8x8_neon(in);
- vpx_fdct8x8_pass1_neon(in);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case DCT_ADST:
load_buffer_8x8(input, in, stride);
- vpx_fdct8x8_pass1_neon(in);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(in);
fadst8x8_neon(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
@@ -559,7 +565,8 @@ static void fdct16_8col(int16x8_t *in) {
i[6] = vaddq_s16(in[6], in[9]);
i[7] = vaddq_s16(in[7], in[8]);
- vpx_fdct8x8_pass1_neon(i);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(i);
transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]);
// step 2
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c
index d0c07d429..a458ecaa4 100644
--- a/vpx_dsp/arm/fdct16x16_neon.c
+++ b/vpx_dsp/arm/fdct16x16_neon.c
@@ -37,20 +37,21 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Left half.
load_cross(input, stride, temp0);
scale_input(temp0, temp1);
- vpx_fdct16x16_body(temp1, temp0);
+ vpx_fdct8x16_body(temp1, temp0);
// Right half.
load_cross(input + 8, stride, temp1);
scale_input(temp1, temp2);
- vpx_fdct16x16_body(temp2, temp1);
+ vpx_fdct8x16_body(temp2, temp1);
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
+
transpose_s16_8x8_new(&temp0[0], &temp2[0]);
transpose_s16_8x8_new(&temp1[0], &temp2[8]);
partial_round_shift(temp2);
cross_input(temp2, temp3);
- vpx_fdct16x16_body(temp3, temp2);
+ vpx_fdct8x16_body(temp3, temp2);
transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
&temp2[5], &temp2[6], &temp2[7]);
transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -62,11 +63,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Transpose bottom left and bottom right quarters into one contiguous
// location to process to the bottom half.
transpose_s16_8x8_new(&temp0[8], &temp1[0]);
+
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
&temp1[13], &temp1[14], &temp1[15]);
partial_round_shift(temp1);
cross_input(temp1, temp0);
- vpx_fdct16x16_body(temp0, temp1);
+ vpx_fdct8x16_body(temp0, temp1);
transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
&temp1[5], &temp1[6], &temp1[7]);
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
@@ -86,12 +88,12 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
// Left half.
load_cross(input, stride, temp0);
highbd_scale_input(temp0, left1, right1);
- vpx_highbd_fdct16x16_body(left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
// right half.
load_cross(input + 8, stride, temp0);
highbd_scale_input(temp0, left2, right2);
- vpx_highbd_fdct16x16_body(left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
@@ -103,14 +105,14 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
highbd_partial_round_shift(left3, right3);
highbd_cross_input(left3, right3, left1, right1);
- vpx_highbd_fdct16x16_body(left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
// Transpose bottom left and bottom right quarters into one contiguous
// location to process to the bottom half.
highbd_partial_round_shift(left4, right4);
highbd_cross_input(left4, right4, left2, right2);
- vpx_highbd_fdct16x16_body(left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
transpose_s32_8x8_2(left1, right1, left3, right3);
transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h
index d99870903..43d820b6b 100644
--- a/vpx_dsp/arm/fdct16x16_neon.h
+++ b/vpx_dsp/arm/fdct16x16_neon.h
@@ -160,8 +160,8 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
}
// Main body of fdct16x16.
-static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
- int16x8_t *out /*[16]*/) {
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/,
+ int16x8_t *out /*[16]*/) {
int16x8_t s[8];
int16x8_t x[4];
int16x8_t step[8];
@@ -186,16 +186,17 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
// out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
// out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
- butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
- // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[8]);
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
// out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
- butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]);
// Stage 2
// Re-using source s5/s6
// s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
// s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
- butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]);
// Stage 3
x[0] = vaddq_s16(s[4], s[5]);
@@ -204,12 +205,12 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
x[3] = vaddq_s16(s[7], s[6]);
// Stage 4
- // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
- // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
- butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
- // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
- // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
- butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]);
// step 2
// From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
@@ -221,8 +222,8 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
// step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
// step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
// step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
- butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
- butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+ butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+ butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]);
// step 3
s[0] = vaddq_s16(in[8], s[3]);
@@ -235,13 +236,15 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
s[7] = vaddq_s16(in[15], s[4]);
// step 4
- // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
- // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
- butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+ // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] *
+ // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+ // * cospi_8_64)
+ butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]);
// step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
- // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
- butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] *
+ // cospi_24_64)
+ butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]);
// step 5
step[0] = vaddq_s16(s[0], s[1]);
@@ -254,22 +257,23 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
step[7] = vaddq_s16(s[7], s[6]);
// step 6
- // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
- // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
- // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
- // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
- // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
- // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
- // cospi_22_64)
- // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
- // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
- butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9],
&out[7]);
- butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1],
&out[15]);
- butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13],
&out[3]);
- butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5],
&out[11]);
}
@@ -279,36 +283,37 @@ static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
int32x4_t *left /*[16]*/,
int32x4_t *right /* [16] */) {
left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
- right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
- right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
- right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
- right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
- right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
- right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
- right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
- right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
- right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
- right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
- right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
- right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
- right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
- right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
- right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
}
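
For reference while reading the reorder above: highbd_scale_input() only regroups the statements; each 16-bit lane is still widened to 32 bits and multiplied by 4, with lanes 0-3 of every row going to left[] and lanes 4-7 to right[]. A plain-C sketch of that mapping (the _ref name is hypothetical, not part of the patch):

#include <stdint.h>

/* Scalar equivalent of highbd_scale_input(): widen each int16 lane and
 * multiply by 4, i.e. vshll_n_s16(x, 2).  left[i] carries lanes 0..3 of
 * row i, right[i] carries lanes 4..7. */
static void highbd_scale_input_ref(const int16_t a[16][8],
                                   int32_t left[16][4],
                                   int32_t right[16][4]) {
  int i, k;
  for (i = 0; i < 16; ++i) {
    for (k = 0; k < 4; ++k) {
      left[i][k] = (int32_t)a[i][k] << 2;
      right[i][k] = (int32_t)a[i][4 + k] << 2;
    }
  }
}
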
@@ -357,81 +362,38 @@ static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
int32x4_t *right /* [16] */) {
const int32x4_t one = vdupq_n_s32(1);
left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
- right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
- right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
- right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
- right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
- right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
- right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
- right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
- right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
- right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
- right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
- right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
- right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
- right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
- right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
- right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);
- right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
-}
-static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
- int32x4_t *right /*[8]*/,
- int32x4_t *out_left /*[8]*/,
- int32x4_t *out_right /*[8]*/) {
- int32x4x2_t out[8];
-
- out[0].val[0] = left[0];
- out[0].val[1] = right[0];
- out[1].val[0] = left[1];
- out[1].val[1] = right[1];
- out[2].val[0] = left[2];
- out[2].val[1] = right[2];
- out[3].val[0] = left[3];
- out[3].val[1] = right[3];
- out[4].val[0] = left[4];
- out[4].val[1] = right[4];
- out[5].val[0] = left[5];
- out[5].val[1] = right[5];
- out[6].val[0] = left[6];
- out[6].val[1] = right[6];
- out[7].val[0] = left[7];
- out[7].val[1] = right[7];
-
- transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
- &out[6], &out[7]);
-
- out_left[0] = out[0].val[0];
- out_left[1] = out[1].val[0];
- out_left[2] = out[2].val[0];
- out_left[3] = out[3].val[0];
- out_left[4] = out[4].val[0];
- out_left[5] = out[5].val[0];
- out_left[6] = out[6].val[0];
- out_left[7] = out[7].val[0];
- out_right[0] = out[0].val[1];
- out_right[1] = out[1].val[1];
- out_right[2] = out[2].val[1];
- out_right[3] = out[3].val[1];
- out_right[4] = out[4].val[1];
- out_right[5] = out[5].val[1];
- out_right[6] = out[6].val[1];
- out_right[7] = out[7].val[1];
+ right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
+ right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
+ right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
+ right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
+ right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
+ right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
+ right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
+ right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
+ right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
+ right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
+ right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
+ right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
+ right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
+ right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
+ right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
+ right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
}
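
Same idea for the regrouped highbd_partial_round_shift() above: doing all left[] updates before all right[] updates does not change the arithmetic, which remains (x + 1) >> 2 on every 32-bit lane. Scalar sketch (hypothetical _ref name):

#include <stdint.h>

/* Scalar equivalent of highbd_partial_round_shift(): add 1 and shift
 * right by 2 on each 32-bit lane of both halves. */
static void highbd_partial_round_shift_ref(int32_t left[16][4],
                                           int32_t right[16][4]) {
  int i, k;
  for (i = 0; i < 16; ++i) {
    for (k = 0; k < 4; ++k) {
      left[i][k] = (left[i][k] + 1) >> 2;
      right[i][k] = (right[i][k] + 1) >> 2;
    }
  }
}
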
// Store 16 32x4 vectors, assuming stride == 16.
@@ -469,9 +431,9 @@ static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) {
vst1q_s32(a, b[15]);
}
-// Main body of fdct16x16.
-static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/,
- int32x4_t *right /* [16] */) {
+// Main body of fdct8x16 column
+static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
int32x4_t sl[8];
int32x4_t sr[8];
int32x4_t xl[4];
@@ -531,22 +493,21 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/,
// out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
// out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
- highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[8]);
- highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0],
- &right[8]);
- // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[8], &right[8]);
+
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
// out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
- highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64,
- &left[4], &left[12]);
- highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64,
- &right[4], &right[12]);
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[4], &right[4],
+ &left[12], &right[12]);
// Stage 2
// Re-using source s5/s6
// s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
// s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
- highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &sl[6], &sl[5]);
- highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &sr[6], &sr[5]);
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6],
+ &sr[6], &sl[5], &sr[5]);
// Stage 3
xl[0] = vaddq_s32(sl[4], sl[5]);
@@ -559,18 +520,16 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/,
xr[3] = vaddq_s32(sr[7], sr[6]);
// Stage 4
- // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
- // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
- highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64,
- &left[2], &left[14]);
- highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64,
- &right[2], &right[14]);
- // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
- // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
- highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64,
- &left[10], &left[6]);
- highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64,
- &right[10], &right[6]);
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[2], &right[2],
+ &left[14], &right[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[10], &right[10],
+ &left[6], &right[6]);
// step 2
// From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
@@ -582,10 +541,10 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/,
// step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
// step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
// step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
- highbd_butterfly_one_coeff_s32(inl[5], inl[2], cospi_16_64, &sl[5], &sl[2]);
- highbd_butterfly_one_coeff_s32(inr[5], inr[2], cospi_16_64, &sr[5], &sr[2]);
- highbd_butterfly_one_coeff_s32(inl[4], inl[3], cospi_16_64, &sl[4], &sl[3]);
- highbd_butterfly_one_coeff_s32(inr[4], inr[3], cospi_16_64, &sr[4], &sr[3]);
+ butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64,
+ &sl[5], &sr[5], &sl[2], &sr[2]);
+ butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64,
+ &sl[4], &sr[4], &sl[3], &sr[3]);
// step 3
sl[0] = vaddq_s32(inl[0], sl[3]);
@@ -606,19 +565,18 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/,
sr[7] = vaddq_s32(inr[7], sr[4]);
// step 4
- // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
- // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
- highbd_butterfly_two_coeff_s32(sl[6], sl[1], cospi_8_64, cospi_24_64, &sl[6],
- &sl[1]);
- highbd_butterfly_two_coeff_s32(sr[6], sr[1], cospi_8_64, cospi_24_64, &sr[6],
- &sr[1]);
-
+ // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] *
+ // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+ // * cospi_8_64)
+ butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64,
+ cospi_24_64, &sl[6], &sr[6], &sl[1],
+ &sr[1]);
// step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
- // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
- highbd_butterfly_two_coeff_s32(xl[0], xl[3], cospi_24_64, cospi_8_64, &sl[2],
- &sl[5]);
- highbd_butterfly_two_coeff_s32(xr[0], xr[3], cospi_24_64, cospi_8_64, &sr[2],
- &sr[5]);
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] *
+ // cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64,
+ cospi_8_64, &sl[2], &sr[2], &sl[5],
+ &sr[5]);
// step 5
stepl[0] = vaddq_s32(sl[0], sl[1]);
@@ -639,31 +597,26 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/,
stepr[7] = vaddq_s32(sr[7], sr[6]);
// step 6
- // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
- // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
- // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
- // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
- // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
- // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
- // cospi_22_64) out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] *
- // cospi_26_64) out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] *
- // cospi_6_64)
- highbd_butterfly_two_coeff_s32(stepl[7], stepl[0], cospi_2_64, cospi_30_64,
- &left[1], &left[15]);
- highbd_butterfly_two_coeff_s32(stepr[7], stepr[0], cospi_2_64, cospi_30_64,
- &right[1], &right[15]);
- highbd_butterfly_two_coeff_s32(stepl[6], stepl[1], cospi_18_64, cospi_14_64,
- &left[9], &left[7]);
- highbd_butterfly_two_coeff_s32(stepr[6], stepr[1], cospi_18_64, cospi_14_64,
- &right[9], &right[7]);
- highbd_butterfly_two_coeff_s32(stepl[5], stepl[2], cospi_10_64, cospi_22_64,
- &left[5], &left[11]);
- highbd_butterfly_two_coeff_s32(stepr[5], stepr[2], cospi_10_64, cospi_22_64,
- &right[5], &right[11]);
- highbd_butterfly_two_coeff_s32(stepl[4], stepl[3], cospi_26_64, cospi_6_64,
- &left[13], &left[3]);
- highbd_butterfly_two_coeff_s32(stepr[4], stepr[3], cospi_26_64, cospi_6_64,
- &right[13], &right[3]);
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1],
+ cospi_18_64, cospi_14_64, &left[9],
+ &right[9], &left[7], &right[7]);
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0],
+ cospi_2_64, cospi_30_64, &left[1],
+ &right[1], &left[15], &right[15]);
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3],
+ cospi_26_64, cospi_6_64, &left[13],
+ &right[13], &left[3], &right[3]);
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2],
+ cospi_10_64, cospi_22_64, &left[5],
+ &right[5], &left[11], &right[11]);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
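
A note on the call shape used throughout the highbd body above: where the old code issued one highbd_butterfly_*_s32 call per 4-lane half, the new butterfly_one_coeff_s32_fast and butterfly_two_coeff_s32_s64_narrow helpers take the left and right halves together (xl, xr, ...) and write all four output vectors in one call. The sketch below shows only the intended math per the formula comments in the hunk; the internal rounding of the _fast variant is not reproduced here, and the _wide_ref name is hypothetical.

#include <stdint.h>

#define DCT_CONST_BITS 14

static int32_t round_shift_ref(int64_t x) {
  return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* One call covers both 4-lane halves of an 8-wide row, where a = {al|ar}
 * and b = {bl|br} are the concatenated halves:
 *   add[k] = round_shift((a[k] + b[k]) * c)   -> written to left/right[m]
 *   sub[k] = round_shift((a[k] - b[k]) * c)   -> written to left/right[n] */
static void butterfly_one_coeff_wide_ref(const int32_t a[8],
                                         const int32_t b[8], int32_t c,
                                         int32_t add[8], int32_t sub[8]) {
  int k;
  for (k = 0; k < 8; ++k) {
    add[k] = round_shift_ref((int64_t)(a[k] + b[k]) * c);
    sub[k] = round_shift_ref((int64_t)(a[k] - b[k]) * c);
  }
}
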
diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c
index 51d81bd08..e2bf16760 100644
--- a/vpx_dsp/arm/fdct32x32_neon.c
+++ b/vpx_dsp/arm/fdct32x32_neon.c
@@ -16,6 +16,7 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct32x32_neon.h"
// Most gcc 4.9 distributions outside of Android do not generate correct code
// for this function.
@@ -33,1123 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
#else
-#define LOAD_INCREMENT(src, stride, dest, index) \
- do { \
- dest[index] = vld1q_s16(src); \
- src += stride; \
- } while (0)
-
-#define ADD_S16(src, index0, index1, dest, index3) \
- do { \
- dest[index3] = vaddq_s16(src[index0], src[index1]); \
- } while (0)
-
-#define ADD_SHIFT_S16(src, index0, index1) \
- do { \
- src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
- } while (0)
-
-// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
-// middle
-// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
- const int16_t *a_end = a + 24 * stride;
- int16x8_t c[8];
-
- LOAD_INCREMENT(a, stride, b, 0);
- LOAD_INCREMENT(a, stride, b, 1);
- LOAD_INCREMENT(a, stride, b, 2);
- LOAD_INCREMENT(a, stride, b, 3);
- LOAD_INCREMENT(a, stride, b, 4);
- LOAD_INCREMENT(a, stride, b, 5);
- LOAD_INCREMENT(a, stride, b, 6);
- LOAD_INCREMENT(a, stride, b, 7);
-
- LOAD_INCREMENT(a_end, stride, b, 24);
- LOAD_INCREMENT(a_end, stride, b, 25);
- LOAD_INCREMENT(a_end, stride, b, 26);
- LOAD_INCREMENT(a_end, stride, b, 27);
- LOAD_INCREMENT(a_end, stride, b, 28);
- LOAD_INCREMENT(a_end, stride, b, 29);
- LOAD_INCREMENT(a_end, stride, b, 30);
- LOAD_INCREMENT(a_end, stride, b, 31);
-
- ADD_S16(b, 0, 31, c, 0);
- ADD_S16(b, 1, 30, c, 1);
- ADD_S16(b, 2, 29, c, 2);
- ADD_S16(b, 3, 28, c, 3);
- ADD_S16(b, 4, 27, c, 4);
- ADD_S16(b, 5, 26, c, 5);
- ADD_S16(b, 6, 25, c, 6);
- ADD_S16(b, 7, 24, c, 7);
-
- ADD_SHIFT_S16(b, 7, 24);
- ADD_SHIFT_S16(b, 6, 25);
- ADD_SHIFT_S16(b, 5, 26);
- ADD_SHIFT_S16(b, 4, 27);
- ADD_SHIFT_S16(b, 3, 28);
- ADD_SHIFT_S16(b, 2, 29);
- ADD_SHIFT_S16(b, 1, 30);
- ADD_SHIFT_S16(b, 0, 31);
-
- b[0] = vshlq_n_s16(c[0], 2);
- b[1] = vshlq_n_s16(c[1], 2);
- b[2] = vshlq_n_s16(c[2], 2);
- b[3] = vshlq_n_s16(c[3], 2);
- b[4] = vshlq_n_s16(c[4], 2);
- b[5] = vshlq_n_s16(c[5], 2);
- b[6] = vshlq_n_s16(c[6], 2);
- b[7] = vshlq_n_s16(c[7], 2);
-
- LOAD_INCREMENT(a, stride, b, 8);
- LOAD_INCREMENT(a, stride, b, 9);
- LOAD_INCREMENT(a, stride, b, 10);
- LOAD_INCREMENT(a, stride, b, 11);
- LOAD_INCREMENT(a, stride, b, 12);
- LOAD_INCREMENT(a, stride, b, 13);
- LOAD_INCREMENT(a, stride, b, 14);
- LOAD_INCREMENT(a, stride, b, 15);
- LOAD_INCREMENT(a, stride, b, 16);
- LOAD_INCREMENT(a, stride, b, 17);
- LOAD_INCREMENT(a, stride, b, 18);
- LOAD_INCREMENT(a, stride, b, 19);
- LOAD_INCREMENT(a, stride, b, 20);
- LOAD_INCREMENT(a, stride, b, 21);
- LOAD_INCREMENT(a, stride, b, 22);
- LOAD_INCREMENT(a, stride, b, 23);
-
- ADD_S16(b, 8, 23, c, 0);
- ADD_S16(b, 9, 22, c, 1);
- ADD_S16(b, 10, 21, c, 2);
- ADD_S16(b, 11, 20, c, 3);
- ADD_S16(b, 12, 19, c, 4);
- ADD_S16(b, 13, 18, c, 5);
- ADD_S16(b, 14, 17, c, 6);
- ADD_S16(b, 15, 16, c, 7);
-
- ADD_SHIFT_S16(b, 15, 16);
- ADD_SHIFT_S16(b, 14, 17);
- ADD_SHIFT_S16(b, 13, 18);
- ADD_SHIFT_S16(b, 12, 19);
- ADD_SHIFT_S16(b, 11, 20);
- ADD_SHIFT_S16(b, 10, 21);
- ADD_SHIFT_S16(b, 9, 22);
- ADD_SHIFT_S16(b, 8, 23);
-
- b[8] = vshlq_n_s16(c[0], 2);
- b[9] = vshlq_n_s16(c[1], 2);
- b[10] = vshlq_n_s16(c[2], 2);
- b[11] = vshlq_n_s16(c[3], 2);
- b[12] = vshlq_n_s16(c[4], 2);
- b[13] = vshlq_n_s16(c[5], 2);
- b[14] = vshlq_n_s16(c[6], 2);
- b[15] = vshlq_n_s16(c[7], 2);
-}
-
-#undef LOAD_INCREMENT
-#undef ADD_S16
-#undef ADD_SHIFT_S16
-
-#define STORE_S16(src, index, dest) \
- do { \
- store_s16q_to_tran_low(dest, src[index]); \
- dest += 8; \
- } while (0)
-
-// Store 32 16x8 values, assuming stride == 32.
-// Slight twist: store horizontally in blocks of 8.
-static INLINE void store(tran_low_t *a, const int16x8_t *b) {
- STORE_S16(b, 0, a);
- STORE_S16(b, 8, a);
- STORE_S16(b, 16, a);
- STORE_S16(b, 24, a);
- STORE_S16(b, 1, a);
- STORE_S16(b, 9, a);
- STORE_S16(b, 17, a);
- STORE_S16(b, 25, a);
- STORE_S16(b, 2, a);
- STORE_S16(b, 10, a);
- STORE_S16(b, 18, a);
- STORE_S16(b, 26, a);
- STORE_S16(b, 3, a);
- STORE_S16(b, 11, a);
- STORE_S16(b, 19, a);
- STORE_S16(b, 27, a);
- STORE_S16(b, 4, a);
- STORE_S16(b, 12, a);
- STORE_S16(b, 20, a);
- STORE_S16(b, 28, a);
- STORE_S16(b, 5, a);
- STORE_S16(b, 13, a);
- STORE_S16(b, 21, a);
- STORE_S16(b, 29, a);
- STORE_S16(b, 6, a);
- STORE_S16(b, 14, a);
- STORE_S16(b, 22, a);
- STORE_S16(b, 30, a);
- STORE_S16(b, 7, a);
- STORE_S16(b, 15, a);
- STORE_S16(b, 23, a);
- STORE_S16(b, 31, a);
-}
-
-#undef STORE_S16
-
-static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
-
- // Stage 1: Done as part of the load.
-
- // Stage 2.
- // Mini cross. X the first 16 values and the middle 8 of the second half.
- a[0] = vaddq_s16(in[0], in[15]);
- a[1] = vaddq_s16(in[1], in[14]);
- a[2] = vaddq_s16(in[2], in[13]);
- a[3] = vaddq_s16(in[3], in[12]);
- a[4] = vaddq_s16(in[4], in[11]);
- a[5] = vaddq_s16(in[5], in[10]);
- a[6] = vaddq_s16(in[6], in[9]);
- a[7] = vaddq_s16(in[7], in[8]);
-
- a[8] = vsubq_s16(in[7], in[8]);
- a[9] = vsubq_s16(in[6], in[9]);
- a[10] = vsubq_s16(in[5], in[10]);
- a[11] = vsubq_s16(in[4], in[11]);
- a[12] = vsubq_s16(in[3], in[12]);
- a[13] = vsubq_s16(in[2], in[13]);
- a[14] = vsubq_s16(in[1], in[14]);
- a[15] = vsubq_s16(in[0], in[15]);
-
- a[16] = in[16];
- a[17] = in[17];
- a[18] = in[18];
- a[19] = in[19];
-
- butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
- butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
- butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
- butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
-
- a[28] = in[28];
- a[29] = in[29];
- a[30] = in[30];
- a[31] = in[31];
-
- // Stage 3.
- b[0] = vaddq_s16(a[0], a[7]);
- b[1] = vaddq_s16(a[1], a[6]);
- b[2] = vaddq_s16(a[2], a[5]);
- b[3] = vaddq_s16(a[3], a[4]);
-
- b[4] = vsubq_s16(a[3], a[4]);
- b[5] = vsubq_s16(a[2], a[5]);
- b[6] = vsubq_s16(a[1], a[6]);
- b[7] = vsubq_s16(a[0], a[7]);
-
- b[8] = a[8];
- b[9] = a[9];
-
- butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
- butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
-
- b[14] = a[14];
- b[15] = a[15];
-
- b[16] = vaddq_s16(in[16], a[23]);
- b[17] = vaddq_s16(in[17], a[22]);
- b[18] = vaddq_s16(in[18], a[21]);
- b[19] = vaddq_s16(in[19], a[20]);
-
- b[20] = vsubq_s16(in[19], a[20]);
- b[21] = vsubq_s16(in[18], a[21]);
- b[22] = vsubq_s16(in[17], a[22]);
- b[23] = vsubq_s16(in[16], a[23]);
-
- b[24] = vsubq_s16(in[31], a[24]);
- b[25] = vsubq_s16(in[30], a[25]);
- b[26] = vsubq_s16(in[29], a[26]);
- b[27] = vsubq_s16(in[28], a[27]);
-
- b[28] = vaddq_s16(in[28], a[27]);
- b[29] = vaddq_s16(in[29], a[26]);
- b[30] = vaddq_s16(in[30], a[25]);
- b[31] = vaddq_s16(in[31], a[24]);
-
- // Stage 4.
- a[0] = vaddq_s16(b[0], b[3]);
- a[1] = vaddq_s16(b[1], b[2]);
- a[2] = vsubq_s16(b[1], b[2]);
- a[3] = vsubq_s16(b[0], b[3]);
-
- a[4] = b[4];
-
- butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
-
- a[7] = b[7];
-
- a[8] = vaddq_s16(b[8], b[11]);
- a[9] = vaddq_s16(b[9], b[10]);
- a[10] = vsubq_s16(b[9], b[10]);
- a[11] = vsubq_s16(b[8], b[11]);
- a[12] = vsubq_s16(b[15], b[12]);
- a[13] = vsubq_s16(b[14], b[13]);
- a[14] = vaddq_s16(b[14], b[13]);
- a[15] = vaddq_s16(b[15], b[12]);
-
- a[16] = b[16];
- a[17] = b[17];
-
- butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
- butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
- butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
- butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
-
- a[22] = b[22];
- a[23] = b[23];
- a[24] = b[24];
- a[25] = b[25];
-
- a[30] = b[30];
- a[31] = b[31];
-
- // Stage 5.
- butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
- butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
-
- b[4] = vaddq_s16(a[4], a[5]);
- b[5] = vsubq_s16(a[4], a[5]);
- b[6] = vsubq_s16(a[7], a[6]);
- b[7] = vaddq_s16(a[7], a[6]);
-
- b[8] = a[8];
-
- butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
- butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
-
- b[11] = a[11];
- b[12] = a[12];
-
- b[15] = a[15];
-
- b[16] = vaddq_s16(a[19], a[16]);
- b[17] = vaddq_s16(a[18], a[17]);
- b[18] = vsubq_s16(a[17], a[18]);
- b[19] = vsubq_s16(a[16], a[19]);
- b[20] = vsubq_s16(a[23], a[20]);
- b[21] = vsubq_s16(a[22], a[21]);
- b[22] = vaddq_s16(a[21], a[22]);
- b[23] = vaddq_s16(a[20], a[23]);
- b[24] = vaddq_s16(a[27], a[24]);
- b[25] = vaddq_s16(a[26], a[25]);
- b[26] = vsubq_s16(a[25], a[26]);
- b[27] = vsubq_s16(a[24], a[27]);
- b[28] = vsubq_s16(a[31], a[28]);
- b[29] = vsubq_s16(a[30], a[29]);
- b[30] = vaddq_s16(a[29], a[30]);
- b[31] = vaddq_s16(a[28], a[31]);
-
- // Stage 6.
- a[0] = b[0];
- a[1] = b[1];
- a[2] = b[2];
- a[3] = b[3];
-
- butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
- butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
-
- a[8] = vaddq_s16(b[8], b[9]);
- a[9] = vsubq_s16(b[8], b[9]);
- a[10] = vsubq_s16(b[11], b[10]);
- a[11] = vaddq_s16(b[11], b[10]);
- a[12] = vaddq_s16(b[12], b[13]);
- a[13] = vsubq_s16(b[12], b[13]);
- a[14] = vsubq_s16(b[15], b[14]);
- a[15] = vaddq_s16(b[15], b[14]);
-
- a[16] = b[16];
- a[19] = b[19];
- a[20] = b[20];
- a[23] = b[23];
- a[24] = b[24];
- a[27] = b[27];
- a[28] = b[28];
- a[31] = b[31];
-
- butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
- butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
-
- butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
- butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
-
- // Stage 7.
- b[0] = a[0];
- b[1] = a[1];
- b[2] = a[2];
- b[3] = a[3];
- b[4] = a[4];
- b[5] = a[5];
- b[6] = a[6];
- b[7] = a[7];
-
- butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
- butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
- butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
- butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
-
- b[16] = vaddq_s16(a[16], a[17]);
- b[17] = vsubq_s16(a[16], a[17]);
- b[18] = vsubq_s16(a[19], a[18]);
- b[19] = vaddq_s16(a[19], a[18]);
- b[20] = vaddq_s16(a[20], a[21]);
- b[21] = vsubq_s16(a[20], a[21]);
- b[22] = vsubq_s16(a[23], a[22]);
- b[23] = vaddq_s16(a[23], a[22]);
- b[24] = vaddq_s16(a[24], a[25]);
- b[25] = vsubq_s16(a[24], a[25]);
- b[26] = vsubq_s16(a[27], a[26]);
- b[27] = vaddq_s16(a[27], a[26]);
- b[28] = vaddq_s16(a[28], a[29]);
- b[29] = vsubq_s16(a[28], a[29]);
- b[30] = vsubq_s16(a[31], a[30]);
- b[31] = vaddq_s16(a[31], a[30]);
-
- // Final stage.
- // Also compute partial rounding shift:
- // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- out[0] = sub_round_shift(b[0]);
- out[16] = sub_round_shift(b[1]);
- out[8] = sub_round_shift(b[2]);
- out[24] = sub_round_shift(b[3]);
- out[4] = sub_round_shift(b[4]);
- out[20] = sub_round_shift(b[5]);
- out[12] = sub_round_shift(b[6]);
- out[28] = sub_round_shift(b[7]);
- out[2] = sub_round_shift(b[8]);
- out[18] = sub_round_shift(b[9]);
- out[10] = sub_round_shift(b[10]);
- out[26] = sub_round_shift(b[11]);
- out[6] = sub_round_shift(b[12]);
- out[22] = sub_round_shift(b[13]);
- out[14] = sub_round_shift(b[14]);
- out[30] = sub_round_shift(b[15]);
-
- butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
- out[1] = sub_round_shift(a[1]);
- out[31] = sub_round_shift(a[31]);
-
- butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
- out[17] = sub_round_shift(a[17]);
- out[15] = sub_round_shift(a[15]);
-
- butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
- out[9] = sub_round_shift(a[9]);
- out[23] = sub_round_shift(a[23]);
-
- butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
- out[25] = sub_round_shift(a[25]);
- out[7] = sub_round_shift(a[7]);
-
- butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
- out[5] = sub_round_shift(a[5]);
- out[27] = sub_round_shift(a[27]);
-
- butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
- out[21] = sub_round_shift(a[21]);
- out[11] = sub_round_shift(a[11]);
-
- butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
- out[13] = sub_round_shift(a[13]);
- out[19] = sub_round_shift(a[19]);
-
- butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
- out[29] = sub_round_shift(a[29]);
- out[3] = sub_round_shift(a[3]);
-}
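
The removed first-pass body above ends with the rounding its comment spells out: output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2, applied through sub_round_shift(). A scalar sketch of that step (hypothetical _ref name), handy when comparing against the replacement helpers now living in fdct32x32_neon.h:

#include <stdint.h>

/* Scalar form of the first-pass rounding used above:
 * (x + 1 + (x > 0)) >> 2 on every coefficient. */
static int16_t sub_round_shift_ref(int16_t x) {
  return (int16_t)((x + 1 + (x > 0)) >> 2);
}
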
-
-#define PASS_THROUGH(src, dst, element) \
- do { \
- dst##_lo[element] = src##_lo[element]; \
- dst##_hi[element] = src##_hi[element]; \
- } while (0)
-
-#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = \
- vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
- b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
- vget_high_s16(a[right_index])); \
- } while (0)
-
-#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = \
- vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
- b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
- vget_high_s16(a[right_index])); \
- } while (0)
-
-#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
- do { \
- c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
- c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
- } while (0)
-
-#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
- do { \
- temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
- temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
- c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
- c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
- } while (0)
-
-#define ADD_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
- b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
- } while (0)
-
-#define SUB_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
- b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
- } while (0)
-
-#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
- add_index, sub_index) \
- do { \
- butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
- &b##_lo[add_index], &b##_hi[add_index], \
- &b##_lo[sub_index], &b##_hi[sub_index]); \
- } while (0)
-
-#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
- sub_index) \
- do { \
- butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
- a##_lo[right_index], a##_hi[right_index], \
- constant, &b##_lo[add_index], &b##_hi[add_index], \
- &b##_lo[sub_index], &b##_hi[sub_index]); \
- } while (0)
-
-#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
- right_constant, b, add_index, sub_index) \
- do { \
- butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
- a##_lo[right_index], a##_hi[right_index], \
- left_constant, right_constant, &b##_lo[add_index], \
- &b##_hi[add_index], &b##_lo[sub_index], \
- &b##_hi[sub_index]); \
- } while (0)
-
-static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
- int32x4_t c_lo[32];
- int32x4_t c_hi[32];
- int32x4_t d_lo[32];
- int32x4_t d_hi[32];
-
- // Stage 1. Done as part of the load for the first pass.
- a[0] = vaddq_s16(in[0], in[31]);
- a[1] = vaddq_s16(in[1], in[30]);
- a[2] = vaddq_s16(in[2], in[29]);
- a[3] = vaddq_s16(in[3], in[28]);
- a[4] = vaddq_s16(in[4], in[27]);
- a[5] = vaddq_s16(in[5], in[26]);
- a[6] = vaddq_s16(in[6], in[25]);
- a[7] = vaddq_s16(in[7], in[24]);
- a[8] = vaddq_s16(in[8], in[23]);
- a[9] = vaddq_s16(in[9], in[22]);
- a[10] = vaddq_s16(in[10], in[21]);
- a[11] = vaddq_s16(in[11], in[20]);
- a[12] = vaddq_s16(in[12], in[19]);
- a[13] = vaddq_s16(in[13], in[18]);
- a[14] = vaddq_s16(in[14], in[17]);
- a[15] = vaddq_s16(in[15], in[16]);
- a[16] = vsubq_s16(in[15], in[16]);
- a[17] = vsubq_s16(in[14], in[17]);
- a[18] = vsubq_s16(in[13], in[18]);
- a[19] = vsubq_s16(in[12], in[19]);
- a[20] = vsubq_s16(in[11], in[20]);
- a[21] = vsubq_s16(in[10], in[21]);
- a[22] = vsubq_s16(in[9], in[22]);
- a[23] = vsubq_s16(in[8], in[23]);
- a[24] = vsubq_s16(in[7], in[24]);
- a[25] = vsubq_s16(in[6], in[25]);
- a[26] = vsubq_s16(in[5], in[26]);
- a[27] = vsubq_s16(in[4], in[27]);
- a[28] = vsubq_s16(in[3], in[28]);
- a[29] = vsubq_s16(in[2], in[29]);
- a[30] = vsubq_s16(in[1], in[30]);
- a[31] = vsubq_s16(in[0], in[31]);
-
- // Stage 2.
- b[0] = vaddq_s16(a[0], a[15]);
- b[1] = vaddq_s16(a[1], a[14]);
- b[2] = vaddq_s16(a[2], a[13]);
- b[3] = vaddq_s16(a[3], a[12]);
- b[4] = vaddq_s16(a[4], a[11]);
- b[5] = vaddq_s16(a[5], a[10]);
- b[6] = vaddq_s16(a[6], a[9]);
- b[7] = vaddq_s16(a[7], a[8]);
-
- b[8] = vsubq_s16(a[7], a[8]);
- b[9] = vsubq_s16(a[6], a[9]);
- b[10] = vsubq_s16(a[5], a[10]);
- b[11] = vsubq_s16(a[4], a[11]);
- b[12] = vsubq_s16(a[3], a[12]);
- b[13] = vsubq_s16(a[2], a[13]);
- b[14] = vsubq_s16(a[1], a[14]);
- b[15] = vsubq_s16(a[0], a[15]);
-
- b[16] = a[16];
- b[17] = a[17];
- b[18] = a[18];
- b[19] = a[19];
-
- butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
- butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
- butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
- butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
-
- b[28] = a[28];
- b[29] = a[29];
- b[30] = a[30];
- b[31] = a[31];
-
- // Stage 3. With extreme values for input this calculation rolls over int16_t.
- // The sources for b[0] get added multiple times and, through testing, have
- // been shown to overflow starting here.
- ADD_S16_S32(b, 0, 7, c, 0);
- ADD_S16_S32(b, 1, 6, c, 1);
- ADD_S16_S32(b, 2, 5, c, 2);
- ADD_S16_S32(b, 3, 4, c, 3);
- SUB_S16_S32(b, 3, 4, c, 4);
- SUB_S16_S32(b, 2, 5, c, 5);
- SUB_S16_S32(b, 1, 6, c, 6);
- SUB_S16_S32(b, 0, 7, c, 7);
-
- a[8] = b[8];
- a[9] = b[9];
-
- BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
- BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
-
- a[14] = b[14];
- a[15] = b[15];
-
- ADD_S16_S32(b, 16, 23, c, 16);
- ADD_S16_S32(b, 17, 22, c, 17);
- ADD_S16_S32(b, 18, 21, c, 18);
- ADD_S16_S32(b, 19, 20, c, 19);
- SUB_S16_S32(b, 19, 20, c, 20);
- SUB_S16_S32(b, 18, 21, c, 21);
- SUB_S16_S32(b, 17, 22, c, 22);
- SUB_S16_S32(b, 16, 23, c, 23);
- SUB_S16_S32(b, 31, 24, c, 24);
- SUB_S16_S32(b, 30, 25, c, 25);
- SUB_S16_S32(b, 29, 26, c, 26);
- SUB_S16_S32(b, 28, 27, c, 27);
- ADD_S16_S32(b, 28, 27, c, 28);
- ADD_S16_S32(b, 29, 26, c, 29);
- ADD_S16_S32(b, 30, 25, c, 30);
- ADD_S16_S32(b, 31, 24, c, 31);
-
- // Stage 4.
- ADD_S32(c, 0, 3, d, 0);
- ADD_S32(c, 1, 2, d, 1);
- SUB_S32(c, 1, 2, d, 2);
- SUB_S32(c, 0, 3, d, 3);
-
- PASS_THROUGH(c, d, 4);
-
- BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
-
- PASS_THROUGH(c, d, 7);
-
- ADDW_S16_S32(c, 11, a, 8, d, 8);
- ADDW_S16_S32(c, 10, a, 9, d, 9);
- SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
- SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
- SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
- SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
- ADDW_S16_S32(c, 13, b, 14, d, 14);
- ADDW_S16_S32(c, 12, b, 15, d, 15);
-
- PASS_THROUGH(c, d, 16);
- PASS_THROUGH(c, d, 17);
-
- BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
- BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
- BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
- BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
-
- PASS_THROUGH(c, d, 22);
- PASS_THROUGH(c, d, 23);
- PASS_THROUGH(c, d, 24);
- PASS_THROUGH(c, d, 25);
-
- PASS_THROUGH(c, d, 30);
- PASS_THROUGH(c, d, 31);
-
- // Stage 5.
- BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
- BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
-
- ADD_S32(d, 4, 5, c, 4);
- SUB_S32(d, 4, 5, c, 5);
- SUB_S32(d, 7, 6, c, 6);
- ADD_S32(d, 7, 6, c, 7);
-
- PASS_THROUGH(d, c, 8);
-
- BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
- BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
-
- PASS_THROUGH(d, c, 11);
- PASS_THROUGH(d, c, 12);
- PASS_THROUGH(d, c, 15);
-
- ADD_S32(d, 16, 19, c, 16);
- ADD_S32(d, 17, 18, c, 17);
- SUB_S32(d, 17, 18, c, 18);
- SUB_S32(d, 16, 19, c, 19);
- SUB_S32(d, 23, 20, c, 20);
- SUB_S32(d, 22, 21, c, 21);
- ADD_S32(d, 22, 21, c, 22);
- ADD_S32(d, 23, 20, c, 23);
- ADD_S32(d, 24, 27, c, 24);
- ADD_S32(d, 25, 26, c, 25);
- SUB_S32(d, 25, 26, c, 26);
- SUB_S32(d, 24, 27, c, 27);
- SUB_S32(d, 31, 28, c, 28);
- SUB_S32(d, 30, 29, c, 29);
- ADD_S32(d, 30, 29, c, 30);
- ADD_S32(d, 31, 28, c, 31);
-
- // Stage 6.
- PASS_THROUGH(c, d, 0);
- PASS_THROUGH(c, d, 1);
- PASS_THROUGH(c, d, 2);
- PASS_THROUGH(c, d, 3);
-
- BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
- BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
-
- ADD_S32(c, 8, 9, d, 8);
- SUB_S32(c, 8, 9, d, 9);
- SUB_S32(c, 11, 10, d, 10);
- ADD_S32(c, 11, 10, d, 11);
- ADD_S32(c, 12, 13, d, 12);
- SUB_S32(c, 12, 13, d, 13);
- SUB_S32(c, 15, 14, d, 14);
- ADD_S32(c, 15, 14, d, 15);
-
- PASS_THROUGH(c, d, 16);
- PASS_THROUGH(c, d, 19);
- PASS_THROUGH(c, d, 20);
- PASS_THROUGH(c, d, 23);
- PASS_THROUGH(c, d, 24);
- PASS_THROUGH(c, d, 27);
- PASS_THROUGH(c, d, 28);
- PASS_THROUGH(c, d, 31);
-
- BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
- BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
- BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
- BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
-
- // Stage 7.
- PASS_THROUGH(d, c, 0);
- PASS_THROUGH(d, c, 1);
- PASS_THROUGH(d, c, 2);
- PASS_THROUGH(d, c, 3);
- PASS_THROUGH(d, c, 4);
- PASS_THROUGH(d, c, 5);
- PASS_THROUGH(d, c, 6);
- PASS_THROUGH(d, c, 7);
-
- BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
- BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
- BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
- BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
-
- ADD_S32(d, 16, 17, c, 16);
- SUB_S32(d, 16, 17, c, 17);
- SUB_S32(d, 19, 18, c, 18);
- ADD_S32(d, 19, 18, c, 19);
- ADD_S32(d, 20, 21, c, 20);
- SUB_S32(d, 20, 21, c, 21);
- SUB_S32(d, 23, 22, c, 22);
- ADD_S32(d, 23, 22, c, 23);
- ADD_S32(d, 24, 25, c, 24);
- SUB_S32(d, 24, 25, c, 25);
- SUB_S32(d, 27, 26, c, 26);
- ADD_S32(d, 27, 26, c, 27);
- ADD_S32(d, 28, 29, c, 28);
- SUB_S32(d, 28, 29, c, 29);
- SUB_S32(d, 31, 30, c, 30);
- ADD_S32(d, 31, 30, c, 31);
-
- // Final stage.
- // Roll rounding into this function so we can pass back int16x8.
-
- out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
- out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
-
- out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
- out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
- out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
- out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
- out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
-
- out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
- out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
- out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
- out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
-
- out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
- out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
- out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
- out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
- out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
-
- BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
- out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
- out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
-
- BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
- out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
- out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
-
- BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
- out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
- out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
-
- BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
- out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
- out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
-
- BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
- out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
- out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
-
- BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
- out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
- out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
-
- BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
- out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
- out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
-
- BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
- out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
- out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
-}
-
-static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
-
- // Stage 1. Done as part of the load for the first pass.
- a[0] = vaddq_s16(in[0], in[31]);
- a[1] = vaddq_s16(in[1], in[30]);
- a[2] = vaddq_s16(in[2], in[29]);
- a[3] = vaddq_s16(in[3], in[28]);
- a[4] = vaddq_s16(in[4], in[27]);
- a[5] = vaddq_s16(in[5], in[26]);
- a[6] = vaddq_s16(in[6], in[25]);
- a[7] = vaddq_s16(in[7], in[24]);
- a[8] = vaddq_s16(in[8], in[23]);
- a[9] = vaddq_s16(in[9], in[22]);
- a[10] = vaddq_s16(in[10], in[21]);
- a[11] = vaddq_s16(in[11], in[20]);
- a[12] = vaddq_s16(in[12], in[19]);
- a[13] = vaddq_s16(in[13], in[18]);
- a[14] = vaddq_s16(in[14], in[17]);
- a[15] = vaddq_s16(in[15], in[16]);
- a[16] = vsubq_s16(in[15], in[16]);
- a[17] = vsubq_s16(in[14], in[17]);
- a[18] = vsubq_s16(in[13], in[18]);
- a[19] = vsubq_s16(in[12], in[19]);
- a[20] = vsubq_s16(in[11], in[20]);
- a[21] = vsubq_s16(in[10], in[21]);
- a[22] = vsubq_s16(in[9], in[22]);
- a[23] = vsubq_s16(in[8], in[23]);
- a[24] = vsubq_s16(in[7], in[24]);
- a[25] = vsubq_s16(in[6], in[25]);
- a[26] = vsubq_s16(in[5], in[26]);
- a[27] = vsubq_s16(in[4], in[27]);
- a[28] = vsubq_s16(in[3], in[28]);
- a[29] = vsubq_s16(in[2], in[29]);
- a[30] = vsubq_s16(in[1], in[30]);
- a[31] = vsubq_s16(in[0], in[31]);
-
- // Stage 2.
- // For the "rd" version, all the values are rounded down after stage 2 to keep
- // the values in 16 bits.
- b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
- b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
- b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
- b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
- b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
- b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
- b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
- b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
-
- b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
- b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
- b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
- b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
- b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
- b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
- b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
- b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
-
- b[16] = add_round_shift_s16(a[16]);
- b[17] = add_round_shift_s16(a[17]);
- b[18] = add_round_shift_s16(a[18]);
- b[19] = add_round_shift_s16(a[19]);
-
- butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
- butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
- butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
- butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
- b[20] = add_round_shift_s16(b[20]);
- b[21] = add_round_shift_s16(b[21]);
- b[22] = add_round_shift_s16(b[22]);
- b[23] = add_round_shift_s16(b[23]);
- b[24] = add_round_shift_s16(b[24]);
- b[25] = add_round_shift_s16(b[25]);
- b[26] = add_round_shift_s16(b[26]);
- b[27] = add_round_shift_s16(b[27]);
-
- b[28] = add_round_shift_s16(a[28]);
- b[29] = add_round_shift_s16(a[29]);
- b[30] = add_round_shift_s16(a[30]);
- b[31] = add_round_shift_s16(a[31]);
-
- // Stage 3.
- a[0] = vaddq_s16(b[0], b[7]);
- a[1] = vaddq_s16(b[1], b[6]);
- a[2] = vaddq_s16(b[2], b[5]);
- a[3] = vaddq_s16(b[3], b[4]);
-
- a[4] = vsubq_s16(b[3], b[4]);
- a[5] = vsubq_s16(b[2], b[5]);
- a[6] = vsubq_s16(b[1], b[6]);
- a[7] = vsubq_s16(b[0], b[7]);
-
- a[8] = b[8];
- a[9] = b[9];
-
- butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
- butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);
-
- a[14] = b[14];
- a[15] = b[15];
-
- a[16] = vaddq_s16(b[16], b[23]);
- a[17] = vaddq_s16(b[17], b[22]);
- a[18] = vaddq_s16(b[18], b[21]);
- a[19] = vaddq_s16(b[19], b[20]);
-
- a[20] = vsubq_s16(b[19], b[20]);
- a[21] = vsubq_s16(b[18], b[21]);
- a[22] = vsubq_s16(b[17], b[22]);
- a[23] = vsubq_s16(b[16], b[23]);
-
- a[24] = vsubq_s16(b[31], b[24]);
- a[25] = vsubq_s16(b[30], b[25]);
- a[26] = vsubq_s16(b[29], b[26]);
- a[27] = vsubq_s16(b[28], b[27]);
-
- a[28] = vaddq_s16(b[28], b[27]);
- a[29] = vaddq_s16(b[29], b[26]);
- a[30] = vaddq_s16(b[30], b[25]);
- a[31] = vaddq_s16(b[31], b[24]);
-
- // Stage 4.
- b[0] = vaddq_s16(a[0], a[3]);
- b[1] = vaddq_s16(a[1], a[2]);
- b[2] = vsubq_s16(a[1], a[2]);
- b[3] = vsubq_s16(a[0], a[3]);
-
- b[4] = a[4];
-
- butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);
-
- b[7] = a[7];
-
- b[8] = vaddq_s16(a[8], a[11]);
- b[9] = vaddq_s16(a[9], a[10]);
- b[10] = vsubq_s16(a[9], a[10]);
- b[11] = vsubq_s16(a[8], a[11]);
- b[12] = vsubq_s16(a[15], a[12]);
- b[13] = vsubq_s16(a[14], a[13]);
- b[14] = vaddq_s16(a[14], a[13]);
- b[15] = vaddq_s16(a[15], a[12]);
-
- b[16] = a[16];
- b[17] = a[17];
-
- butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
- butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
- butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
- butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);
-
- b[22] = a[22];
- b[23] = a[23];
- b[24] = a[24];
- b[25] = a[25];
-
- b[30] = a[30];
- b[31] = a[31];
-
- // Stage 5.
- butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
- butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);
-
- a[4] = vaddq_s16(b[4], b[5]);
- a[5] = vsubq_s16(b[4], b[5]);
- a[6] = vsubq_s16(b[7], b[6]);
- a[7] = vaddq_s16(b[7], b[6]);
-
- a[8] = b[8];
-
- butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
- butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);
-
- a[11] = b[11];
- a[12] = b[12];
-
- a[15] = b[15];
-
- a[16] = vaddq_s16(b[19], b[16]);
- a[17] = vaddq_s16(b[18], b[17]);
- a[18] = vsubq_s16(b[17], b[18]);
- a[19] = vsubq_s16(b[16], b[19]);
- a[20] = vsubq_s16(b[23], b[20]);
- a[21] = vsubq_s16(b[22], b[21]);
- a[22] = vaddq_s16(b[21], b[22]);
- a[23] = vaddq_s16(b[20], b[23]);
- a[24] = vaddq_s16(b[27], b[24]);
- a[25] = vaddq_s16(b[26], b[25]);
- a[26] = vsubq_s16(b[25], b[26]);
- a[27] = vsubq_s16(b[24], b[27]);
- a[28] = vsubq_s16(b[31], b[28]);
- a[29] = vsubq_s16(b[30], b[29]);
- a[30] = vaddq_s16(b[29], b[30]);
- a[31] = vaddq_s16(b[28], b[31]);
-
- // Stage 6.
- b[0] = a[0];
- b[1] = a[1];
- b[2] = a[2];
- b[3] = a[3];
-
- butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
- butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);
-
- b[8] = vaddq_s16(a[8], a[9]);
- b[9] = vsubq_s16(a[8], a[9]);
- b[10] = vsubq_s16(a[11], a[10]);
- b[11] = vaddq_s16(a[11], a[10]);
- b[12] = vaddq_s16(a[12], a[13]);
- b[13] = vsubq_s16(a[12], a[13]);
- b[14] = vsubq_s16(a[15], a[14]);
- b[15] = vaddq_s16(a[15], a[14]);
-
- b[16] = a[16];
- b[19] = a[19];
- b[20] = a[20];
- b[23] = a[23];
- b[24] = a[24];
- b[27] = a[27];
- b[28] = a[28];
- b[31] = a[31];
-
- butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
- butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);
-
- butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
- butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);
-
- // Stage 7.
- a[0] = b[0];
- a[1] = b[1];
- a[2] = b[2];
- a[3] = b[3];
- a[4] = b[4];
- a[5] = b[5];
- a[6] = b[6];
- a[7] = b[7];
-
- butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
- butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
- butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
- butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);
-
- a[16] = vaddq_s16(b[16], b[17]);
- a[17] = vsubq_s16(b[16], b[17]);
- a[18] = vsubq_s16(b[19], b[18]);
- a[19] = vaddq_s16(b[19], b[18]);
- a[20] = vaddq_s16(b[20], b[21]);
- a[21] = vsubq_s16(b[20], b[21]);
- a[22] = vsubq_s16(b[23], b[22]);
- a[23] = vaddq_s16(b[23], b[22]);
- a[24] = vaddq_s16(b[24], b[25]);
- a[25] = vsubq_s16(b[24], b[25]);
- a[26] = vsubq_s16(b[27], b[26]);
- a[27] = vaddq_s16(b[27], b[26]);
- a[28] = vaddq_s16(b[28], b[29]);
- a[29] = vsubq_s16(b[28], b[29]);
- a[30] = vsubq_s16(b[31], b[30]);
- a[31] = vaddq_s16(b[31], b[30]);
-
- // Final stage.
- out[0] = a[0];
- out[16] = a[1];
- out[8] = a[2];
- out[24] = a[3];
- out[4] = a[4];
- out[20] = a[5];
- out[12] = a[6];
- out[28] = a[7];
- out[2] = a[8];
- out[18] = a[9];
- out[10] = a[10];
- out[26] = a[11];
- out[6] = a[12];
- out[22] = a[13];
- out[14] = a[14];
- out[30] = a[15];
-
- butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
- butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
- &out[15]);
- butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
- butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
- butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
- butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
- &out[11]);
- butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
- &out[19]);
- butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
-}
-
-#undef PASS_THROUGH
-#undef ADD_S16_S32
-#undef SUB_S16_S32
-#undef ADDW_S16_S32
-#undef SUBW_S16_S32
-#undef ADD_S32
-#undef SUB_S32
-#undef BUTTERFLY_ONE_S16_S32
-#undef BUTTERFLY_ONE_S32
-#undef BUTTERFLY_TWO_S32
-
void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp0[32];
int16x8_t temp1[32];
@@ -1159,17 +43,21 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp5[32];
// Process in 8x32 columns.
- load(input, stride, temp0);
- dct_body_first_pass(temp0, temp1);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
- load(input + 8, stride, temp0);
- dct_body_first_pass(temp0, temp2);
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
- load(input + 16, stride, temp0);
- dct_body_first_pass(temp0, temp3);
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
- load(input + 24, stride, temp0);
- dct_body_first_pass(temp0, temp4);
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
transpose_s16_8x8_new(&temp1[0], &temp0[0]);
@@ -1254,17 +142,21 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
int16x8_t temp5[32];
// Process in 8x32 columns.
- load(input, stride, temp0);
- dct_body_first_pass(temp0, temp1);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
- load(input + 8, stride, temp0);
- dct_body_first_pass(temp0, temp2);
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
- load(input + 16, stride, temp0);
- dct_body_first_pass(temp0, temp3);
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
- load(input + 24, stride, temp0);
- dct_body_first_pass(temp0, temp4);
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
transpose_s16_8x8_new(&temp1[0], &temp0[0]);
diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h
new file mode 100644
index 000000000..dd647918b
--- /dev/null
+++ b/vpx_dsp/arm/fdct32x32_neon.h
@@ -0,0 +1,1105 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load & cross the first 8 and last 8, then the middle
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+ b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+ b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+ b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+ b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+ b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+ b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
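As a point of reference for the cross above: per output column, load_cross() produces b[i] = in[i] + in[31 - i] for i < 16 and b[31 - i] = in[i] - in[31 - i]. A scalar sketch of that mapping follows; stage1_cross_scalar is an illustrative helper, not part of libvpx.

#include <stdint.h>

// Scalar model of the stage-1 cross performed by load_cross() for one column.
static void stage1_cross_scalar(const int16_t *in, int stride, int col,
                                int16_t b[32]) {
  int i;
  for (i = 0; i < 16; ++i) {
    const int lo = in[i * stride + col];
    const int hi = in[(31 - i) * stride + col];
    b[i] = (int16_t)(lo + hi);
    b[31 - i] = (int16_t)(lo - hi);
  }
}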
+
+#define STORE_S16(src, index, dest) \
+ do { \
+ store_s16q_to_tran_low(dest, src[index]); \
+ dest += 8; \
+ } while (0)
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+ STORE_S16(b, 0, a);
+ STORE_S16(b, 8, a);
+ STORE_S16(b, 16, a);
+ STORE_S16(b, 24, a);
+ STORE_S16(b, 1, a);
+ STORE_S16(b, 9, a);
+ STORE_S16(b, 17, a);
+ STORE_S16(b, 25, a);
+ STORE_S16(b, 2, a);
+ STORE_S16(b, 10, a);
+ STORE_S16(b, 18, a);
+ STORE_S16(b, 26, a);
+ STORE_S16(b, 3, a);
+ STORE_S16(b, 11, a);
+ STORE_S16(b, 19, a);
+ STORE_S16(b, 27, a);
+ STORE_S16(b, 4, a);
+ STORE_S16(b, 12, a);
+ STORE_S16(b, 20, a);
+ STORE_S16(b, 28, a);
+ STORE_S16(b, 5, a);
+ STORE_S16(b, 13, a);
+ STORE_S16(b, 21, a);
+ STORE_S16(b, 29, a);
+ STORE_S16(b, 6, a);
+ STORE_S16(b, 14, a);
+ STORE_S16(b, 22, a);
+ STORE_S16(b, 30, a);
+ STORE_S16(b, 7, a);
+ STORE_S16(b, 15, a);
+ STORE_S16(b, 23, a);
+ STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
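The ordering used by store() above amounts to a simple index mapping: vector b[i] lands at output row (i & 7), columns 8 * (i >> 3) through 8 * (i >> 3) + 7, with the output stride fixed at 32 as noted in the comment. store_scalar_model below is a sketch for illustration only, assuming b holds one 8-row slice of the 32-column output.

#include <stdint.h>

// Scalar model of the store() ordering (output stride fixed at 32).
static void store_scalar_model(int16_t *a, const int16_t b[32][8]) {
  int i, j;
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 8; ++j) {
      a[32 * (i & 7) + 8 * (i >> 3) + j] = b[i][j];
    }
  }
}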
+
+static INLINE void scale_input(const int16x8_t *in /*32*/,
+ int16x8_t *out /*32*/) {
+ out[0] = vshlq_n_s16(in[0], 2);
+ out[1] = vshlq_n_s16(in[1], 2);
+ out[2] = vshlq_n_s16(in[2], 2);
+ out[3] = vshlq_n_s16(in[3], 2);
+ out[4] = vshlq_n_s16(in[4], 2);
+ out[5] = vshlq_n_s16(in[5], 2);
+ out[6] = vshlq_n_s16(in[6], 2);
+ out[7] = vshlq_n_s16(in[7], 2);
+
+ out[8] = vshlq_n_s16(in[8], 2);
+ out[9] = vshlq_n_s16(in[9], 2);
+ out[10] = vshlq_n_s16(in[10], 2);
+ out[11] = vshlq_n_s16(in[11], 2);
+ out[12] = vshlq_n_s16(in[12], 2);
+ out[13] = vshlq_n_s16(in[13], 2);
+ out[14] = vshlq_n_s16(in[14], 2);
+ out[15] = vshlq_n_s16(in[15], 2);
+
+ out[16] = vshlq_n_s16(in[16], 2);
+ out[17] = vshlq_n_s16(in[17], 2);
+ out[18] = vshlq_n_s16(in[18], 2);
+ out[19] = vshlq_n_s16(in[19], 2);
+ out[20] = vshlq_n_s16(in[20], 2);
+ out[21] = vshlq_n_s16(in[21], 2);
+ out[22] = vshlq_n_s16(in[22], 2);
+ out[23] = vshlq_n_s16(in[23], 2);
+
+ out[24] = vshlq_n_s16(in[24], 2);
+ out[25] = vshlq_n_s16(in[25], 2);
+ out[26] = vshlq_n_s16(in[26], 2);
+ out[27] = vshlq_n_s16(in[27], 2);
+ out[28] = vshlq_n_s16(in[28], 2);
+ out[29] = vshlq_n_s16(in[29], 2);
+ out[30] = vshlq_n_s16(in[30], 2);
+ out[31] = vshlq_n_s16(in[31], 2);
+}
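scale_input() applies the forward transform's input pre-scale by 4 (a left shift by 2), which appears to mirror the * 4 input scaling used by the C reference's first pass. A one-line scalar equivalent, for illustration only, is:

#include <stdint.h>

// Scalar equivalent of the per-element pre-scale in scale_input().
static int16_t scale_input_scalar(int16_t x) { return (int16_t)(x * 4); }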
+
+static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross: butterfly the first 16 values and the middle 8 of the second half.
+ a[0] = vaddq_s16(in[0], in[15]);
+ a[1] = vaddq_s16(in[1], in[14]);
+ a[2] = vaddq_s16(in[2], in[13]);
+ a[3] = vaddq_s16(in[3], in[12]);
+ a[4] = vaddq_s16(in[4], in[11]);
+ a[5] = vaddq_s16(in[5], in[10]);
+ a[6] = vaddq_s16(in[6], in[9]);
+ a[7] = vaddq_s16(in[7], in[8]);
+
+ a[8] = vsubq_s16(in[7], in[8]);
+ a[9] = vsubq_s16(in[6], in[9]);
+ a[10] = vsubq_s16(in[5], in[10]);
+ a[11] = vsubq_s16(in[4], in[11]);
+ a[12] = vsubq_s16(in[3], in[12]);
+ a[13] = vsubq_s16(in[2], in[13]);
+ a[14] = vsubq_s16(in[1], in[14]);
+ a[15] = vsubq_s16(in[0], in[15]);
+
+ a[16] = in[16];
+ a[17] = in[17];
+ a[18] = in[18];
+ a[19] = in[19];
+
+ butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
+ &a[20]);
+ butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
+ &a[21]);
+ butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
+ &a[22]);
+ butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
+ &a[23]);
+
+ a[28] = in[28];
+ a[29] = in[29];
+ a[30] = in[30];
+ a[31] = in[31];
+
+ // Stage 3.
+ b[0] = vaddq_s16(a[0], a[7]);
+ b[1] = vaddq_s16(a[1], a[6]);
+ b[2] = vaddq_s16(a[2], a[5]);
+ b[3] = vaddq_s16(a[3], a[4]);
+
+ b[4] = vsubq_s16(a[3], a[4]);
+ b[5] = vsubq_s16(a[2], a[5]);
+ b[6] = vsubq_s16(a[1], a[6]);
+ b[7] = vsubq_s16(a[0], a[7]);
+
+ b[8] = a[8];
+ b[9] = a[9];
+
+ butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+ butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+ b[14] = a[14];
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(in[16], a[23]);
+ b[17] = vaddq_s16(in[17], a[22]);
+ b[18] = vaddq_s16(in[18], a[21]);
+ b[19] = vaddq_s16(in[19], a[20]);
+
+ b[20] = vsubq_s16(in[19], a[20]);
+ b[21] = vsubq_s16(in[18], a[21]);
+ b[22] = vsubq_s16(in[17], a[22]);
+ b[23] = vsubq_s16(in[16], a[23]);
+
+ b[24] = vsubq_s16(in[31], a[24]);
+ b[25] = vsubq_s16(in[30], a[25]);
+ b[26] = vsubq_s16(in[29], a[26]);
+ b[27] = vsubq_s16(in[28], a[27]);
+
+ b[28] = vaddq_s16(in[28], a[27]);
+ b[29] = vaddq_s16(in[29], a[26]);
+ b[30] = vaddq_s16(in[30], a[25]);
+ b[31] = vaddq_s16(in[31], a[24]);
+
+ // Stage 4.
+ a[0] = vaddq_s16(b[0], b[3]);
+ a[1] = vaddq_s16(b[1], b[2]);
+ a[2] = vsubq_s16(b[1], b[2]);
+ a[3] = vsubq_s16(b[0], b[3]);
+
+ a[4] = b[4];
+
+ butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+ a[7] = b[7];
+
+ a[8] = vaddq_s16(b[8], b[11]);
+ a[9] = vaddq_s16(b[9], b[10]);
+ a[10] = vsubq_s16(b[9], b[10]);
+ a[11] = vsubq_s16(b[8], b[11]);
+ a[12] = vsubq_s16(b[15], b[12]);
+ a[13] = vsubq_s16(b[14], b[13]);
+ a[14] = vaddq_s16(b[14], b[13]);
+ a[15] = vaddq_s16(b[15], b[12]);
+
+ a[16] = b[16];
+ a[17] = b[17];
+
+ butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
+ butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
+ butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
+ butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
+
+ a[22] = b[22];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[25] = b[25];
+
+ a[30] = b[30];
+ a[31] = b[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+ butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
+
+ b[4] = vaddq_s16(a[4], a[5]);
+ b[5] = vsubq_s16(a[4], a[5]);
+ b[6] = vsubq_s16(a[7], a[6]);
+ b[7] = vaddq_s16(a[7], a[6]);
+
+ b[8] = a[8];
+
+ butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
+ butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
+
+ b[11] = a[11];
+ b[12] = a[12];
+
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(a[19], a[16]);
+ b[17] = vaddq_s16(a[18], a[17]);
+ b[18] = vsubq_s16(a[17], a[18]);
+ b[19] = vsubq_s16(a[16], a[19]);
+ b[20] = vsubq_s16(a[23], a[20]);
+ b[21] = vsubq_s16(a[22], a[21]);
+ b[22] = vaddq_s16(a[21], a[22]);
+ b[23] = vaddq_s16(a[20], a[23]);
+ b[24] = vaddq_s16(a[27], a[24]);
+ b[25] = vaddq_s16(a[26], a[25]);
+ b[26] = vsubq_s16(a[25], a[26]);
+ b[27] = vsubq_s16(a[24], a[27]);
+ b[28] = vsubq_s16(a[31], a[28]);
+ b[29] = vsubq_s16(a[30], a[29]);
+ b[30] = vaddq_s16(a[29], a[30]);
+ b[31] = vaddq_s16(a[28], a[31]);
+
+ // Stage 6.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+
+ butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
+ butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
+
+ a[8] = vaddq_s16(b[8], b[9]);
+ a[9] = vsubq_s16(b[8], b[9]);
+ a[10] = vsubq_s16(b[11], b[10]);
+ a[11] = vaddq_s16(b[11], b[10]);
+ a[12] = vaddq_s16(b[12], b[13]);
+ a[13] = vsubq_s16(b[12], b[13]);
+ a[14] = vsubq_s16(b[15], b[14]);
+ a[15] = vaddq_s16(b[15], b[14]);
+
+ a[16] = b[16];
+ a[19] = b[19];
+ a[20] = b[20];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[27] = b[27];
+ a[28] = b[28];
+ a[31] = b[31];
+
+ butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
+ butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
+ butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
+
+ // Stage 7.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+ b[4] = a[4];
+ b[5] = a[5];
+ b[6] = a[6];
+ b[7] = a[7];
+
+ butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
+ butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
+ butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
+ butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
+
+ b[16] = vaddq_s16(a[16], a[17]);
+ b[17] = vsubq_s16(a[16], a[17]);
+ b[18] = vsubq_s16(a[19], a[18]);
+ b[19] = vaddq_s16(a[19], a[18]);
+ b[20] = vaddq_s16(a[20], a[21]);
+ b[21] = vsubq_s16(a[20], a[21]);
+ b[22] = vsubq_s16(a[23], a[22]);
+ b[23] = vaddq_s16(a[23], a[22]);
+ b[24] = vaddq_s16(a[24], a[25]);
+ b[25] = vsubq_s16(a[24], a[25]);
+ b[26] = vsubq_s16(a[27], a[26]);
+ b[27] = vaddq_s16(a[27], a[26]);
+ b[28] = vaddq_s16(a[28], a[29]);
+ b[29] = vsubq_s16(a[28], a[29]);
+ b[30] = vsubq_s16(a[31], a[30]);
+ b[31] = vaddq_s16(a[31], a[30]);
+
+ // Final stage.
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[0] = sub_round_shift_s16(b[0]);
+ out[16] = sub_round_shift_s16(b[1]);
+ out[8] = sub_round_shift_s16(b[2]);
+ out[24] = sub_round_shift_s16(b[3]);
+ out[4] = sub_round_shift_s16(b[4]);
+ out[20] = sub_round_shift_s16(b[5]);
+ out[12] = sub_round_shift_s16(b[6]);
+ out[28] = sub_round_shift_s16(b[7]);
+ out[2] = sub_round_shift_s16(b[8]);
+ out[18] = sub_round_shift_s16(b[9]);
+ out[10] = sub_round_shift_s16(b[10]);
+ out[26] = sub_round_shift_s16(b[11]);
+ out[6] = sub_round_shift_s16(b[12]);
+ out[22] = sub_round_shift_s16(b[13]);
+ out[14] = sub_round_shift_s16(b[14]);
+ out[30] = sub_round_shift_s16(b[15]);
+
+ butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
+ out[1] = sub_round_shift_s16(a[1]);
+ out[31] = sub_round_shift_s16(a[31]);
+
+ butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
+ out[17] = sub_round_shift_s16(a[17]);
+ out[15] = sub_round_shift_s16(a[15]);
+
+ butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
+ out[9] = sub_round_shift_s16(a[9]);
+ out[23] = sub_round_shift_s16(a[23]);
+
+ butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
+ out[25] = sub_round_shift_s16(a[25]);
+ out[7] = sub_round_shift_s16(a[7]);
+
+ butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
+ out[5] = sub_round_shift_s16(a[5]);
+ out[27] = sub_round_shift_s16(a[27]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
+ out[21] = sub_round_shift_s16(a[21]);
+ out[11] = sub_round_shift_s16(a[11]);
+
+ butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
+ out[13] = sub_round_shift_s16(a[13]);
+ out[19] = sub_round_shift_s16(a[19]);
+
+ butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
+ out[29] = sub_round_shift_s16(a[29]);
+ out[3] = sub_round_shift_s16(a[3]);
+}
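The final stage above folds in the first pass's partial rounding shift, quoted in the comment as (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2. A scalar sketch of that expression follows; sub_round_shift_scalar is an illustrative name, not the library helper itself.

#include <stdint.h>

// Scalar form of the partial rounding shift applied at the end of pass 1:
// (t + 1 + (t > 0)) >> 2.
static int16_t sub_round_shift_scalar(int16_t t) {
  return (int16_t)((t + 1 + (t > 0)) >> 2);
}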
+
+#define PASS_THROUGH(src, dst, element) \
+ do { \
+ dst##_lo[element] = src##_lo[element]; \
+ dst##_hi[element] = src##_hi[element]; \
+ } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
+ do { \
+ c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
+ c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+ } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+ do { \
+ temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
+ temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
+ c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
+ c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
+ } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
+ add_index, sub_index) \
+ do { \
+ butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+ &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
+ sub_index) \
+ do { \
+ butterfly_one_coeff_s32_fast( \
+ a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \
+ a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
+ right_constant, b, add_index, sub_index) \
+ do { \
+ butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+ a##_lo[right_index], a##_hi[right_index], \
+ left_constant, right_constant, &b##_lo[add_index], \
+ &b##_hi[add_index], &b##_lo[sub_index], \
+ &b##_hi[sub_index]); \
+ } while (0)
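These macros carry each 8-lane value as a low/high pair of int32x4_t so the second pass can run in 32 bits; as noted inside dct_body_second_pass below, stage 3 can overflow int16_t for extreme inputs. For illustration (add_s16_s32_example is not part of the header), ADD_S16_S32(b, 0, 7, c, 0) expands to roughly the following:

#include <arm_neon.h>

// Widen both int16x8_t operands and add them as two int32x4_t halves.
static void add_s16_s32_example(const int16x8_t b[32], int32x4_t c_lo[32],
                                int32x4_t c_hi[32]) {
  c_lo[0] = vaddl_s16(vget_low_s16(b[0]), vget_low_s16(b[7]));
  c_hi[0] = vaddl_s16(vget_high_s16(b[0]), vget_high_s16(b[7]));
}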
+
+static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+ int32x4_t c_lo[32];
+ int32x4_t c_hi[32];
+ int32x4_t d_lo[32];
+ int32x4_t d_hi[32];
+
+ // Stage 1. Done as part of the load for the first pass.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+ b[18] = a[18];
+ b[19] = a[19];
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+ b[28] = a[28];
+ b[29] = a[29];
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 3. With extreme input values this calculation overflows int16_t.
+ // The terms feeding b[0] have already been summed several times and, in
+ // testing, have been shown to overflow starting here.

+ ADD_S16_S32(b, 0, 7, c, 0);
+ ADD_S16_S32(b, 1, 6, c, 1);
+ ADD_S16_S32(b, 2, 5, c, 2);
+ ADD_S16_S32(b, 3, 4, c, 3);
+ SUB_S16_S32(b, 3, 4, c, 4);
+ SUB_S16_S32(b, 2, 5, c, 5);
+ SUB_S16_S32(b, 1, 6, c, 6);
+ SUB_S16_S32(b, 0, 7, c, 7);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+ BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ ADD_S16_S32(b, 16, 23, c, 16);
+ ADD_S16_S32(b, 17, 22, c, 17);
+ ADD_S16_S32(b, 18, 21, c, 18);
+ ADD_S16_S32(b, 19, 20, c, 19);
+ SUB_S16_S32(b, 19, 20, c, 20);
+ SUB_S16_S32(b, 18, 21, c, 21);
+ SUB_S16_S32(b, 17, 22, c, 22);
+ SUB_S16_S32(b, 16, 23, c, 23);
+ SUB_S16_S32(b, 31, 24, c, 24);
+ SUB_S16_S32(b, 30, 25, c, 25);
+ SUB_S16_S32(b, 29, 26, c, 26);
+ SUB_S16_S32(b, 28, 27, c, 27);
+ ADD_S16_S32(b, 28, 27, c, 28);
+ ADD_S16_S32(b, 29, 26, c, 29);
+ ADD_S16_S32(b, 30, 25, c, 30);
+ ADD_S16_S32(b, 31, 24, c, 31);
+
+ // Stage 4.
+ ADD_S32(c, 0, 3, d, 0);
+ ADD_S32(c, 1, 2, d, 1);
+ SUB_S32(c, 1, 2, d, 2);
+ SUB_S32(c, 0, 3, d, 3);
+
+ PASS_THROUGH(c, d, 4);
+
+ BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+ PASS_THROUGH(c, d, 7);
+
+ ADDW_S16_S32(c, 11, a, 8, d, 8);
+ ADDW_S16_S32(c, 10, a, 9, d, 9);
+ SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+ SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+ SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+ SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+ ADDW_S16_S32(c, 13, b, 14, d, 14);
+ ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 17);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
+
+ PASS_THROUGH(c, d, 22);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 25);
+
+ PASS_THROUGH(c, d, 30);
+ PASS_THROUGH(c, d, 31);
+
+ // Stage 5.
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+ BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
+
+ ADD_S32(d, 4, 5, c, 4);
+ SUB_S32(d, 4, 5, c, 5);
+ SUB_S32(d, 7, 6, c, 6);
+ ADD_S32(d, 7, 6, c, 7);
+
+ PASS_THROUGH(d, c, 8);
+
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
+
+ PASS_THROUGH(d, c, 11);
+ PASS_THROUGH(d, c, 12);
+ PASS_THROUGH(d, c, 15);
+
+ ADD_S32(d, 16, 19, c, 16);
+ ADD_S32(d, 17, 18, c, 17);
+ SUB_S32(d, 17, 18, c, 18);
+ SUB_S32(d, 16, 19, c, 19);
+ SUB_S32(d, 23, 20, c, 20);
+ SUB_S32(d, 22, 21, c, 21);
+ ADD_S32(d, 22, 21, c, 22);
+ ADD_S32(d, 23, 20, c, 23);
+ ADD_S32(d, 24, 27, c, 24);
+ ADD_S32(d, 25, 26, c, 25);
+ SUB_S32(d, 25, 26, c, 26);
+ SUB_S32(d, 24, 27, c, 27);
+ SUB_S32(d, 31, 28, c, 28);
+ SUB_S32(d, 30, 29, c, 29);
+ ADD_S32(d, 30, 29, c, 30);
+ ADD_S32(d, 31, 28, c, 31);
+
+ // Stage 6.
+ PASS_THROUGH(c, d, 0);
+ PASS_THROUGH(c, d, 1);
+ PASS_THROUGH(c, d, 2);
+ PASS_THROUGH(c, d, 3);
+
+ BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
+ BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
+
+ ADD_S32(c, 8, 9, d, 8);
+ SUB_S32(c, 8, 9, d, 9);
+ SUB_S32(c, 11, 10, d, 10);
+ ADD_S32(c, 11, 10, d, 11);
+ ADD_S32(c, 12, 13, d, 12);
+ SUB_S32(c, 12, 13, d, 13);
+ SUB_S32(c, 15, 14, d, 14);
+ ADD_S32(c, 15, 14, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 19);
+ PASS_THROUGH(c, d, 20);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 27);
+ PASS_THROUGH(c, d, 28);
+ PASS_THROUGH(c, d, 31);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
+
+ // Stage 7.
+ PASS_THROUGH(d, c, 0);
+ PASS_THROUGH(d, c, 1);
+ PASS_THROUGH(d, c, 2);
+ PASS_THROUGH(d, c, 3);
+ PASS_THROUGH(d, c, 4);
+ PASS_THROUGH(d, c, 5);
+ PASS_THROUGH(d, c, 6);
+ PASS_THROUGH(d, c, 7);
+
+ BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
+ BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
+
+ ADD_S32(d, 16, 17, c, 16);
+ SUB_S32(d, 16, 17, c, 17);
+ SUB_S32(d, 19, 18, c, 18);
+ ADD_S32(d, 19, 18, c, 19);
+ ADD_S32(d, 20, 21, c, 20);
+ SUB_S32(d, 20, 21, c, 21);
+ SUB_S32(d, 23, 22, c, 22);
+ ADD_S32(d, 23, 22, c, 23);
+ ADD_S32(d, 24, 25, c, 24);
+ SUB_S32(d, 24, 25, c, 25);
+ SUB_S32(d, 27, 26, c, 26);
+ ADD_S32(d, 27, 26, c, 27);
+ ADD_S32(d, 28, 29, c, 28);
+ SUB_S32(d, 28, 29, c, 29);
+ SUB_S32(d, 31, 30, c, 30);
+ ADD_S32(d, 31, 30, c, 31);
+
+ // Final stage.
+ // Roll rounding into this function so we can pass back int16x8.
+
+ out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
+ out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
+
+ out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
+ out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
+ out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
+ out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
+ out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
+
+ out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
+ out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
+ out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
+ out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
+
+ out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
+ out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
+ out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
+ out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
+ out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
+ out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
+ out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
+ out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
+ out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
+ out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
+ out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
+
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
+ out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
+ out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
+
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
+ out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
+ out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
+
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
+ out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
+ out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
+
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
+ out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
+ out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
+
+ BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
+ out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
+ out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
+}
+
+static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1. Done as part of the load for the first pass.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ // For the "rd" version, all the values are rounded down after stage 2 to keep
+ // the values in 16 bits.
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+ b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+ b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+ b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+ b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+ b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+ b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+ b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+ b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+ b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+ b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+ b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+ b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+ b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+ b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+ b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+ b[16] = add_round_shift_s16(a[16]);
+ b[17] = add_round_shift_s16(a[17]);
+ b[18] = add_round_shift_s16(a[18]);
+ b[19] = add_round_shift_s16(a[19]);
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+ b[20] = add_round_shift_s16(b[20]);
+ b[21] = add_round_shift_s16(b[21]);
+ b[22] = add_round_shift_s16(b[22]);
+ b[23] = add_round_shift_s16(b[23]);
+ b[24] = add_round_shift_s16(b[24]);
+ b[25] = add_round_shift_s16(b[25]);
+ b[26] = add_round_shift_s16(b[26]);
+ b[27] = add_round_shift_s16(b[27]);
+
+ b[28] = add_round_shift_s16(a[28]);
+ b[29] = add_round_shift_s16(a[29]);
+ b[30] = add_round_shift_s16(a[30]);
+ b[31] = add_round_shift_s16(a[31]);
+
+ // Stage 3.
+ a[0] = vaddq_s16(b[0], b[7]);
+ a[1] = vaddq_s16(b[1], b[6]);
+ a[2] = vaddq_s16(b[2], b[5]);
+ a[3] = vaddq_s16(b[3], b[4]);
+
+ a[4] = vsubq_s16(b[3], b[4]);
+ a[5] = vsubq_s16(b[2], b[5]);
+ a[6] = vsubq_s16(b[1], b[6]);
+ a[7] = vsubq_s16(b[0], b[7]);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+ butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[16], b[23]);
+ a[17] = vaddq_s16(b[17], b[22]);
+ a[18] = vaddq_s16(b[18], b[21]);
+ a[19] = vaddq_s16(b[19], b[20]);
+
+ a[20] = vsubq_s16(b[19], b[20]);
+ a[21] = vsubq_s16(b[18], b[21]);
+ a[22] = vsubq_s16(b[17], b[22]);
+ a[23] = vsubq_s16(b[16], b[23]);
+
+ a[24] = vsubq_s16(b[31], b[24]);
+ a[25] = vsubq_s16(b[30], b[25]);
+ a[26] = vsubq_s16(b[29], b[26]);
+ a[27] = vsubq_s16(b[28], b[27]);
+
+ a[28] = vaddq_s16(b[28], b[27]);
+ a[29] = vaddq_s16(b[29], b[26]);
+ a[30] = vaddq_s16(b[30], b[25]);
+ a[31] = vaddq_s16(b[31], b[24]);
+
+ // Stage 4.
+ b[0] = vaddq_s16(a[0], a[3]);
+ b[1] = vaddq_s16(a[1], a[2]);
+ b[2] = vsubq_s16(a[1], a[2]);
+ b[3] = vsubq_s16(a[0], a[3]);
+
+ b[4] = a[4];
+
+ butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+ b[7] = a[7];
+
+ b[8] = vaddq_s16(a[8], a[11]);
+ b[9] = vaddq_s16(a[9], a[10]);
+ b[10] = vsubq_s16(a[9], a[10]);
+ b[11] = vsubq_s16(a[8], a[11]);
+ b[12] = vsubq_s16(a[15], a[12]);
+ b[13] = vsubq_s16(a[14], a[13]);
+ b[14] = vaddq_s16(a[14], a[13]);
+ b[15] = vaddq_s16(a[15], a[12]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+
+ butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
+ butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
+ butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
+ butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
+
+ b[22] = a[22];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[25] = a[25];
+
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+ butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
+
+ a[4] = vaddq_s16(b[4], b[5]);
+ a[5] = vsubq_s16(b[4], b[5]);
+ a[6] = vsubq_s16(b[7], b[6]);
+ a[7] = vaddq_s16(b[7], b[6]);
+
+ a[8] = b[8];
+
+ butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
+ butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
+
+ a[11] = b[11];
+ a[12] = b[12];
+
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[19], b[16]);
+ a[17] = vaddq_s16(b[18], b[17]);
+ a[18] = vsubq_s16(b[17], b[18]);
+ a[19] = vsubq_s16(b[16], b[19]);
+ a[20] = vsubq_s16(b[23], b[20]);
+ a[21] = vsubq_s16(b[22], b[21]);
+ a[22] = vaddq_s16(b[21], b[22]);
+ a[23] = vaddq_s16(b[20], b[23]);
+ a[24] = vaddq_s16(b[27], b[24]);
+ a[25] = vaddq_s16(b[26], b[25]);
+ a[26] = vsubq_s16(b[25], b[26]);
+ a[27] = vsubq_s16(b[24], b[27]);
+ a[28] = vsubq_s16(b[31], b[28]);
+ a[29] = vsubq_s16(b[30], b[29]);
+ a[30] = vaddq_s16(b[29], b[30]);
+ a[31] = vaddq_s16(b[28], b[31]);
+
+ // Stage 6.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+
+ butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
+ butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
+
+ b[8] = vaddq_s16(a[8], a[9]);
+ b[9] = vsubq_s16(a[8], a[9]);
+ b[10] = vsubq_s16(a[11], a[10]);
+ b[11] = vaddq_s16(a[11], a[10]);
+ b[12] = vaddq_s16(a[12], a[13]);
+ b[13] = vsubq_s16(a[12], a[13]);
+ b[14] = vsubq_s16(a[15], a[14]);
+ b[15] = vaddq_s16(a[15], a[14]);
+
+ b[16] = a[16];
+ b[19] = a[19];
+ b[20] = a[20];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[27] = a[27];
+ b[28] = a[28];
+ b[31] = a[31];
+
+ butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
+ butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
+
+ butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
+ butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
+
+ // Stage 7.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+ a[4] = b[4];
+ a[5] = b[5];
+ a[6] = b[6];
+ a[7] = b[7];
+
+ butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
+ butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
+ butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
+ butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
+
+ a[16] = vaddq_s16(b[16], b[17]);
+ a[17] = vsubq_s16(b[16], b[17]);
+ a[18] = vsubq_s16(b[19], b[18]);
+ a[19] = vaddq_s16(b[19], b[18]);
+ a[20] = vaddq_s16(b[20], b[21]);
+ a[21] = vsubq_s16(b[20], b[21]);
+ a[22] = vsubq_s16(b[23], b[22]);
+ a[23] = vaddq_s16(b[23], b[22]);
+ a[24] = vaddq_s16(b[24], b[25]);
+ a[25] = vsubq_s16(b[24], b[25]);
+ a[26] = vsubq_s16(b[27], b[26]);
+ a[27] = vaddq_s16(b[27], b[26]);
+ a[28] = vaddq_s16(b[28], b[29]);
+ a[29] = vsubq_s16(b[28], b[29]);
+ a[30] = vsubq_s16(b[31], b[30]);
+ a[31] = vaddq_s16(b[31], b[30]);
+
+ // Final stage.
+ out[0] = a[0];
+ out[16] = a[1];
+ out[8] = a[2];
+ out[24] = a[3];
+ out[4] = a[4];
+ out[20] = a[5];
+ out[12] = a[6];
+ out[28] = a[7];
+ out[2] = a[8];
+ out[18] = a[9];
+ out[10] = a[10];
+ out[26] = a[11];
+ out[6] = a[12];
+ out[22] = a[13];
+ out[14] = a[14];
+ out[30] = a[15];
+
+ butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
+ butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
+ &out[15]);
+ butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
+ butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
+ butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
+ butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
+ &out[11]);
+ butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
+ &out[19]);
+ butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c
index 11df7292d..3b9196fae 100644
--- a/vpx_dsp/arm/fdct4x4_neon.c
+++ b/vpx_dsp/arm/fdct4x4_neon.c
@@ -18,10 +18,10 @@
#include "vpx_dsp/arm/fdct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
- int i;
// input[M * stride] * 16
int16x4_t in[4];
in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
@@ -34,9 +34,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
in[0] = vadd_s16(in[0], one);
}
- for (i = 0; i < 2; ++i) {
- vpx_fdct4x4_pass1_neon(in);
- }
+ vpx_fdct4x4_pass1_neon(in);
+ vpx_fdct4x4_pass2_neon(in);
{
// Not quite a rounding shift. Only add 1 despite shifting by 2.
const int16x8_t one = vdupq_n_s16(1);
@@ -53,7 +52,6 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
- int i;
static const int32x4_t const_1000 = { 1, 0, 0, 0 };
const int32x4_t const_one = vdupq_n_s32(1);
@@ -69,9 +67,8 @@ void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
in[0] = vaddq_s32(in[0], const_1000);
}
- for (i = 0; i < 2; ++i) {
- vpx_highbd_fdct4x4_pass1_neon(in);
- }
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ vpx_highbd_fdct4x4_pass1_neon(in);
{
// Not quite a rounding shift. Only add 1 despite shifting by 2.
in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2);
diff --git a/vpx_dsp/arm/fdct4x4_neon.h b/vpx_dsp/arm/fdct4x4_neon.h
new file mode 100644
index 000000000..de3db9774
--- /dev/null
+++ b/vpx_dsp/arm/fdct4x4_neon.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift((s_0 +/- s_1) * cospi_16_64)
+ butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift((s_0 +/- s_1) * cospi_16_64)
+ butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+ &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
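Both 4x4 passes compute the same transform; they differ only in how the cospi_16_64 butterfly is evaluated (pass 1 keeps the computation in 16 bits via the _fast_ helper, pass 2 widens to 32 bits before narrowing, as the helper names above indicate). A scalar model of the butterfly described in the comments, assuming the usual libvpx constants cospi_16_64 = 11585 and DCT_CONST_BITS = 14 (fdct4_butterfly_scalar is illustrative only), is:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

// out0 = fdct_round_shift((s0 + s1) * cospi_16_64)
// out2 = fdct_round_shift((s0 - s1) * cospi_16_64)
static void fdct4_butterfly_scalar(int16_t s0, int16_t s1, int16_t *out0,
                                   int16_t *out2) {
  const int32_t cospi_16_64 = 11585;
  *out0 = (int16_t)ROUND_POWER_OF_TWO((s0 + s1) * cospi_16_64, DCT_CONST_BITS);
  *out2 = (int16_t)ROUND_POWER_OF_TWO((s0 - s1) * cospi_16_64, DCT_CONST_BITS);
}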
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+ int32x4_t out[4];
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+ const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+ const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+ butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+ &out[1], &out[3]);
+
+ transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c
index 3fb15cc17..75ee6f223 100644
--- a/vpx_dsp/arm/fdct8x8_neon.c
+++ b/vpx_dsp/arm/fdct8x8_neon.c
@@ -17,10 +17,10 @@
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/fdct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
- int i;
// stage 1
int16x8_t in[8];
in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@@ -31,9 +31,9 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
- for (i = 0; i < 2; ++i) {
- vpx_fdct8x8_pass1_neon(in);
- } // for
+
+ vpx_fdct8x8_pass1_neon(in);
+ vpx_fdct8x8_pass2_neon(in);
{
// from vpx_dct_sse2.c
// Post-condition (division by two)
@@ -71,8 +71,6 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
- int i;
-
// input[M * stride] * 16
int32x4_t left[8], right[8];
int16x8_t in[8];
@@ -102,26 +100,25 @@ void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
- for (i = 0; i < 2; ++i) {
- vpx_highbd_fdct8x8_pass1_neon(left, right);
- }
+ vpx_highbd_fdct8x8_pass1_neon(left, right);
+ vpx_highbd_fdct8x8_pass2_neon(left, right);
{
- left[0] = highbd_add_round_shift_s32(left[0]);
- left[1] = highbd_add_round_shift_s32(left[1]);
- left[2] = highbd_add_round_shift_s32(left[2]);
- left[3] = highbd_add_round_shift_s32(left[3]);
- left[4] = highbd_add_round_shift_s32(left[4]);
- left[5] = highbd_add_round_shift_s32(left[5]);
- left[6] = highbd_add_round_shift_s32(left[6]);
- left[7] = highbd_add_round_shift_s32(left[7]);
- right[0] = highbd_add_round_shift_s32(right[0]);
- right[1] = highbd_add_round_shift_s32(right[1]);
- right[2] = highbd_add_round_shift_s32(right[2]);
- right[3] = highbd_add_round_shift_s32(right[3]);
- right[4] = highbd_add_round_shift_s32(right[4]);
- right[5] = highbd_add_round_shift_s32(right[5]);
- right[6] = highbd_add_round_shift_s32(right[6]);
- right[7] = highbd_add_round_shift_s32(right[7]);
+ left[0] = add_round_shift_half_s32(left[0]);
+ left[1] = add_round_shift_half_s32(left[1]);
+ left[2] = add_round_shift_half_s32(left[2]);
+ left[3] = add_round_shift_half_s32(left[3]);
+ left[4] = add_round_shift_half_s32(left[4]);
+ left[5] = add_round_shift_half_s32(left[5]);
+ left[6] = add_round_shift_half_s32(left[6]);
+ left[7] = add_round_shift_half_s32(left[7]);
+ right[0] = add_round_shift_half_s32(right[0]);
+ right[1] = add_round_shift_half_s32(right[1]);
+ right[2] = add_round_shift_half_s32(right[2]);
+ right[3] = add_round_shift_half_s32(right[3]);
+ right[4] = add_round_shift_half_s32(right[4]);
+ right[5] = add_round_shift_half_s32(right[5]);
+ right[6] = add_round_shift_half_s32(right[6]);
+ right[7] = add_round_shift_half_s32(right[7]);
// store results
vst1q_s32(final_output, left[0]);
diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h
new file mode 100644
index 000000000..d8fa60044
--- /dev/null
+++ b/vpx_dsp/arm/fdct8x8_neon.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+ &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass1_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass2_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
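+
+// Usage sketch (the actual driver lives in fdct8x8_neon.c): a full
+// two-dimensional 8x8 forward DCT runs the rows through pass1 and the
+// resulting columns through pass2, e.g.
+//   vpx_fdct8x8_pass1_neon(in);  // rows: fast butterflies + transpose
+//   vpx_fdct8x8_pass2_neon(in);  // columns: widened butterflies + transpose
+// with input scaling and the final rounding handled by the caller.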
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+ &left[2], &right[2], &left[6], &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+ &left[1], &right[1], &left[7], &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+ &left[5], &right[5], &left[3], &right[3]);
+}
+
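+// Identical to the pass1 variant above, except that the two-coefficient
+// butterflies use 64-bit intermediates (butterfly_two_coeff_s32_s64_narrow)
+// for the extra accuracy required by the second pass.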
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[2], &right[2], &left[6],
+ &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[1], &right[1], &left[7],
+ &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[5], &right[5], &left[3],
+ &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4x2_t out[8];
+ vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ left[0] = out[0].val[0];
+ right[0] = out[0].val[1];
+ left[1] = out[1].val[0];
+ right[1] = out[1].val[1];
+ left[2] = out[2].val[0];
+ right[2] = out[2].val[1];
+ left[3] = out[3].val[0];
+ right[3] = out[3].val[1];
+ left[4] = out[4].val[0];
+ right[4] = out[4].val[1];
+ left[5] = out[5].val[0];
+ right[5] = out[5].val[1];
+ left[6] = out[6].val[0];
+ right[6] = out[6].val[1];
+ left[7] = out[7].val[0];
+ right[7] = out[7].val[1];
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4x2_t out[8];
+ vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ left[0] = out[0].val[0];
+ right[0] = out[0].val[1];
+ left[1] = out[1].val[0];
+ right[1] = out[1].val[1];
+ left[2] = out[2].val[0];
+ right[2] = out[2].val[1];
+ left[3] = out[3].val[0];
+ right[3] = out[3].val[1];
+ left[4] = out[4].val[0];
+ right[4] = out[4].val[1];
+ left[5] = out[5].val[0];
+ right[5] = out[5].val[1];
+ left[6] = out[6].val[0];
+ right[6] = out[6].val[1];
+ left[7] = out[7].val[0];
+ right[7] = out[7].val[1];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
index ce669061d..1ea948b3f 100644
--- a/vpx_dsp/arm/fdct_neon.h
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -14,56 +14,94 @@
#include <arm_neon.h>
// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
- const tran_high_t constant,
- int16x8_t *add, int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
+// Variant that performs fast vqrdmulh_s16 operation on half vector
+// can be slightly less accurate, adequate for pass1
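+// (vqrdmulh_s16(x, c) computes (2 * x * c + (1 << 15)) >> 16, so passing
+// 2 * constant yields (x * constant + (1 << 13)) >> 14, i.e. the same
+// rounding shift as fdct_round_shift() with DCT_CONST_BITS == 14; the
+// slight accuracy loss comes from (a +/- b) being evaluated in 16 bits.)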
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant,
+ int16x4_t *add,
+ int16x4_t *sub) {
+ int16x4_t c = vdup_n_s16(2 * constant);
+ *add = vqrdmulh_s16(vadd_s16(a, b), c);
+ *sub = vqrdmulh_s16(vsub_s16(a, b), c);
}
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
- const tran_coef_t constant0,
- const tran_coef_t constant1,
- int16x8_t *add, int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
- const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
- const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
- const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
- const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on full vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+ const int16x8_t b,
+ const tran_coef_t constant,
+ int16x8_t *add,
+ int16x8_t *sub) {
+ int16x8_t c = vdupq_n_s16(2 * constant);
+ *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+ *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
}
-// Add 2 if positive, 1 if negative, and shift by 2.
-// In practice, subtract the sign bit, then shift with rounding.
-static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
- const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
- const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
- const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
- return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
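+// (vqrdmulhq_s32(x, c) computes (2 * x * c + (1 << 31)) >> 32, so a
+// coefficient pre-scaled by 1 << 17 gives a net rounding shift by
+// 32 - 1 - 17 = 14 == DCT_CONST_BITS, matching fdct_round_shift().)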
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ const int16x4_t a_lo = vget_low_s16(a);
+ const int16x4_t a_hi = vget_high_s16(a);
+ const int16x4_t b_lo = vget_low_s16(b);
+ const int16x4_t b_hi = vget_high_s16(b);
+ *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+ butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+ &sub_hi);
+ *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+ *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
}
-// Like butterfly_one_coeff, but don't narrow results.
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int32x4_t *add, int32x4_t *sub) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+ *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int16x4_t *add, int16x4_t *sub) {
+ int32x4_t add32, sub32;
+ butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+ *add = vmovn_s32(add32);
+ *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
static INLINE void butterfly_one_coeff_s16_s32(
- const int16x8_t a, const int16x8_t b, const tran_high_t constant,
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
int32x4_t *sub_hi) {
const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
@@ -78,37 +116,182 @@ static INLINE void butterfly_one_coeff_s16_s32(
*sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}
-// Like butterfly_one_coeff, but with s32.
-static INLINE void butterfly_one_coeff_s32(
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+ butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+ &sub32_hi);
+ *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+ *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+ const int32x4_t b,
+ const tran_coef_t constant,
+ int32x4_t *add,
+ int32x4_t *sub) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+ *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast(
const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
- const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
- const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
- const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
- const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
- const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
- const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
- const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
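+// (The products of 32-bit inputs and the 14-bit cosine constants exceed
+// 32 bits, hence the 64-bit vmull/vmlal intermediates that are narrowed
+// back to 32 bits after the rounding shift.)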
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+ const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+ const int32x2_t a_lo = vget_low_s32(a);
+ const int32x2_t a_hi = vget_high_s32(a);
+ const int32x2_t b_lo = vget_low_s32(b);
+ const int32x2_t b_hi = vget_high_s32(b);
+
+ const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+ const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+ const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+ const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+ const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+ const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+ const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+ const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+ *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+ *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
}
-// Like butterfly_two_coeff, but with s32.
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ // ac1/ac2 hold the following values:
+ // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+ // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+ // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+ // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+ int64x2_t ac1[4];
+ int64x2_t ac2[4];
+ int64x2_t sum[4];
+ int64x2_t diff[4];
+
+ ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+ ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+ ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+ ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+ ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+ ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+ ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+ ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+ sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+ sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+ sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+ sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+ *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+ *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+ diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+ diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+ diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+ diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+ *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+ *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x4_t *add, int16x4_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(a, constant1);
+ const int32x4_t a2 = vmull_n_s16(a, constant2);
+ const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+ const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+ *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+ *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
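+// Concretely: add = fdct_round_shift(a * c1 + b * c2) and
+// sub = fdct_round_shift(a * c2 - b * c1).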
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+ const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+ const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+ const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+ const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+ const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+ const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+ const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
static INLINE void butterfly_two_coeff_s32(
const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
- const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
- int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
- int32x4_t *sub_hi) {
- const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
- const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
- const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
- const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
- const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
- const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
- const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
- const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+ const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+ const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+ const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
*add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
*add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
*sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
@@ -126,9 +309,10 @@ static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
}
// Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
- const int32x4_t a_hi) {
+// In practice, add 1, then add the sign bit, then shift without rounding,
+// return narrowed results
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+ const int32x4_t a_hi) {
const int32x4_t one = vdupq_n_s32(1);
const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
@@ -143,419 +327,32 @@ static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
return vcombine_s16(b_lo, b_hi);
}
-static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
- const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
- const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
-
- // in_0 +/- in_3, in_1 +/- in_2
- const int16x8_t s_01 = vaddq_s16(input_01, input_32);
- const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
- // step_0 +/- step_1, step_2 +/- step_3
- const int16x4_t s_0 = vget_low_s16(s_01);
- const int16x4_t s_1 = vget_high_s16(s_01);
- const int16x4_t s_2 = vget_high_s16(s_32);
- const int16x4_t s_3 = vget_low_s16(s_32);
-
- // (s_0 +/- s_1) * cospi_16_64
- // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
- const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
- const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
- const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
- const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
- // fdct_round_shift
- int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
- int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
- // s_3 * cospi_8_64 + s_2 * cospi_24_64
- // s_3 * cospi_24_64 - s_2 * cospi_8_64
- const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
- const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
- const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
- const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
- // fdct_round_shift
- int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
- int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
- transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
- in[0] = out_0;
- in[1] = out_1;
- in[2] = out_2;
- in[3] = out_3;
-}
-
-static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
- int16x8_t *out) {
- const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
- const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
- const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
- const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
- const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
- const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
- const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
- const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
- // fdct4(step, step);
- int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
- int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
- int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
- int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
- // fdct4(step, step);
- int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
- int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
- int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
- int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
- v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
- v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
- v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
- v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
- v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
- v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
- out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
- out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
- out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
- }
- // Stage 2
- v_x0 = vsubq_s16(v_s6, v_s5);
- v_x1 = vaddq_s16(v_s6, v_s5);
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x8_t ab = vcombine_s16(a, b);
- const int16x8_t cd = vcombine_s16(c, d);
- // Stage 3
- v_x0 = vaddq_s16(v_s4, ab);
- v_x1 = vsubq_s16(v_s4, ab);
- v_x2 = vsubq_s16(v_s7, cd);
- v_x3 = vaddq_s16(v_s7, cd);
- }
- // Stage 4
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
- v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
- v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
- v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
- v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
- v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
- v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
- v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
- v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
- v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
- v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
- out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
- out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
- out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
- }
-}
-
-static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
- int16x8_t out[8];
- vpx_fdct8x8_pass1_notranspose_neon(in, out);
- // transpose 8x8
- // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
- // columns.
- {
- // 00 01 02 03 40 41 42 43
- // 10 11 12 13 50 51 52 53
- // 20 21 22 23 60 61 62 63
- // 30 31 32 33 70 71 72 73
- // 04 05 06 07 44 45 46 47
- // 14 15 16 17 54 55 56 57
- // 24 25 26 27 64 65 66 67
- // 34 35 36 37 74 75 76 77
- const int32x4x2_t r02_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
- const int32x4x2_t r13_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
- const int32x4x2_t r46_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
- const int32x4x2_t r57_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
- const int16x8x2_t r01_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
- vreinterpretq_s16_s32(r13_s32.val[0]));
- const int16x8x2_t r23_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
- vreinterpretq_s16_s32(r13_s32.val[1]));
- const int16x8x2_t r45_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
- vreinterpretq_s16_s32(r57_s32.val[0]));
- const int16x8x2_t r67_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
- vreinterpretq_s16_s32(r57_s32.val[1]));
- in[0] = r01_s16.val[0];
- in[1] = r01_s16.val[1];
- in[2] = r23_s16.val[0];
- in[3] = r23_s16.val[1];
- in[4] = r45_s16.val[0];
- in[5] = r45_s16.val[1];
- in[6] = r67_s16.val[0];
- in[7] = r67_s16.val[1];
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) {
- const int32x2_t x_lo = vget_low_s32(x);
- const int32x2_t x_hi = vget_high_s32(x);
- const int64x2_t x64_lo = vmovl_s32(x_lo);
- const int64x2_t x64_hi = vmovl_s32(x_hi);
-
- const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63);
- const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63);
-
- const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo);
- const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi);
- return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1));
-}
-
-static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a,
- const int32x4_t b,
- const tran_coef_t c,
- int32x4_t *add,
- int32x4_t *sub) {
- const int32x2_t a_lo = vget_low_s32(a);
- const int32x2_t a_hi = vget_high_s32(a);
- const int32x2_t b_lo = vget_low_s32(b);
- const int32x2_t b_hi = vget_high_s32(b);
-
- const int64x2_t a64_lo = vmull_n_s32(a_lo, c);
- const int64x2_t a64_hi = vmull_n_s32(a_hi, c);
-
- const int64x2_t sum_lo = vmlal_n_s32(a64_lo, b_lo, c);
- const int64x2_t sum_hi = vmlal_n_s32(a64_hi, b_hi, c);
- const int64x2_t diff_lo = vmlsl_n_s32(a64_lo, b_lo, c);
- const int64x2_t diff_hi = vmlsl_n_s32(a64_hi, b_hi, c);
-
- *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
- vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
- *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
- vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
-}
-
-static INLINE void highbd_butterfly_two_coeff_s32(
- const int32x4_t a, const int32x4_t b, const tran_coef_t c0,
- const tran_coef_t c1, int32x4_t *add, int32x4_t *sub) {
- const int32x2_t a_lo = vget_low_s32(a);
- const int32x2_t a_hi = vget_high_s32(a);
- const int32x2_t b_lo = vget_low_s32(b);
- const int32x2_t b_hi = vget_high_s32(b);
-
- const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, c0);
- const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, c0);
- const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, c1);
- const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, c1);
-
- const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, c1);
- const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, c1);
- const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, c0);
- const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, c0);
-
- *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
- vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
- *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
- vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
-}
-
-static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
- int32x4_t out[4];
- // in_0 +/- in_3, in_1 +/- in_2
- const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
- const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
- const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
- const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
-
- highbd_butterfly_one_coeff_s32(s_0, s_1, cospi_16_64, &out[0], &out[2]);
-
- // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
- // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
- highbd_butterfly_two_coeff_s32(s_3, s_2, cospi_8_64, cospi_24_64, &out[1],
- &out[3]);
-
- transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
-
- in[0] = out[0];
- in[1] = out[1];
- in[2] = out[2];
- in[3] = out[3];
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
}
-static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
- int32x4_t *right) {
- int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
-
- sl[0] = vaddq_s32(left[0], left[7]);
- sl[1] = vaddq_s32(left[1], left[6]);
- sl[2] = vaddq_s32(left[2], left[5]);
- sl[3] = vaddq_s32(left[3], left[4]);
- sl[4] = vsubq_s32(left[3], left[4]);
- sl[5] = vsubq_s32(left[2], left[5]);
- sl[6] = vsubq_s32(left[1], left[6]);
- sl[7] = vsubq_s32(left[0], left[7]);
- sr[0] = vaddq_s32(right[0], right[7]);
- sr[1] = vaddq_s32(right[1], right[6]);
- sr[2] = vaddq_s32(right[2], right[5]);
- sr[3] = vaddq_s32(right[3], right[4]);
- sr[4] = vsubq_s32(right[3], right[4]);
- sr[5] = vsubq_s32(right[2], right[5]);
- sr[6] = vsubq_s32(right[1], right[6]);
- sr[7] = vsubq_s32(right[0], right[7]);
-
- // fdct4(step, step);
- // x0 = s0 + s3;
- xl[0] = vaddq_s32(sl[0], sl[3]);
- xr[0] = vaddq_s32(sr[0], sr[3]);
- // x1 = s1 + s2;
- xl[1] = vaddq_s32(sl[1], sl[2]);
- xr[1] = vaddq_s32(sr[1], sr[2]);
- // x2 = s1 - s2;
- xl[2] = vsubq_s32(sl[1], sl[2]);
- xr[2] = vsubq_s32(sr[1], sr[2]);
- // x3 = s0 - s3;
- xl[3] = vsubq_s32(sl[0], sl[3]);
- xr[3] = vsubq_s32(sr[0], sr[3]);
-
- // fdct4(step, step);
- // t0 = (x0 + x1) * cospi_16_64;
- // t1 = (x0 - x1) * cospi_16_64;
- // out[0] = (tran_low_t)fdct_round_shift(t0);
- // out[4] = (tran_low_t)fdct_round_shift(t1);
- highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]);
- highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0],
- &right[4]);
- // t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
- // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- // out[2] = (tran_low_t)fdct_round_shift(t2);
- // out[6] = (tran_low_t)fdct_round_shift(t3);
- highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64,
- &left[2], &left[6]);
- highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64,
- &right[2], &right[6]);
-
- // Stage 2
- // t0 = (s6 - s5) * cospi_16_64;
- highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]);
- highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]);
-
- // Stage 3
- xl[0] = vaddq_s32(sl[4], tl[0]);
- xr[0] = vaddq_s32(sr[4], tr[0]);
- xl[1] = vsubq_s32(sl[4], tl[0]);
- xr[1] = vsubq_s32(sr[4], tr[0]);
- xl[2] = vsubq_s32(sl[7], tl[1]);
- xr[2] = vsubq_s32(sr[7], tr[1]);
- xl[3] = vaddq_s32(sl[7], tl[1]);
- xr[3] = vaddq_s32(sr[7], tr[1]);
-
- // Stage 4
- // t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- // out[1] = (tran_low_t)fdct_round_shift(t0);
- // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- // out[7] = (tran_low_t)fdct_round_shift(t3);
- highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64,
- &left[1], &left[7]);
- highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64,
- &right[1], &right[7]);
-
- // t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- // out[5] = (tran_low_t)fdct_round_shift(t1);
- // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- // out[3] = (tran_low_t)fdct_round_shift(t2);
- highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64,
- &left[5], &left[3]);
- highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64,
- &right[5], &right[3]);
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
}
-static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
- int32x4_t *right) {
- int32x4x2_t out[8];
- vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
-
- out[0].val[0] = left[0];
- out[0].val[1] = right[0];
- out[1].val[0] = left[1];
- out[1].val[1] = right[1];
- out[2].val[0] = left[2];
- out[2].val[1] = right[2];
- out[3].val[0] = left[3];
- out[3].val[1] = right[3];
- out[4].val[0] = left[4];
- out[4].val[1] = right[4];
- out[5].val[0] = left[5];
- out[5].val[1] = right[5];
- out[6].val[0] = left[6];
- out[6].val[1] = right[6];
- out[7].val[0] = left[7];
- out[7].val[1] = right[7];
-
- transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
- &out[6], &out[7]);
-
- left[0] = out[0].val[0];
- right[0] = out[0].val[1];
- left[1] = out[1].val[0];
- right[1] = out[1].val[1];
- left[2] = out[2].val[0];
- right[2] = out[2].val[1];
- left[3] = out[3].val[0];
- right[3] = out[3].val[1];
- left[4] = out[4].val[0];
- right[4] = out[4].val[1];
- left[5] = out[5].val[0];
- right[5] = out[5].val[1];
- left[6] = out[6].val[0];
- right[6] = out[6].val[1];
- left[7] = out[7].val[0];
- right[7] = out[7].val[1];
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
}
-#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index bf06d6abe..41d44f2b1 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -821,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
a7->val[1] = c7.val[1];
}
+// Helper transpose function for highbd FDCT variants
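+// left[i]/right[i] hold the low/high halves of row i of an 8x8 int32 block;
+// the transposed rows are written to out_left/out_right in the same layout.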
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+ int32x4_t *right /*[8]*/,
+ int32x4_t *out_left /*[8]*/,
+ int32x4_t *out_right /*[8]*/) {
+ int32x4x2_t out[8];
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ out_left[0] = out[0].val[0];
+ out_left[1] = out[1].val[0];
+ out_left[2] = out[2].val[0];
+ out_left[3] = out[3].val[0];
+ out_left[4] = out[4].val[0];
+ out_left[5] = out[5].val[0];
+ out_left[6] = out[6].val[0];
+ out_left[7] = out[7].val[0];
+ out_right[0] = out[0].val[1];
+ out_right[1] = out[1].val[1];
+ out_right[2] = out[2].val[1];
+ out_right[3] = out[3].val[1];
+ out_right[4] = out[4].val[1];
+ out_right[5] = out[5].val[1];
+ out_right[6] = out[6].val[1];
+ out_right[7] = out[7].val[1];
+}
+
static INLINE void transpose_u8_16x8(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,