author     Konstantinos Margaritis <konma@vectorcamp.gr>  2022-03-11 20:19:25 +0200
committer  Konstantinos Margaritis <konma@vectorcamp.gr>  2022-03-17 13:07:12 +0200
commit     f79d256cb28a4228df66a7a6d1cebbd9071e0639 (patch)
tree       af6e7e70ddb165208e3a7ad22132ac48f69e1b21 /vpx_dsp/arm
parent     8a50f70ffc5eea6c2392a5c176bfe43e450ecebc (diff)
Make sure only NEON FDCT functions are called.
[NEON]
Added vpx_fdct4x4_pass1_neon(), vpx_fdct8x8_pass1_notranspose_neon() and
vpx_fdct8x8_pass1_neon() to avoid code duplication.
Refactored vpx_fdct4x4_neon() and vpx_fdct8x8_neon() to use the above.
Renamed dct_body() to vpx_fdct16x16_body() so it can be reused later.
Added transpose_s16_16x16().
I have run make test and all tests/configurations seem to pass.
Profiled using this command on an Ampere Altra VM:
sudo perf record -g ./vpxenc --codec=vp9 --height=1080 --width=1920 \
--fps=25/1 --limit=20 -o output.mkv \
../original_videos_Sports_1080P_Sports_1080P-0063.mkv --debug --rt
Before this optimization:
1.32% 1.32% vpxenc vpxenc [.] vpx_fdct4x4_neon
0.16% 0.16% vpxenc vpxenc [.] vpx_fdct4x4_c
0.79% 0.79% vpxenc vpxenc [.] vpx_fdct8x8_c
0.52% 0.52% vpxenc vpxenc [.] vpx_fdct8x8_neon
1.23% 1.23% vpxenc vpxenc [.] vpx_fdct16x16_c
0.54% 0.54% vpxenc vpxenc [.] vpx_fdct16x16_neon
So, even though a _neon() version exists, the C version was also being
called; a sketch of the dispatch mechanism follows the commit message.
After this patch:
1.42% 1.36% vpxenc vpxenc [.] vpx_fdct4x4_neon
0.87% 0.82% vpxenc vpxenc [.] vpx_fdct8x8_neon
0.74% 0.74% vpxenc vpxenc [.] vpx_fdct16x16_neon
Change-Id: Id4e1dd315c67b4355fe4e5a1b59e181a349f16d0
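
For context on how these kernels are normally reached: in libvpx builds with
runtime CPU detection, each transform is called through an RTCD function
pointer that setup code aims at the best implementation available. Below is a
minimal sketch of that dispatch shape; setup_rtcd() and its have_neon flag are
hypothetical stand-ins for the generated RTCD code, and tran_low_t is assumed
to be the non-high-bitdepth typedef.

    #include <stdint.h>

    typedef int16_t tran_low_t; /* assumption: non-high-bitdepth build */

    /* The two implementations that show up in the profile. */
    void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
    void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);

    /* Dispatch pointer the encoder is meant to call. */
    static void (*vpx_fdct4x4)(const int16_t *, tran_low_t *, int) =
        vpx_fdct4x4_c;

    static void setup_rtcd(int have_neon) { /* hypothetical helper */
      if (have_neon) vpx_fdct4x4 = vpx_fdct4x4_neon;
    }

Any path that reaches a _c kernel without going through such a pointer, or
through a helper the NEON build does not provide, shows both symbols in a
profile, which is consistent with the numbers above.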
Diffstat (limited to 'vpx_dsp/arm')
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.c  319
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.h  327
-rw-r--r--  vpx_dsp/arm/fdct_neon.c        61
-rw-r--r--  vpx_dsp/arm/fdct_neon.h       213
-rw-r--r--  vpx_dsp/arm/fwd_txfm_neon.c   212
-rw-r--r--  vpx_dsp/arm/transpose_neon.h   39
6 files changed, 629 insertions, 542 deletions
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c
index 6b2bebd09..67f43246a 100644
--- a/vpx_dsp/arm/fdct16x16_neon.c
+++ b/vpx_dsp/arm/fdct16x16_neon.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"

 // Some builds of gcc 4.9.2 and .3 have trouble with some of the inline
 // functions.
@@ -27,316 +28,6 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {

 #else

-static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
-  b[0] = vld1q_s16(a);
-  a += stride;
-  b[1] = vld1q_s16(a);
-  a += stride;
-  b[2] = vld1q_s16(a);
-  a += stride;
-  b[3] = vld1q_s16(a);
-  a += stride;
-  b[4] = vld1q_s16(a);
-  a += stride;
-  b[5] = vld1q_s16(a);
-  a += stride;
-  b[6] = vld1q_s16(a);
-  a += stride;
-  b[7] = vld1q_s16(a);
-  a += stride;
-  b[8] = vld1q_s16(a);
-  a += stride;
-  b[9] = vld1q_s16(a);
-  a += stride;
-  b[10] = vld1q_s16(a);
-  a += stride;
-  b[11] = vld1q_s16(a);
-  a += stride;
-  b[12] = vld1q_s16(a);
-  a += stride;
-  b[13] = vld1q_s16(a);
-  a += stride;
-  b[14] = vld1q_s16(a);
-  a += stride;
-  b[15] = vld1q_s16(a);
-}
-
-// Store 8 16x8 values, assuming stride == 16.
-static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
-  store_s16q_to_tran_low(a, b[0]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[1]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[2]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[3]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[4]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[5]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[6]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[7]);
-}
-
-// Load step of each pass. Add and subtract clear across the input, requiring
-// all 16 values to be loaded. For the first pass it also multiplies by 4.
-
-// To maybe reduce register usage this could be combined with the load() step to
-// get the first 4 and last 4 values, cross those, then load the middle 8 values
-// and cross them.
-static INLINE void cross_input(const int16x8_t *a /*[16]*/,
-                               int16x8_t *b /*[16]*/, const int pass) {
-  if (pass == 0) {
-    b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
-    b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
-    b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
-    b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
-    b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
-    b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
-    b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
-    b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
-
-    b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
-    b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
-    b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
-    b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
-    b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
-    b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
-    b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
-    b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
-  } else {
-    b[0] = vaddq_s16(a[0], a[15]);
-    b[1] = vaddq_s16(a[1], a[14]);
-    b[2] = vaddq_s16(a[2], a[13]);
-    b[3] = vaddq_s16(a[3], a[12]);
-    b[4] = vaddq_s16(a[4], a[11]);
-    b[5] = vaddq_s16(a[5], a[10]);
-    b[6] = vaddq_s16(a[6], a[9]);
-    b[7] = vaddq_s16(a[7], a[8]);
-
-    b[8] = vsubq_s16(a[7], a[8]);
-    b[9] = vsubq_s16(a[6], a[9]);
-    b[10] = vsubq_s16(a[5], a[10]);
-    b[11] = vsubq_s16(a[4], a[11]);
-    b[12] = vsubq_s16(a[3], a[12]);
-    b[13] = vsubq_s16(a[2], a[13]);
-    b[14] = vsubq_s16(a[1], a[14]);
-    b[15] = vsubq_s16(a[0], a[15]);
-  }
-}
-
-// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
-// because this only adds 1, not 1 << 2.
-static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
-  const int16x8_t one = vdupq_n_s16(1);
-  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
-  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
-  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
-  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
-  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
-  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
-  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
-  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
-  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
-  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
-  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
-  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
-  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
-  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
-  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
-  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
-}
-
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_high_t c, int16x8_t *add,
-                                       int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
-  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
-  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_coef_t c0,
-                                       const tran_coef_t c1, int16x8_t *add,
-                                       int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
-  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
-  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
-  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
-  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
-                                 int16x8_t *b /*[8]*/) {
-  // Swap 16 bit elements.
-  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
-  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
-  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
-  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
-  // Swap 32 bit elements.
-  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
-                                   vreinterpretq_s32_s16(c1.val[0]));
-  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
-                                   vreinterpretq_s32_s16(c1.val[1]));
-  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
-                                   vreinterpretq_s32_s16(c3.val[0]));
-  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
-                                   vreinterpretq_s32_s16(c3.val[1]));
-
-  // Swap 64 bit elements
-  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
-  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
-  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
-  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
-  b[0] = e0.val[0];
-  b[1] = e1.val[0];
-  b[2] = e2.val[0];
-  b[3] = e3.val[0];
-  b[4] = e0.val[1];
-  b[5] = e1.val[1];
-  b[6] = e2.val[1];
-  b[7] = e3.val[1];
-}
-
-// Main body of fdct16x16.
-static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
-  int16x8_t s[8];
-  int16x8_t x[4];
-  int16x8_t step[8];
-
-  // stage 1
-  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
-  // even_results);"
-  s[0] = vaddq_s16(in[0], in[7]);
-  s[1] = vaddq_s16(in[1], in[6]);
-  s[2] = vaddq_s16(in[2], in[5]);
-  s[3] = vaddq_s16(in[3], in[4]);
-  s[4] = vsubq_s16(in[3], in[4]);
-  s[5] = vsubq_s16(in[2], in[5]);
-  s[6] = vsubq_s16(in[1], in[6]);
-  s[7] = vsubq_s16(in[0], in[7]);
-
-  // fdct4(step, step);
-  x[0] = vaddq_s16(s[0], s[3]);
-  x[1] = vaddq_s16(s[1], s[2]);
-  x[2] = vsubq_s16(s[1], s[2]);
-  x[3] = vsubq_s16(s[0], s[3]);
-
-  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
-  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
-  butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
-  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
-  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
-  butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
-
-  // Stage 2
-  // Re-using source s5/s6
-  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
-  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
-  butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
-
-  // Stage 3
-  x[0] = vaddq_s16(s[4], s[5]);
-  x[1] = vsubq_s16(s[4], s[5]);
-  x[2] = vsubq_s16(s[7], s[6]);
-  x[3] = vaddq_s16(s[7], s[6]);
-
-  // Stage 4
-  // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
-  // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
-  butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
-  // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
-  // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
-  butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
-
-  // step 2
-  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
-  // That file distinguished between "in_high" and "step1" but the only
-  // difference is that "in_high" is the first 8 values and "step 1" is the
-  // second. Here, since they are all in one array, "step1" values are += 8.
-
-  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
-  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
-  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
-  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
-  butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
-  butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
-
-  // step 3
-  s[0] = vaddq_s16(in[8], s[3]);
-  s[1] = vaddq_s16(in[9], s[2]);
-  x[0] = vsubq_s16(in[9], s[2]);
-  x[1] = vsubq_s16(in[8], s[3]);
-  x[2] = vsubq_s16(in[15], s[4]);
-  x[3] = vsubq_s16(in[14], s[5]);
-  s[6] = vaddq_s16(in[14], s[5]);
-  s[7] = vaddq_s16(in[15], s[4]);
-
-  // step 4
-  // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
-  // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
-  butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
-
-  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
-  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
-  butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
-
-  // step 5
-  step[0] = vaddq_s16(s[0], s[1]);
-  step[1] = vsubq_s16(s[0], s[1]);
-  step[2] = vaddq_s16(x[1], s[2]);
-  step[3] = vsubq_s16(x[1], s[2]);
-  step[4] = vsubq_s16(x[2], s[5]);
-  step[5] = vaddq_s16(x[2], s[5]);
-  step[6] = vsubq_s16(s[7], s[6]);
-  step[7] = vaddq_s16(s[7], s[6]);
-
-  // step 6
-  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
-  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
-  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
-  // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
-  // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
-  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
-  // cospi_22_64)
-  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
-  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
-  butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
-                      &out[7]);
-  butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
-                      &out[15]);
-  butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
-                      &out[3]);
-  butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
-                      &out[11]);
-}
-
 void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
   int16x8_t temp0[16];
   int16x8_t temp1[16];
@@ -346,12 +37,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
   // Left half.
   load(input, stride, temp0);
   cross_input(temp0, temp1, 0);
-  dct_body(temp1, temp0);
+  vpx_fdct16x16_body(temp1, temp0);

   // Right half.
   load(input + 8, stride, temp1);
   cross_input(temp1, temp2, 0);
-  dct_body(temp2, temp1);
+  vpx_fdct16x16_body(temp2, temp1);

   // Transpose top left and top right quarters into one contiguous location to
   // process to the top half.
@@ -359,7 +50,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
   transpose_8x8(&temp1[0], &temp2[8]);
   partial_round_shift(temp2);
   cross_input(temp2, temp3, 1);
-  dct_body(temp3, temp2);
+  vpx_fdct16x16_body(temp3, temp2);
   transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
                     &temp2[5], &temp2[6], &temp2[7]);
   transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -375,7 +66,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
                     &temp1[13], &temp1[14], &temp1[15]);
   partial_round_shift(temp1);
   cross_input(temp1, temp0, 1);
-  dct_body(temp0, temp1);
+  vpx_fdct16x16_body(temp0, temp1);
   transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
                     &temp1[5], &temp1[6], &temp1[7]);
   transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
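The butterfly helpers moved out of this file carry all of the transform's
fixed-point arithmetic: per lane, butterfly_one_coeff() computes
fdct_round_shift((a +/- b) * c), with vqrshrn_n_s32(x, 14) doing a saturating
rounding narrow by DCT_CONST_BITS. A scalar model of one lane, assuming
DCT_CONST_BITS == 14 as in vpx_dsp/txfm_common.h (saturation omitted for
brevity):

    #include <stdint.h>

    /* Rounding right shift by 14: (x + 2^13) >> 14, i.e. fdct_round_shift(). */
    static int16_t fdct_round_shift14(int32_t x) {
      return (int16_t)((x + (1 << 13)) >> 14);
    }

    static void butterfly_one_coeff_scalar(int16_t a, int16_t b, int32_t c,
                                           int16_t *add, int16_t *sub) {
      *add = fdct_round_shift14((int32_t)a * c + (int32_t)b * c);
      *sub = fdct_round_shift14((int32_t)a * c - (int32_t)b * c);
    }

butterfly_two_coeff() has the same shape, with two constants feeding the
widening multiply-accumulates.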
diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h
new file mode 100644
index 000000000..839123899
--- /dev/null
+++ b/vpx_dsp/arm/fdct16x16_neon.h
@@ -0,0 +1,327 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
+  b[0] = vld1q_s16(a);
+  a += stride;
+  b[1] = vld1q_s16(a);
+  a += stride;
+  b[2] = vld1q_s16(a);
+  a += stride;
+  b[3] = vld1q_s16(a);
+  a += stride;
+  b[4] = vld1q_s16(a);
+  a += stride;
+  b[5] = vld1q_s16(a);
+  a += stride;
+  b[6] = vld1q_s16(a);
+  a += stride;
+  b[7] = vld1q_s16(a);
+  a += stride;
+  b[8] = vld1q_s16(a);
+  a += stride;
+  b[9] = vld1q_s16(a);
+  a += stride;
+  b[10] = vld1q_s16(a);
+  a += stride;
+  b[11] = vld1q_s16(a);
+  a += stride;
+  b[12] = vld1q_s16(a);
+  a += stride;
+  b[13] = vld1q_s16(a);
+  a += stride;
+  b[14] = vld1q_s16(a);
+  a += stride;
+  b[15] = vld1q_s16(a);
+}
+
+// Store 8 16x8 values, assuming stride == 16.
+static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
+  store_s16q_to_tran_low(a, b[0]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[1]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[2]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[3]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[4]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[5]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[6]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[7]);
+}
+
+// Load step of each pass. Add and subtract clear across the input, requiring
+// all 16 values to be loaded. For the first pass it also multiplies by 4.
+
+// To maybe reduce register usage this could be combined with the load() step to
+// get the first 4 and last 4 values, cross those, then load the middle 8 values
+// and cross them.
+static INLINE void cross_input(const int16x8_t *a /*[16]*/,
+                               int16x8_t *b /*[16]*/, const int pass) {
+  if (pass == 0) {
+    b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
+    b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
+    b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
+    b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
+    b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
+    b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
+    b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
+    b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
+
+    b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
+    b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
+    b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
+    b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
+    b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
+    b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
+    b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
+    b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
+  } else {
+    b[0] = vaddq_s16(a[0], a[15]);
+    b[1] = vaddq_s16(a[1], a[14]);
+    b[2] = vaddq_s16(a[2], a[13]);
+    b[3] = vaddq_s16(a[3], a[12]);
+    b[4] = vaddq_s16(a[4], a[11]);
+    b[5] = vaddq_s16(a[5], a[10]);
+    b[6] = vaddq_s16(a[6], a[9]);
+    b[7] = vaddq_s16(a[7], a[8]);
+
+    b[8] = vsubq_s16(a[7], a[8]);
+    b[9] = vsubq_s16(a[6], a[9]);
+    b[10] = vsubq_s16(a[5], a[10]);
+    b[11] = vsubq_s16(a[4], a[11]);
+    b[12] = vsubq_s16(a[3], a[12]);
+    b[13] = vsubq_s16(a[2], a[13]);
+    b[14] = vsubq_s16(a[1], a[14]);
+    b[15] = vsubq_s16(a[0], a[15]);
+  }
+}
+
+// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
+// because this only adds 1, not 1 << 2.
+static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
+  const int16x8_t one = vdupq_n_s16(1);
+  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
+  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
+  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
+  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
+  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
+  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
+  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
+  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
+  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
+  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
+  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
+  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
+  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
+  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
+  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
+  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
+}
+
+// fdct_round_shift((a +/- b) * c)
+static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_high_t c, int16x8_t *add,
+                                       int16x8_t *sub) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
+  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
+  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c0 +/- b * c1)
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_coef_t c0,
+                                       const tran_coef_t c1, int16x8_t *add,
+                                       int16x8_t *sub) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
+  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
+  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
+  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
+  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
+// are all in-place.
+static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
+                                 int16x8_t *b /*[8]*/) {
+  // Swap 16 bit elements.
+  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+  // Swap 32 bit elements.
+  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
+  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+                                   vreinterpretq_s32_s16(c3.val[0]));
+  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+                                   vreinterpretq_s32_s16(c3.val[1]));
+
+  // Swap 64 bit elements
+  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+  b[0] = e0.val[0];
+  b[1] = e1.val[0];
+  b[2] = e2.val[0];
+  b[3] = e3.val[0];
+  b[4] = e0.val[1];
+  b[5] = e1.val[1];
+  b[6] = e2.val[1];
+  b[7] = e3.val[1];
+}
+
+// Main body of fdct16x16.
+static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
+                               int16x8_t *out /*[16]*/) {
+  int16x8_t s[8];
+  int16x8_t x[4];
+  int16x8_t step[8];
+
+  // stage 1
+  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
+  // even_results);"
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
+  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+  butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+
+  // Stage 2
+  // Re-using source s5/s6
+  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+  butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], s[5]);
+  x[1] = vsubq_s16(s[4], s[5]);
+  x[2] = vsubq_s16(s[7], s[6]);
+  x[3] = vaddq_s16(s[7], s[6]);
+
+  // Stage 4
+  // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
+  // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+
+  // step 2
+  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
+  // That file distinguished between "in_high" and "step1" but the only
+  // difference is that "in_high" is the first 8 values and "step 1" is the
+  // second. Here, since they are all in one array, "step1" values are += 8.
+
+  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+  butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+  butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+
+  // step 3
+  s[0] = vaddq_s16(in[8], s[3]);
+  s[1] = vaddq_s16(in[9], s[2]);
+  x[0] = vsubq_s16(in[9], s[2]);
+  x[1] = vsubq_s16(in[8], s[3]);
+  x[2] = vsubq_s16(in[15], s[4]);
+  x[3] = vsubq_s16(in[14], s[5]);
+  s[6] = vaddq_s16(in[14], s[5]);
+  s[7] = vaddq_s16(in[15], s[4]);
+
+  // step 4
+  // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
+  // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
+  butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+
+  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
+  butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+
+  // step 5
+  step[0] = vaddq_s16(s[0], s[1]);
+  step[1] = vsubq_s16(s[0], s[1]);
+  step[2] = vaddq_s16(x[1], s[2]);
+  step[3] = vsubq_s16(x[1], s[2]);
+  step[4] = vsubq_s16(x[2], s[5]);
+  step[5] = vaddq_s16(x[2], s[5]);
+  step[6] = vsubq_s16(s[7], s[6]);
+  step[7] = vaddq_s16(s[7], s[6]);
+
+  // step 6
+  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
+  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
+  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
+  // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
+  // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
+  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
+  // cospi_22_64)
+  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
+  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
+  butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+                      &out[7]);
+  butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+                      &out[15]);
+  butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+                      &out[3]);
+  butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+                      &out[11]);
+}
+
+#endif  // VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_
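One detail in the header above deserves a note: between the two passes,
partial_round_shift() needs (a + 1) >> 2, while the NEON rounding shift
vrshrq_n_s16(a, 2) adds half the shift divisor before shifting, so the add of
1 has to be written out explicitly. A scalar sketch of the two behaviours,
assuming the usual VRSHR rounding semantics:

    #include <stdint.h>

    /* What partial_round_shift() computes per element. */
    static int16_t partial_round_shift_scalar(int16_t a) {
      return (int16_t)((a + 1) >> 2);
    }

    /* What vrshrq_n_s16(a, 2) would compute instead. */
    static int16_t vrshr2_scalar(int16_t a) {
      return (int16_t)((a + 2) >> 2);
    }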
diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c
index 3708cbb11..2827791f1 100644
--- a/vpx_dsp/arm/fdct_neon.c
+++ b/vpx_dsp/arm/fdct_neon.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"

@@ -22,67 +23,25 @@
 void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
                       int stride) {
   int i;
   // input[M * stride] * 16
-  int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
-  int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
-  int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
-  int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+  int16x4_t in[4];
+  in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4);

   // If the very first value != 0, then add 1.
   if (input[0] != 0) {
     const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
-    input_0 = vadd_s16(input_0, one);
+    in[0] = vadd_s16(in[0], one);
   }
-
   for (i = 0; i < 2; ++i) {
-    const int16x8_t input_01 = vcombine_s16(input_0, input_1);
-    const int16x8_t input_32 = vcombine_s16(input_3, input_2);
-
-    // in_0 +/- in_3, in_1 +/- in_2
-    const int16x8_t s_01 = vaddq_s16(input_01, input_32);
-    const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
-    // step_0 +/- step_1, step_2 +/- step_3
-    const int16x4_t s_0 = vget_low_s16(s_01);
-    const int16x4_t s_1 = vget_high_s16(s_01);
-    const int16x4_t s_2 = vget_high_s16(s_32);
-    const int16x4_t s_3 = vget_low_s16(s_32);
-
-    // (s_0 +/- s_1) * cospi_16_64
-    // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
-    const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
-    const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
-    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
-    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
-    // fdct_round_shift
-    int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
-    int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
-    // s_3 * cospi_8_64 + s_2 * cospi_24_64
-    // s_3 * cospi_24_64 - s_2 * cospi_8_64
-    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
-    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
-    const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
-    const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
-    // fdct_round_shift
-    int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
-    int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
-    transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
-    input_0 = out_0;
-    input_1 = out_1;
-    input_2 = out_2;
-    input_3 = out_3;
+    vpx_fdct4x4_pass1_neon(in);
   }
-
   {
     // Not quite a rounding shift. Only add 1 despite shifting by 2.
     const int16x8_t one = vdupq_n_s16(1);
-    int16x8_t out_01 = vcombine_s16(input_0, input_1);
-    int16x8_t out_23 = vcombine_s16(input_2, input_3);
+    int16x8_t out_01 = vcombine_s16(in[0], in[1]);
+    int16x8_t out_23 = vcombine_s16(in[2], in[3]);
     out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
     out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
     store_s16q_to_tran_low(final_output + 0 * 8, out_01);
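The column pass that vpx_fdct4x4_neon() now delegates to is the standard
4-point fDCT butterfly, vectorized four columns at a time. A scalar model of
one column, using the Q14 constants from vpx_dsp/txfm_common.h (values quoted
from memory, so treat them as assumptions: cospi_16_64 = 11585,
cospi_8_64 = 15137, cospi_24_64 = 6270):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define ROUND_POWER_OF_TWO(v, n) (((v) + (1 << ((n)-1))) >> (n))

    static void fdct4_1d_scalar(const int16_t in[4], int16_t out[4]) {
      const int32_t s0 = in[0] + in[3], s1 = in[1] + in[2];
      const int32_t s2 = in[1] - in[2], s3 = in[0] - in[3];
      out[0] = (int16_t)ROUND_POWER_OF_TWO((s0 + s1) * 11585, DCT_CONST_BITS);
      out[2] = (int16_t)ROUND_POWER_OF_TWO((s0 - s1) * 11585, DCT_CONST_BITS);
      out[1] = (int16_t)ROUND_POWER_OF_TWO(s3 * 15137 + s2 * 6270, DCT_CONST_BITS);
      out[3] = (int16_t)ROUND_POWER_OF_TWO(s3 * 6270 - s2 * 15137, DCT_CONST_BITS);
    }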
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
new file mode 100644
index 000000000..28d7d86bf
--- /dev/null
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -0,0 +1,213 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // (s_0 +/- s_1) * cospi_16_64
+  // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+  const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+  const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+  const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+  const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+
+  // fdct_round_shift
+  int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+  int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+  const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+
+  const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+  const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+
+  // fdct_round_shift
+  int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+  int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+  transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+  in[0] = out_0;
+  in[1] = out_1;
+  in[2] = out_2;
+  in[3] = out_3;
+}
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
+  const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
+  const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
+  const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
+  const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
+  const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
+  const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
+  const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+  int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+  int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+  int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+  // fdct4(step, step);
+  int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+  int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+  int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+  int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+  int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
+  int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
+  int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
+  int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
+  v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
+  v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
+  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
+  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
+  v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+  v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+  v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+  v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+    out[0] = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
+    out[2] = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
+    out[4] = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
+    out[6] = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
+  }
+  // Stage 2
+  v_x0 = vsubq_s16(v_s6, v_s5);
+  v_x1 = vaddq_s16(v_s6, v_s5);
+  v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
+  v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
+  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
+  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+    const int16x8_t ab = vcombine_s16(a, b);
+    const int16x8_t cd = vcombine_s16(c, d);
+    // Stage 3
+    v_x0 = vaddq_s16(v_s4, ab);
+    v_x1 = vsubq_s16(v_s4, ab);
+    v_x2 = vsubq_s16(v_s7, cd);
+    v_x3 = vaddq_s16(v_s7, cd);
+  }
+  // Stage 4
+  v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
+  v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
+  v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
+  v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
+  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
+  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
+  v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
+  v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
+  v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
+  v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
+  v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
+  v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
+  v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
+  v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
+  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
+  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+    out[1] = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
+    out[3] = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
+    out[5] = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
+    out[7] = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
+  }
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass1_notranspose_neon(in, out);
+  // transpose 8x8
+  // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
+  // columns.
+  {
+    // 00 01 02 03 40 41 42 43
+    // 10 11 12 13 50 51 52 53
+    // 20 21 22 23 60 61 62 63
+    // 30 31 32 33 70 71 72 73
+    // 04 05 06 07 44 45 46 47
+    // 14 15 16 17 54 55 56 57
+    // 24 25 26 27 64 65 66 67
+    // 34 35 36 37 74 75 76 77
+    const int32x4x2_t r02_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
+    const int32x4x2_t r13_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
+    const int32x4x2_t r46_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
+    const int32x4x2_t r57_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
+    const int16x8x2_t r01_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+                  vreinterpretq_s16_s32(r13_s32.val[0]));
+    const int16x8x2_t r23_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+                  vreinterpretq_s16_s32(r13_s32.val[1]));
+    const int16x8x2_t r45_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+                  vreinterpretq_s16_s32(r57_s32.val[0]));
+    const int16x8x2_t r67_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+                  vreinterpretq_s16_s32(r57_s32.val[1]));
+    in[0] = r01_s16.val[0];
+    in[1] = r01_s16.val[1];
+    in[2] = r23_s16.val[0];
+    in[3] = r23_s16.val[1];
+    in[4] = r45_s16.val[0];
+    in[5] = r45_s16.val[1];
+    in[6] = r67_s16.val[0];
+    in[7] = r67_s16.val[1];
+    // 00 10 20 30 40 50 60 70
+    // 01 11 21 31 41 51 61 71
+    // 02 12 22 32 42 52 62 72
+    // 03 13 23 33 43 53 63 73
+    // 04 14 24 34 44 54 64 74
+    // 05 15 25 35 45 55 65 75
+    // 06 16 26 36 46 56 66 76
+    // 07 17 27 37 47 57 67 77
+  }
+}
+#endif  // VPX_VPX_DSP_ARM_FDCT_NEON_H_
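Because vpx_fdct8x8_pass1_neon() finishes with a transpose, applying it twice
performs the row pass and then the column pass; that is exactly how the
refactored vpx_fdct8x8_neon() below uses it. A minimal sketch mirroring that
loop:

    #include <arm_neon.h>
    #include "vpx_dsp/arm/fdct_neon.h"

    /* Two identical passes: the transpose at the end of pass 1 turns the
       second application into the column transform. */
    static void fdct8x8_two_pass(int16x8_t in[8]) {
      int i;
      for (i = 0; i < 2; ++i) vpx_fdct8x8_pass1_neon(in);
    }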
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c
index 374a262b9..d9161c6d3 100644
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -15,196 +15,54 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/mem_neon.h"

 void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
                       int stride) {
   int i;
   // stage 1
-  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
-  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
-  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
-  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
-  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
-  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
-  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
-  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+  int16x8_t in[8];
+  in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
   for (i = 0; i < 2; ++i) {
-    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
-    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
-    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
-    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
-    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
-    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
-    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
-    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
-    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
-    // fdct4(step, step);
-    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-    // fdct4(step, step);
-    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-    }
-    // Stage 2
-    v_x0 = vsubq_s16(v_s6, v_s5);
-    v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x8_t ab = vcombine_s16(a, b);
-      const int16x8_t cd = vcombine_s16(c, d);
-      // Stage 3
-      v_x0 = vaddq_s16(v_s4, ab);
-      v_x1 = vsubq_s16(v_s4, ab);
-      v_x2 = vsubq_s16(v_s7, cd);
-      v_x3 = vaddq_s16(v_s7, cd);
-    }
-    // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-    }
-    // transpose 8x8
-    // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
-    // columns.
-    {
-      // 00 01 02 03 40 41 42 43
-      // 10 11 12 13 50 51 52 53
-      // 20 21 22 23 60 61 62 63
-      // 30 31 32 33 70 71 72 73
-      // 04 05 06 07 44 45 46 47
-      // 14 15 16 17 54 55 56 57
-      // 24 25 26 27 64 65 66 67
-      // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
-      const int16x8x2_t r01_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                    vreinterpretq_s16_s32(r13_s32.val[0]));
-      const int16x8x2_t r23_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                    vreinterpretq_s16_s32(r13_s32.val[1]));
-      const int16x8x2_t r45_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                    vreinterpretq_s16_s32(r57_s32.val[0]));
-      const int16x8x2_t r67_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                    vreinterpretq_s16_s32(r57_s32.val[1]));
-      input_0 = r01_s16.val[0];
-      input_1 = r01_s16.val[1];
-      input_2 = r23_s16.val[0];
-      input_3 = r23_s16.val[1];
-      input_4 = r45_s16.val[0];
-      input_5 = r45_s16.val[1];
-      input_6 = r67_s16.val[0];
-      input_7 = r67_s16.val[1];
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
+    vpx_fdct8x8_pass1_neon(in);
   }  // for
   {
     // from vpx_dct_sse2.c
     // Post-condition (division by two)
     //    division of two 16 bits signed numbers using shifts
     //    n / 2 = (n - (n >> 15)) >> 1
-    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
-    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
-    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
-    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
-    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
-    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
-    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
-    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
-    input_0 = vhsubq_s16(input_0, sign_in0);
-    input_1 = vhsubq_s16(input_1, sign_in1);
-    input_2 = vhsubq_s16(input_2, sign_in2);
-    input_3 = vhsubq_s16(input_3, sign_in3);
-    input_4 = vhsubq_s16(input_4, sign_in4);
-    input_5 = vhsubq_s16(input_5, sign_in5);
-    input_6 = vhsubq_s16(input_6, sign_in6);
-    input_7 = vhsubq_s16(input_7, sign_in7);
+    const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+    in[0] = vhsubq_s16(in[0], sign_in0);
+    in[1] = vhsubq_s16(in[1], sign_in1);
+    in[2] = vhsubq_s16(in[2], sign_in2);
+    in[3] = vhsubq_s16(in[3], sign_in3);
+    in[4] = vhsubq_s16(in[4], sign_in4);
+    in[5] = vhsubq_s16(in[5], sign_in5);
+    in[6] = vhsubq_s16(in[6], sign_in6);
+    in[7] = vhsubq_s16(in[7], sign_in7);
     // store results
-    store_s16q_to_tran_low(final_output + 0 * 8, input_0);
-    store_s16q_to_tran_low(final_output + 1 * 8, input_1);
-    store_s16q_to_tran_low(final_output + 2 * 8, input_2);
-    store_s16q_to_tran_low(final_output + 3 * 8, input_3);
-    store_s16q_to_tran_low(final_output + 4 * 8, input_4);
-    store_s16q_to_tran_low(final_output + 5 * 8, input_5);
-    store_s16q_to_tran_low(final_output + 6 * 8, input_6);
-    store_s16q_to_tran_low(final_output + 7 * 8, input_7);
+    store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+    store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+    store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+    store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+    store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+    store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+    store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+    store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
   }
 }
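The post-condition block above halves every coefficient with the identity
n / 2 == (n - (n >> 15)) >> 1, which vhsubq_s16() implements in one
instruction: subtracting the sign bit (n >> 15 is 0 or -1) first makes
negative values round toward zero like C division, rather than toward minus
infinity like a plain arithmetic shift. A small self-check over all int16_t
values (assumes arithmetic right shift of negative values, as on the targets
this code runs on):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int32_t n;
      for (n = -32768; n <= 32767; ++n) {
        const int16_t v = (int16_t)n;
        const int16_t halved = (int16_t)((v - (v >> 15)) >> 1);
        assert(halved == v / 2);
      }
      printf("n/2 identity holds for all int16_t values\n");
      return 0;
    }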
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 752308160..c098ad31b 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -1184,6 +1184,45 @@ static INLINE void transpose_u8_16x16(
   *o15 = e7.val[1];
 }

+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+  int16x8_t t[8];
+
+  // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3.
+  t[0] = in0[8];
+  t[1] = in0[9];
+  t[2] = in0[10];
+  t[3] = in0[11];
+  t[4] = in0[12];
+  t[5] = in0[13];
+  t[6] = in0[14];
+  t[7] = in0[15];
+  in0[8] = in1[0];
+  in0[9] = in1[1];
+  in0[10] = in1[2];
+  in0[11] = in1[3];
+  in0[12] = in1[4];
+  in0[13] = in1[5];
+  in0[14] = in1[6];
+  in0[15] = in1[7];
+  in1[0] = t[0];
+  in1[1] = t[1];
+  in1[2] = t[2];
+  in1[3] = t[3];
+  in1[4] = t[4];
+  in1[5] = t[5];
+  in1[6] = t[6];
+  in1[7] = t[7];
+
+  transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
+                    &in0[6], &in0[7]);
+  transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
+                    &in0[14], &in0[15]);
+  transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
+                    &in1[6], &in1[7]);
+  transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
+                    &in1[14], &in1[15]);
+}
+
 static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                              const int a_stride, uint8x8_t *a0,
                                              uint8x8_t *a1, uint8x8_t *a2,
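
transpose_s16_16x16() above is built on the block-matrix identity for a 16x16
matrix split into 8x8 quadrants:

    [ A  B ]^T   [ A^T  C^T ]
    [ C  D ]   = [ B^T  D^T ]

in0 holds the left 16x8 half (A above C) and in1 the right half (B above D),
so swapping the off-diagonal quadrants (in0[8..15] with in1[0..7]) and then
transposing each 8x8 quadrant in place with transpose_s16_8x8() produces the
full 16x16 transpose.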