author     Konstantinos Margaritis <konma@vectorcamp.gr>   2022-03-11 20:19:25 +0200
committer  Konstantinos Margaritis <konma@vectorcamp.gr>   2022-03-17 13:07:12 +0200
commit     f79d256cb28a4228df66a7a6d1cebbd9071e0639 (patch)
tree       af6e7e70ddb165208e3a7ad22132ac48f69e1b21 /vpx_dsp/arm
parent     8a50f70ffc5eea6c2392a5c176bfe43e450ecebc (diff)
Make sure only NEON FDCT functions are called.
[NEON] Added vpx_fdct4x4_pass1_neon(), vpx_fdct8x8_pass1_notranspose_neon()
and vpx_fdct8x8_pass1_neon() to avoid code duplication.
Refactored vpx_fdct4x4_neon() and vpx_fdct8x8_neon() to use the above.
Renamed dct_body() to vpx_fdct16x16_body() to reuse it later.
Added transpose_s16_16x16().

I have run make test and all tests/configurations seem to pass.

Profiled using this command on an Ampere Altra VM:

  sudo perf record -g ./vpxenc --codec=vp9 --height=1080 --width=1920 \
    --fps=25/1 --limit=20 -o output.mkv \
    ../original_videos_Sports_1080P_Sports_1080P-0063.mkv --debug --rt

Before this optimization:

  1.32%  1.32%  vpxenc  vpxenc  [.] vpx_fdct4x4_neon
  0.16%  0.16%  vpxenc  vpxenc  [.] vpx_fdct4x4_c
  0.79%  0.79%  vpxenc  vpxenc  [.] vpx_fdct8x8_c
  0.52%  0.52%  vpxenc  vpxenc  [.] vpx_fdct8x8_neon
  1.23%  1.23%  vpxenc  vpxenc  [.] vpx_fdct16x16_c
  0.54%  0.54%  vpxenc  vpxenc  [.] vpx_fdct16x16_neon

So, even though a _neon() version exists, the C version was called as well.

After this patch:

  1.42%  1.36%  vpxenc  vpxenc  [.] vpx_fdct4x4_neon
  0.87%  0.82%  vpxenc  vpxenc  [.] vpx_fdct8x8_neon
  0.74%  0.74%  vpxenc  vpxenc  [.] vpx_fdct16x16_neon

Change-Id: Id4e1dd315c67b4355fe4e5a1b59e181a349f16d0
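
The shape of the refactor is easiest to see in the new vpx_fdct4x4_neon()
below: the transform body that was previously duplicated per pass is now a
single shared helper called once per pass. A condensed sketch (load,
rounding and store details elided; see the full diff):

    void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
                          int stride) {
      int i;
      int16x4_t in[4];
      /* load the four rows and pre-scale by 16 (<< 4) ... */
      for (i = 0; i < 2; ++i) {
        vpx_fdct4x4_pass1_neon(in);  /* shared body: transform + transpose */
      }
      /* add 1, shift right by 2, then store to final_output ... */
    }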
Diffstat (limited to 'vpx_dsp/arm')
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.c   319
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.h   327
-rw-r--r--  vpx_dsp/arm/fdct_neon.c         61
-rw-r--r--  vpx_dsp/arm/fdct_neon.h        213
-rw-r--r--  vpx_dsp/arm/fwd_txfm_neon.c    212
-rw-r--r--  vpx_dsp/arm/transpose_neon.h    39
6 files changed, 629 insertions, 542 deletions
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c
index 6b2bebd09..67f43246a 100644
--- a/vpx_dsp/arm/fdct16x16_neon.c
+++ b/vpx_dsp/arm/fdct16x16_neon.c
@@ -15,6 +15,7 @@
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
// Some builds of gcc 4.9.2 and .3 have trouble with some of the inline
// functions.
@@ -27,316 +28,6 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
#else
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
- b[0] = vld1q_s16(a);
- a += stride;
- b[1] = vld1q_s16(a);
- a += stride;
- b[2] = vld1q_s16(a);
- a += stride;
- b[3] = vld1q_s16(a);
- a += stride;
- b[4] = vld1q_s16(a);
- a += stride;
- b[5] = vld1q_s16(a);
- a += stride;
- b[6] = vld1q_s16(a);
- a += stride;
- b[7] = vld1q_s16(a);
- a += stride;
- b[8] = vld1q_s16(a);
- a += stride;
- b[9] = vld1q_s16(a);
- a += stride;
- b[10] = vld1q_s16(a);
- a += stride;
- b[11] = vld1q_s16(a);
- a += stride;
- b[12] = vld1q_s16(a);
- a += stride;
- b[13] = vld1q_s16(a);
- a += stride;
- b[14] = vld1q_s16(a);
- a += stride;
- b[15] = vld1q_s16(a);
-}
-
-// Store 8 16x8 values, assuming stride == 16.
-static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
- store_s16q_to_tran_low(a, b[0]);
- a += 16;
- store_s16q_to_tran_low(a, b[1]);
- a += 16;
- store_s16q_to_tran_low(a, b[2]);
- a += 16;
- store_s16q_to_tran_low(a, b[3]);
- a += 16;
- store_s16q_to_tran_low(a, b[4]);
- a += 16;
- store_s16q_to_tran_low(a, b[5]);
- a += 16;
- store_s16q_to_tran_low(a, b[6]);
- a += 16;
- store_s16q_to_tran_low(a, b[7]);
-}
-
-// Load step of each pass. Add and subtract clear across the input, requiring
-// all 16 values to be loaded. For the first pass it also multiplies by 4.
-
-// To maybe reduce register usage this could be combined with the load() step to
-// get the first 4 and last 4 values, cross those, then load the middle 8 values
-// and cross them.
-static INLINE void cross_input(const int16x8_t *a /*[16]*/,
- int16x8_t *b /*[16]*/, const int pass) {
- if (pass == 0) {
- b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
- b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
- b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
- b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
- b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
- b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
- b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
- b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
-
- b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
- b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
- b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
- b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
- b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
- b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
- b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
- b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
- } else {
- b[0] = vaddq_s16(a[0], a[15]);
- b[1] = vaddq_s16(a[1], a[14]);
- b[2] = vaddq_s16(a[2], a[13]);
- b[3] = vaddq_s16(a[3], a[12]);
- b[4] = vaddq_s16(a[4], a[11]);
- b[5] = vaddq_s16(a[5], a[10]);
- b[6] = vaddq_s16(a[6], a[9]);
- b[7] = vaddq_s16(a[7], a[8]);
-
- b[8] = vsubq_s16(a[7], a[8]);
- b[9] = vsubq_s16(a[6], a[9]);
- b[10] = vsubq_s16(a[5], a[10]);
- b[11] = vsubq_s16(a[4], a[11]);
- b[12] = vsubq_s16(a[3], a[12]);
- b[13] = vsubq_s16(a[2], a[13]);
- b[14] = vsubq_s16(a[1], a[14]);
- b[15] = vsubq_s16(a[0], a[15]);
- }
-}
-
-// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
-// because this only adds 1, not 1 << 2.
-static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
- const int16x8_t one = vdupq_n_s16(1);
- a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
- a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
- a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
- a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
- a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
- a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
- a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
- a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
- a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
- a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
- a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
- a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
- a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
- a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
- a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
- a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
-}
-
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
- const tran_high_t c, int16x8_t *add,
- int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
- const tran_coef_t c0,
- const tran_coef_t c1, int16x8_t *add,
- int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
- const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
- const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
- const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
- const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
- int16x8_t *b /*[8]*/) {
- // Swap 16 bit elements.
- const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
- const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
- const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
- const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
- // Swap 32 bit elements.
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
- vreinterpretq_s32_s16(c1.val[0]));
- const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
- vreinterpretq_s32_s16(c1.val[1]));
- const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
- vreinterpretq_s32_s16(c3.val[0]));
- const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
- vreinterpretq_s32_s16(c3.val[1]));
-
- // Swap 64 bit elements
- const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
- const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
- const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
- const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
- b[0] = e0.val[0];
- b[1] = e1.val[0];
- b[2] = e2.val[0];
- b[3] = e3.val[0];
- b[4] = e0.val[1];
- b[5] = e1.val[1];
- b[6] = e2.val[1];
- b[7] = e3.val[1];
-}
-
-// Main body of fdct16x16.
-static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
- int16x8_t s[8];
- int16x8_t x[4];
- int16x8_t step[8];
-
- // stage 1
- // From fwd_txfm.c: Work on the first eight values; fdct8(input,
- // even_results);"
- s[0] = vaddq_s16(in[0], in[7]);
- s[1] = vaddq_s16(in[1], in[6]);
- s[2] = vaddq_s16(in[2], in[5]);
- s[3] = vaddq_s16(in[3], in[4]);
- s[4] = vsubq_s16(in[3], in[4]);
- s[5] = vsubq_s16(in[2], in[5]);
- s[6] = vsubq_s16(in[1], in[6]);
- s[7] = vsubq_s16(in[0], in[7]);
-
- // fdct4(step, step);
- x[0] = vaddq_s16(s[0], s[3]);
- x[1] = vaddq_s16(s[1], s[2]);
- x[2] = vsubq_s16(s[1], s[2]);
- x[3] = vsubq_s16(s[0], s[3]);
-
- // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
- // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
- butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
- // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
- // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
- butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
-
- // Stage 2
- // Re-using source s5/s6
- // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
- // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
- butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
-
- // Stage 3
- x[0] = vaddq_s16(s[4], s[5]);
- x[1] = vsubq_s16(s[4], s[5]);
- x[2] = vsubq_s16(s[7], s[6]);
- x[3] = vaddq_s16(s[7], s[6]);
-
- // Stage 4
- // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
- // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
- butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
- // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
- // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
- butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
-
- // step 2
- // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
- // That file distinguished between "in_high" and "step1" but the only
- // difference is that "in_high" is the first 8 values and "step 1" is the
- // second. Here, since they are all in one array, "step1" values are += 8.
-
- // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
- // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
- // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
- // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
- butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
- butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
-
- // step 3
- s[0] = vaddq_s16(in[8], s[3]);
- s[1] = vaddq_s16(in[9], s[2]);
- x[0] = vsubq_s16(in[9], s[2]);
- x[1] = vsubq_s16(in[8], s[3]);
- x[2] = vsubq_s16(in[15], s[4]);
- x[3] = vsubq_s16(in[14], s[5]);
- s[6] = vaddq_s16(in[14], s[5]);
- s[7] = vaddq_s16(in[15], s[4]);
-
- // step 4
- // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
- // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
- butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
-
- // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
- // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
- butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
-
- // step 5
- step[0] = vaddq_s16(s[0], s[1]);
- step[1] = vsubq_s16(s[0], s[1]);
- step[2] = vaddq_s16(x[1], s[2]);
- step[3] = vsubq_s16(x[1], s[2]);
- step[4] = vsubq_s16(x[2], s[5]);
- step[5] = vaddq_s16(x[2], s[5]);
- step[6] = vsubq_s16(s[7], s[6]);
- step[7] = vaddq_s16(s[7], s[6]);
-
- // step 6
- // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
- // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
- // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
- // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
- // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
- // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
- // cospi_22_64)
- // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
- // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
- butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
- &out[7]);
- butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
- &out[15]);
- butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
- &out[3]);
- butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
- &out[11]);
-}
-
void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp0[16];
int16x8_t temp1[16];
@@ -346,12 +37,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Left half.
load(input, stride, temp0);
cross_input(temp0, temp1, 0);
- dct_body(temp1, temp0);
+ vpx_fdct16x16_body(temp1, temp0);
// Right half.
load(input + 8, stride, temp1);
cross_input(temp1, temp2, 0);
- dct_body(temp2, temp1);
+ vpx_fdct16x16_body(temp2, temp1);
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
@@ -359,7 +50,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
transpose_8x8(&temp1[0], &temp2[8]);
partial_round_shift(temp2);
cross_input(temp2, temp3, 1);
- dct_body(temp3, temp2);
+ vpx_fdct16x16_body(temp3, temp2);
transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
&temp2[5], &temp2[6], &temp2[7]);
transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -375,7 +66,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
&temp1[13], &temp1[14], &temp1[15]);
partial_round_shift(temp1);
cross_input(temp1, temp0, 1);
- dct_body(temp0, temp1);
+ vpx_fdct16x16_body(temp0, temp1);
transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
&temp1[5], &temp1[6], &temp1[7]);
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
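
A note on the butterfly helpers that move into the new header below: they
implement fdct_round_shift((a +/- b) * c) with widening multiplies followed
by vqrshrn_n_s32(x, 14), a saturating rounding narrow by DCT_CONST_BITS.
A scalar sketch of the same arithmetic (assuming fdct_round_shift() is
ROUND_POWER_OF_TWO(x, 14) as in vpx_dsp/txfm_common.h; the NEON version
additionally saturates the result to the int16 range):

    /* Scalar reference for butterfly_one_coeff(); a sketch, not libvpx code. */
    static void butterfly_one_coeff_ref(int16_t a, int16_t b, int32_t c,
                                        int16_t *add, int16_t *sub) {
      const int32_t sum = (int32_t)a * c + (int32_t)b * c;
      const int32_t diff = (int32_t)a * c - (int32_t)b * c;
      *add = (int16_t)((sum + (1 << 13)) >> 14);  /* rounding shift by 14 */
      *sub = (int16_t)((diff + (1 << 13)) >> 14);
    }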
diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h
new file mode 100644
index 000000000..839123899
--- /dev/null
+++ b/vpx_dsp/arm/fdct16x16_neon.h
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
+ b[0] = vld1q_s16(a);
+ a += stride;
+ b[1] = vld1q_s16(a);
+ a += stride;
+ b[2] = vld1q_s16(a);
+ a += stride;
+ b[3] = vld1q_s16(a);
+ a += stride;
+ b[4] = vld1q_s16(a);
+ a += stride;
+ b[5] = vld1q_s16(a);
+ a += stride;
+ b[6] = vld1q_s16(a);
+ a += stride;
+ b[7] = vld1q_s16(a);
+ a += stride;
+ b[8] = vld1q_s16(a);
+ a += stride;
+ b[9] = vld1q_s16(a);
+ a += stride;
+ b[10] = vld1q_s16(a);
+ a += stride;
+ b[11] = vld1q_s16(a);
+ a += stride;
+ b[12] = vld1q_s16(a);
+ a += stride;
+ b[13] = vld1q_s16(a);
+ a += stride;
+ b[14] = vld1q_s16(a);
+ a += stride;
+ b[15] = vld1q_s16(a);
+}
+
+// Store 8 16x8 values, assuming stride == 16.
+static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
+ store_s16q_to_tran_low(a, b[0]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[1]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[2]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[3]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[4]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[5]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[6]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[7]);
+}
+
+// Load step of each pass. Add and subtract clear across the input, requiring
+// all 16 values to be loaded. For the first pass it also multiplies by 4.
+
+// To maybe reduce register usage this could be combined with the load() step to
+// get the first 4 and last 4 values, cross those, then load the middle 8 values
+// and cross them.
+static INLINE void cross_input(const int16x8_t *a /*[16]*/,
+ int16x8_t *b /*[16]*/, const int pass) {
+ if (pass == 0) {
+ b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
+ b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
+ b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
+ b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
+ b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
+ b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
+ b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
+ b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
+
+ b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
+ b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
+ b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
+ b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
+ b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
+ b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
+ b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
+ b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
+ } else {
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+ }
+}
+
+// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
+// because this only adds 1, not 1 << 2.
+static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
+ const int16x8_t one = vdupq_n_s16(1);
+ a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
+ a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
+ a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
+ a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
+ a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
+ a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
+ a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
+ a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
+ a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
+ a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
+ a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
+ a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
+ a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
+ a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
+ a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
+ a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
+}
+
+// fdct_round_shift((a +/- b) * c)
+static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_high_t c, int16x8_t *add,
+ int16x8_t *sub) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c0 +/- b * c1)
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_coef_t c0,
+ const tran_coef_t c1, int16x8_t *add,
+ int16x8_t *sub) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
+ const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
+ const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
+ const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
+ const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
+// are all in-place.
+static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
+ int16x8_t *b /*[8]*/) {
+ // Swap 16 bit elements.
+ const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements.
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+ const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+ vreinterpretq_s32_s16(c3.val[0]));
+ const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+ vreinterpretq_s32_s16(c3.val[1]));
+
+ // Swap 64 bit elements
+ const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+ const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+ const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+ const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+ b[0] = e0.val[0];
+ b[1] = e1.val[0];
+ b[2] = e2.val[0];
+ b[3] = e3.val[0];
+ b[4] = e0.val[1];
+ b[5] = e1.val[1];
+ b[6] = e2.val[1];
+ b[7] = e3.val[1];
+}
+
+// Main body of fdct16x16.
+static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
+ int16x8_t *out /*[16]*/) {
+ int16x8_t s[8];
+ int16x8_t x[4];
+ int16x8_t step[8];
+
+ // stage 1
+ // From fwd_txfm.c: Work on the first eight values; fdct8(input,
+ // even_results);"
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+ butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+
+ // Stage 2
+ // Re-using source s5/s6
+ // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+ // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+ butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], s[5]);
+ x[1] = vsubq_s16(s[4], s[5]);
+ x[2] = vsubq_s16(s[7], s[6]);
+ x[3] = vaddq_s16(s[7], s[6]);
+
+ // Stage 4
+ // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
+ // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+
+ // step 2
+ // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
+ // That file distinguished between "in_high" and "step1" but the only
+ // difference is that "in_high" is the first 8 values and "step 1" is the
+ // second. Here, since they are all in one array, "step1" values are += 8.
+
+ // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+ // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+ // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+ // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+ butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+ butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+
+ // step 3
+ s[0] = vaddq_s16(in[8], s[3]);
+ s[1] = vaddq_s16(in[9], s[2]);
+ x[0] = vsubq_s16(in[9], s[2]);
+ x[1] = vsubq_s16(in[8], s[3]);
+ x[2] = vsubq_s16(in[15], s[4]);
+ x[3] = vsubq_s16(in[14], s[5]);
+ s[6] = vaddq_s16(in[14], s[5]);
+ s[7] = vaddq_s16(in[15], s[4]);
+
+ // step 4
+ // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
+ // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
+ butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+
+ // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
+ butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+
+ // step 5
+ step[0] = vaddq_s16(s[0], s[1]);
+ step[1] = vsubq_s16(s[0], s[1]);
+ step[2] = vaddq_s16(x[1], s[2]);
+ step[3] = vsubq_s16(x[1], s[2]);
+ step[4] = vsubq_s16(x[2], s[5]);
+ step[5] = vaddq_s16(x[2], s[5]);
+ step[6] = vsubq_s16(s[7], s[6]);
+ step[7] = vaddq_s16(s[7], s[6]);
+
+ // step 6
+ // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
+ // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
+ // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
+ // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
+ // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
+ // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
+ // cospi_22_64)
+ // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
+ // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
+ butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+ &out[7]);
+ butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+ &out[15]);
+ butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+ &out[3]);
+ butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+ &out[11]);
+}
+
+#endif // VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_
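
One subtlety in partial_round_shift() above: a true NEON rounding shift,
vrshrq_n_s16(a, 2), computes (a + 2) >> 2 per lane, while this pass needs
(a + 1) >> 2, hence the explicit add-one-then-shift pair. For a lane value
of 6 the two disagree:

    (6 + 1) >> 2 = 1   /* partial_round_shift */
    (6 + 2) >> 2 = 2   /* what vrshrq_n_s16(a, 2) would produce */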
diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c
index 3708cbb11..2827791f1 100644
--- a/vpx_dsp/arm/fdct_neon.c
+++ b/vpx_dsp/arm/fdct_neon.c
@@ -15,6 +15,7 @@
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
@@ -22,67 +23,25 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
int i;
// input[M * stride] * 16
- int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
- int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
- int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
- int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+ int16x4_t in[4];
+ in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
// If the very first value != 0, then add 1.
if (input[0] != 0) {
const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
- input_0 = vadd_s16(input_0, one);
+ in[0] = vadd_s16(in[0], one);
}
-
for (i = 0; i < 2; ++i) {
- const int16x8_t input_01 = vcombine_s16(input_0, input_1);
- const int16x8_t input_32 = vcombine_s16(input_3, input_2);
-
- // in_0 +/- in_3, in_1 +/- in_2
- const int16x8_t s_01 = vaddq_s16(input_01, input_32);
- const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
- // step_0 +/- step_1, step_2 +/- step_3
- const int16x4_t s_0 = vget_low_s16(s_01);
- const int16x4_t s_1 = vget_high_s16(s_01);
- const int16x4_t s_2 = vget_high_s16(s_32);
- const int16x4_t s_3 = vget_low_s16(s_32);
-
- // (s_0 +/- s_1) * cospi_16_64
- // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
- const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
- const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
- const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
- const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
- // fdct_round_shift
- int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
- int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
- // s_3 * cospi_8_64 + s_2 * cospi_24_64
- // s_3 * cospi_24_64 - s_2 * cospi_8_64
- const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
- const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
- const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
- const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
- // fdct_round_shift
- int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
- int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
- transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
- input_0 = out_0;
- input_1 = out_1;
- input_2 = out_2;
- input_3 = out_3;
+ vpx_fdct4x4_pass1_neon(in);
}
-
{
// Not quite a rounding shift. Only add 1 despite shifting by 2.
const int16x8_t one = vdupq_n_s16(1);
- int16x8_t out_01 = vcombine_s16(input_0, input_1);
- int16x8_t out_23 = vcombine_s16(input_2, input_3);
+ int16x8_t out_01 = vcombine_s16(in[0], in[1]);
+ int16x8_t out_23 = vcombine_s16(in[2], in[3]);
out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
store_s16q_to_tran_low(final_output + 0 * 8, out_01);
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
new file mode 100644
index 000000000..28d7d86bf
--- /dev/null
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // (s_0 +/- s_1) * cospi_16_64
+ // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+ const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+
+ // fdct_round_shift
+ int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+ int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+
+ const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+ const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+
+ // fdct_round_shift
+ int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+ int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+ transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+ in[0] = out_0;
+ in[1] = out_1;
+ in[2] = out_2;
+ in[3] = out_3;
+}
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
+ const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
+ const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
+ const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
+ const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
+ const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
+ const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
+ const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+ int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+ int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+ int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+ // fdct4(step, step);
+ int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
+ out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
+ out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
+ out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
+ }
+ // Stage 2
+ v_x0 = vsubq_s16(v_s6, v_s5);
+ v_x1 = vaddq_s16(v_s6, v_s5);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x8_t ab = vcombine_s16(a, b);
+ const int16x8_t cd = vcombine_s16(c, d);
+ // Stage 3
+ v_x0 = vaddq_s16(v_s4, ab);
+ v_x1 = vsubq_s16(v_s4, ab);
+ v_x2 = vsubq_s16(v_s7, cd);
+ v_x3 = vaddq_s16(v_s7, cd);
+ }
+ // Stage 4
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
+ out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
+ out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
+ out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
+ }
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass1_notranspose_neon(in, out);
+ // transpose 8x8
+ // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
+ // columns.
+ {
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ // 04 05 06 07 44 45 46 47
+ // 14 15 16 17 54 55 56 57
+ // 24 25 26 27 64 65 66 67
+ // 34 35 36 37 74 75 76 77
+ const int32x4x2_t r02_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
+ const int32x4x2_t r13_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
+ const int32x4x2_t r46_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
+ const int32x4x2_t r57_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
+ const int16x8x2_t r01_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+ vreinterpretq_s16_s32(r13_s32.val[0]));
+ const int16x8x2_t r23_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+ vreinterpretq_s16_s32(r13_s32.val[1]));
+ const int16x8x2_t r45_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+ vreinterpretq_s16_s32(r57_s32.val[0]));
+ const int16x8x2_t r67_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+ vreinterpretq_s16_s32(r57_s32.val[1]));
+ in[0] = r01_s16.val[0];
+ in[1] = r01_s16.val[1];
+ in[2] = r23_s16.val[0];
+ in[3] = r23_s16.val[1];
+ in[4] = r45_s16.val[0];
+ in[5] = r45_s16.val[1];
+ in[6] = r67_s16.val[0];
+ in[7] = r67_s16.val[1];
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+}
+#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_
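
The two 8x8 helpers above let callers choose whether the transpose is
folded in: the notranspose variant exists for later reuse (per the commit
message), while vpx_fdct8x8_neon() in the next file simply runs the
transposing variant once per pass. A condensed usage sketch mirroring that
caller:

    int16x8_t in[8];
    /* load eight rows and pre-scale by 4 (<< 2) ... */
    for (i = 0; i < 2; ++i) {
      vpx_fdct8x8_pass1_neon(in);  /* transform + 8x8 transpose, in place */
    }
    /* post-condition (divide by two) and store ... */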
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c
index 374a262b9..d9161c6d3 100644
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -15,196 +15,54 @@
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
int i;
// stage 1
- int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
- int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
- int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
- int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
- int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
- int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
- int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
- int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+ int16x8_t in[8];
+ in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
for (i = 0; i < 2; ++i) {
- int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
- const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
- const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
- const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
- const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
- const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
- const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
- const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
- const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
- // fdct4(step, step);
- int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
- int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
- int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
- int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
- // fdct4(step, step);
- int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
- int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
- int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
- int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
- v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
- v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
- v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
- v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
- v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
- v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
- out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
- out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
- out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
- }
- // Stage 2
- v_x0 = vsubq_s16(v_s6, v_s5);
- v_x1 = vaddq_s16(v_s6, v_s5);
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x8_t ab = vcombine_s16(a, b);
- const int16x8_t cd = vcombine_s16(c, d);
- // Stage 3
- v_x0 = vaddq_s16(v_s4, ab);
- v_x1 = vsubq_s16(v_s4, ab);
- v_x2 = vsubq_s16(v_s7, cd);
- v_x3 = vaddq_s16(v_s7, cd);
- }
- // Stage 4
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
- v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
- v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
- v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
- v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
- v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
- v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
- v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
- v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
- v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
- v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
- out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
- out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
- out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
- }
- // transpose 8x8
- // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
- // columns.
- {
- // 00 01 02 03 40 41 42 43
- // 10 11 12 13 50 51 52 53
- // 20 21 22 23 60 61 62 63
- // 30 31 32 33 70 71 72 73
- // 04 05 06 07 44 45 46 47
- // 14 15 16 17 54 55 56 57
- // 24 25 26 27 64 65 66 67
- // 34 35 36 37 74 75 76 77
- const int32x4x2_t r02_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
- const int32x4x2_t r13_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
- const int32x4x2_t r46_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
- const int32x4x2_t r57_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
- const int16x8x2_t r01_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
- vreinterpretq_s16_s32(r13_s32.val[0]));
- const int16x8x2_t r23_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
- vreinterpretq_s16_s32(r13_s32.val[1]));
- const int16x8x2_t r45_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
- vreinterpretq_s16_s32(r57_s32.val[0]));
- const int16x8x2_t r67_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
- vreinterpretq_s16_s32(r57_s32.val[1]));
- input_0 = r01_s16.val[0];
- input_1 = r01_s16.val[1];
- input_2 = r23_s16.val[0];
- input_3 = r23_s16.val[1];
- input_4 = r45_s16.val[0];
- input_5 = r45_s16.val[1];
- input_6 = r67_s16.val[0];
- input_7 = r67_s16.val[1];
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
+ vpx_fdct8x8_pass1_neon(in);
} // for
{
// from vpx_dct_sse2.c
// Post-condition (division by two)
// division of two 16 bits signed numbers using shifts
// n / 2 = (n - (n >> 15)) >> 1
- const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
- const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
- const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
- const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
- const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
- const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
- const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
- const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
- input_0 = vhsubq_s16(input_0, sign_in0);
- input_1 = vhsubq_s16(input_1, sign_in1);
- input_2 = vhsubq_s16(input_2, sign_in2);
- input_3 = vhsubq_s16(input_3, sign_in3);
- input_4 = vhsubq_s16(input_4, sign_in4);
- input_5 = vhsubq_s16(input_5, sign_in5);
- input_6 = vhsubq_s16(input_6, sign_in6);
- input_7 = vhsubq_s16(input_7, sign_in7);
+ const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+ in[0] = vhsubq_s16(in[0], sign_in0);
+ in[1] = vhsubq_s16(in[1], sign_in1);
+ in[2] = vhsubq_s16(in[2], sign_in2);
+ in[3] = vhsubq_s16(in[3], sign_in3);
+ in[4] = vhsubq_s16(in[4], sign_in4);
+ in[5] = vhsubq_s16(in[5], sign_in5);
+ in[6] = vhsubq_s16(in[6], sign_in6);
+ in[7] = vhsubq_s16(in[7], sign_in7);
// store results
- store_s16q_to_tran_low(final_output + 0 * 8, input_0);
- store_s16q_to_tran_low(final_output + 1 * 8, input_1);
- store_s16q_to_tran_low(final_output + 2 * 8, input_2);
- store_s16q_to_tran_low(final_output + 3 * 8, input_3);
- store_s16q_to_tran_low(final_output + 4 * 8, input_4);
- store_s16q_to_tran_low(final_output + 5 * 8, input_5);
- store_s16q_to_tran_low(final_output + 6 * 8, input_6);
- store_s16q_to_tran_low(final_output + 7 * 8, input_7);
+ store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+ store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+ store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+ store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+ store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+ store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+ store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+ store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
}
}
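
The post-condition above halves each signed 16-bit coefficient with
truncation toward zero: n / 2 == (n - (n >> 15)) >> 1, because n >> 15 is
-1 for negative n and 0 otherwise, and vhsubq_s16(a, b) computes
(a - b) >> 1 in a single instruction. A scalar check over the full int16
range (a standalone sketch, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    static void check_halving(void) {
      int32_t n;
      for (n = -32768; n <= 32767; ++n) {
        const int16_t v = (int16_t)n;
        /* v is promoted to int, so >> is an arithmetic shift here */
        assert(v / 2 == (int16_t)((v - (v >> 15)) >> 1));
      }
    }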
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 752308160..c098ad31b 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -1184,6 +1184,45 @@ static INLINE void transpose_u8_16x16(
*o15 = e7.val[1];
}
+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+ int16x8_t t[8];
+
+ // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3.
+ t[0] = in0[8];
+ t[1] = in0[9];
+ t[2] = in0[10];
+ t[3] = in0[11];
+ t[4] = in0[12];
+ t[5] = in0[13];
+ t[6] = in0[14];
+ t[7] = in0[15];
+ in0[8] = in1[0];
+ in0[9] = in1[1];
+ in0[10] = in1[2];
+ in0[11] = in1[3];
+ in0[12] = in1[4];
+ in0[13] = in1[5];
+ in0[14] = in1[6];
+ in0[15] = in1[7];
+ in1[0] = t[0];
+ in1[1] = t[1];
+ in1[2] = t[2];
+ in1[3] = t[3];
+ in1[4] = t[4];
+ in1[5] = t[5];
+ in1[6] = t[6];
+ in1[7] = t[7];
+
+ transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
+ &in0[6], &in0[7]);
+ transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
+ &in0[14], &in0[15]);
+ transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
+ &in1[6], &in1[7]);
+ transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
+ &in1[14], &in1[15]);
+}
+
static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
const int a_stride, uint8x8_t *a0,
uint8x8_t *a1, uint8x8_t *a2,
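
Why the quadrant swap in transpose_s16_16x16() works: viewing the 16x16
matrix as four 8x8 blocks,

    M = | A  B |        M^T = | A^T  C^T |
        | C  D |              | B^T  D^T |

so exchanging the off-diagonal blocks B and C first, then transposing each
8x8 block in place with transpose_s16_8x8(), produces the full 16x16
transpose.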