author     Konstantinos Margaritis <konma@vectorcamp.gr>  2022-03-11 20:19:25 +0200
committer  Konstantinos Margaritis <konma@vectorcamp.gr>  2022-03-17 13:07:12 +0200
commit     f79d256cb28a4228df66a7a6d1cebbd9071e0639 (patch)
tree       af6e7e70ddb165208e3a7ad22132ac48f69e1b21 /vpx_dsp/arm
parent     8a50f70ffc5eea6c2392a5c176bfe43e450ecebc (diff)
Make sure only NEON FDCT functions are called.
[NEON]
Added vpx_fdct4x4_pass1_neon(), vpx_fdct8x8_pass1_notranspose_neon() and
vpx_fdct8x8_pass1_neon() to avoid code duplication.
Refactored vpx_fdct4x4_neon() and vpx_fdct8x8_neon() to use the above.
Renamed dct_body() to vpx_fdct16x16_body() so it can be reused later.
Added transpose_s16_16x16().
I have run make test and all tests/configurations seem to pass.
Profiled using this command on an Ampere Altra VM:
sudo perf record -g ./vpxenc --codec=vp9 --height=1080 --width=1920 \
--fps=25/1 --limit=20 -o output.mkv \
../original_videos_Sports_1080P_Sports_1080P-0063.mkv --debug --rt
Before this optimization:
1.32% 1.32% vpxenc vpxenc [.] vpx_fdct4x4_neon
0.16% 0.16% vpxenc vpxenc [.] vpx_fdct4x4_c
0.79% 0.79% vpxenc vpxenc [.] vpx_fdct8x8_c
0.52% 0.52% vpxenc vpxenc [.] vpx_fdct8x8_neon
1.23% 1.23% vpxenc vpxenc [.] vpx_fdct16x16_c
0.54% 0.54% vpxenc vpxenc [.] vpx_fdct16x16_neon
So, even though a _neon() version exists, the C version was also being
called; a sketch of the dispatch mechanism follows the commit message.
After this patch:
1.42% 1.36% vpxenc vpxenc [.] vpx_fdct4x4_neon
0.87% 0.82% vpxenc vpxenc [.] vpx_fdct8x8_neon
0.74% 0.74% vpxenc vpxenc [.] vpx_fdct16x16_neon
Change-Id: Id4e1dd315c67b4355fe4e5a1b59e181a349f16d0
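
For context on how these kernels are normally reached: in libvpx builds with
runtime CPU detection, each transform is called through an RTCD function
pointer that setup code aims at the best implementation available. Below is a
minimal sketch of that dispatch shape; setup_rtcd() and its have_neon flag are
hypothetical stand-ins for the generated RTCD code, and tran_low_t is assumed
to be the non-high-bitdepth typedef.

    #include <stdint.h>

    typedef int16_t tran_low_t; /* assumption: non-high-bitdepth build */

    /* The two implementations that show up in the profile. */
    void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
    void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);

    /* Dispatch pointer the encoder is meant to call. */
    static void (*vpx_fdct4x4)(const int16_t *, tran_low_t *, int) =
        vpx_fdct4x4_c;

    static void setup_rtcd(int have_neon) { /* hypothetical helper */
      if (have_neon) vpx_fdct4x4 = vpx_fdct4x4_neon;
    }

Any path that reaches a _c kernel without going through such a pointer, or
through a helper the NEON build does not provide, shows both symbols in a
profile, which is consistent with the numbers above.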
Diffstat (limited to 'vpx_dsp/arm')
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.c  319
-rw-r--r--  vpx_dsp/arm/fdct16x16_neon.h  327
-rw-r--r--  vpx_dsp/arm/fdct_neon.c        61
-rw-r--r--  vpx_dsp/arm/fdct_neon.h       213
-rw-r--r--  vpx_dsp/arm/fwd_txfm_neon.c   212
-rw-r--r--  vpx_dsp/arm/transpose_neon.h   39
6 files changed, 629 insertions, 542 deletions
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c
index 6b2bebd09..67f43246a 100644
--- a/vpx_dsp/arm/fdct16x16_neon.c
+++ b/vpx_dsp/arm/fdct16x16_neon.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"

 // Some builds of gcc 4.9.2 and .3 have trouble with some of the inline
 // functions.
@@ -27,316 +28,6 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {

 #else

-static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
-  b[0] = vld1q_s16(a);
-  a += stride;
-  b[1] = vld1q_s16(a);
-  a += stride;
-  b[2] = vld1q_s16(a);
-  a += stride;
-  b[3] = vld1q_s16(a);
-  a += stride;
-  b[4] = vld1q_s16(a);
-  a += stride;
-  b[5] = vld1q_s16(a);
-  a += stride;
-  b[6] = vld1q_s16(a);
-  a += stride;
-  b[7] = vld1q_s16(a);
-  a += stride;
-  b[8] = vld1q_s16(a);
-  a += stride;
-  b[9] = vld1q_s16(a);
-  a += stride;
-  b[10] = vld1q_s16(a);
-  a += stride;
-  b[11] = vld1q_s16(a);
-  a += stride;
-  b[12] = vld1q_s16(a);
-  a += stride;
-  b[13] = vld1q_s16(a);
-  a += stride;
-  b[14] = vld1q_s16(a);
-  a += stride;
-  b[15] = vld1q_s16(a);
-}
-
-// Store 8 16x8 values, assuming stride == 16.
-static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
-  store_s16q_to_tran_low(a, b[0]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[1]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[2]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[3]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[4]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[5]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[6]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[7]);
-}
-
-// Load step of each pass. Add and subtract clear across the input, requiring
-// all 16 values to be loaded. For the first pass it also multiplies by 4.
-
-// To maybe reduce register usage this could be combined with the load() step to
-// get the first 4 and last 4 values, cross those, then load the middle 8 values
-// and cross them.
-static INLINE void cross_input(const int16x8_t *a /*[16]*/,
-                               int16x8_t *b /*[16]*/, const int pass) {
-  if (pass == 0) {
-    b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
-    b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
-    b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
-    b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
-    b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
-    b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
-    b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
-    b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
-
-    b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
-    b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
-    b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
-    b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
-    b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
-    b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
-    b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
-    b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
-  } else {
-    b[0] = vaddq_s16(a[0], a[15]);
-    b[1] = vaddq_s16(a[1], a[14]);
-    b[2] = vaddq_s16(a[2], a[13]);
-    b[3] = vaddq_s16(a[3], a[12]);
-    b[4] = vaddq_s16(a[4], a[11]);
-    b[5] = vaddq_s16(a[5], a[10]);
-    b[6] = vaddq_s16(a[6], a[9]);
-    b[7] = vaddq_s16(a[7], a[8]);
-
-    b[8] = vsubq_s16(a[7], a[8]);
-    b[9] = vsubq_s16(a[6], a[9]);
-    b[10] = vsubq_s16(a[5], a[10]);
-    b[11] = vsubq_s16(a[4], a[11]);
-    b[12] = vsubq_s16(a[3], a[12]);
-    b[13] = vsubq_s16(a[2], a[13]);
-    b[14] = vsubq_s16(a[1], a[14]);
-    b[15] = vsubq_s16(a[0], a[15]);
-  }
-}
-
-// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
-// because this only adds 1, not 1 << 2.
-static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
-  const int16x8_t one = vdupq_n_s16(1);
-  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
-  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
-  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
-  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
-  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
-  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
-  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
-  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
-  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
-  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
-  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
-  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
-  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
-  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
-  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
-  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
-}
-
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_high_t c, int16x8_t *add,
-                                       int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
-  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
-  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_coef_t c0,
-                                       const tran_coef_t c1, int16x8_t *add,
-                                       int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
-  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
-  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
-  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
-  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
-                                 int16x8_t *b /*[8]*/) {
-  // Swap 16 bit elements.
-  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
-  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
-  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
-  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
-  // Swap 32 bit elements.
-  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
-                                   vreinterpretq_s32_s16(c1.val[0]));
-  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
-                                   vreinterpretq_s32_s16(c1.val[1]));
-  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
-                                   vreinterpretq_s32_s16(c3.val[0]));
-  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
-                                   vreinterpretq_s32_s16(c3.val[1]));
-
-  // Swap 64 bit elements
-  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
-  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
-  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
-  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
-  b[0] = e0.val[0];
-  b[1] = e1.val[0];
-  b[2] = e2.val[0];
-  b[3] = e3.val[0];
-  b[4] = e0.val[1];
-  b[5] = e1.val[1];
-  b[6] = e2.val[1];
-  b[7] = e3.val[1];
-}
-
-// Main body of fdct16x16.
-static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
-  int16x8_t s[8];
-  int16x8_t x[4];
-  int16x8_t step[8];
-
-  // stage 1
-  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
-  // even_results);"
-  s[0] = vaddq_s16(in[0], in[7]);
-  s[1] = vaddq_s16(in[1], in[6]);
-  s[2] = vaddq_s16(in[2], in[5]);
-  s[3] = vaddq_s16(in[3], in[4]);
-  s[4] = vsubq_s16(in[3], in[4]);
-  s[5] = vsubq_s16(in[2], in[5]);
-  s[6] = vsubq_s16(in[1], in[6]);
-  s[7] = vsubq_s16(in[0], in[7]);
-
-  // fdct4(step, step);
-  x[0] = vaddq_s16(s[0], s[3]);
-  x[1] = vaddq_s16(s[1], s[2]);
-  x[2] = vsubq_s16(s[1], s[2]);
-  x[3] = vsubq_s16(s[0], s[3]);
-
-  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
-  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
-  butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
-  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
-  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
-  butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
-
-  // Stage 2
-  // Re-using source s5/s6
-  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
-  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
-  butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
-
-  // Stage 3
-  x[0] = vaddq_s16(s[4], s[5]);
-  x[1] = vsubq_s16(s[4], s[5]);
-  x[2] = vsubq_s16(s[7], s[6]);
-  x[3] = vaddq_s16(s[7], s[6]);
-
-  // Stage 4
-  // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
-  // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
-  butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
-  // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
-  // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
-  butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
-
-  // step 2
-  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
-  // That file distinguished between "in_high" and "step1" but the only
-  // difference is that "in_high" is the first 8 values and "step 1" is the
-  // second. Here, since they are all in one array, "step1" values are += 8.
-
-  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
-  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
-  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
-  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
-  butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
-  butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
-
-  // step 3
-  s[0] = vaddq_s16(in[8], s[3]);
-  s[1] = vaddq_s16(in[9], s[2]);
-  x[0] = vsubq_s16(in[9], s[2]);
-  x[1] = vsubq_s16(in[8], s[3]);
-  x[2] = vsubq_s16(in[15], s[4]);
-  x[3] = vsubq_s16(in[14], s[5]);
-  s[6] = vaddq_s16(in[14], s[5]);
-  s[7] = vaddq_s16(in[15], s[4]);
-
-  // step 4
-  // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
-  // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
-  butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
-
-  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
-  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
-  butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
-
-  // step 5
-  step[0] = vaddq_s16(s[0], s[1]);
-  step[1] = vsubq_s16(s[0], s[1]);
-  step[2] = vaddq_s16(x[1], s[2]);
-  step[3] = vsubq_s16(x[1], s[2]);
-  step[4] = vsubq_s16(x[2], s[5]);
-  step[5] = vaddq_s16(x[2], s[5]);
-  step[6] = vsubq_s16(s[7], s[6]);
-  step[7] = vaddq_s16(s[7], s[6]);
-
-  // step 6
-  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
-  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
-  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
-  // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
-  // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
-  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
-  // cospi_22_64)
-  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
-  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
-  butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
-                      &out[7]);
-  butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
-                      &out[15]);
-  butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
-                      &out[3]);
-  butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
-                      &out[11]);
-}
-
 void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
   int16x8_t temp0[16];
   int16x8_t temp1[16];
@@ -346,12 +37,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
   // Left half.
   load(input, stride, temp0);
   cross_input(temp0, temp1, 0);
-  dct_body(temp1, temp0);
+  vpx_fdct16x16_body(temp1, temp0);

   // Right half.
   load(input + 8, stride, temp1);
   cross_input(temp1, temp2, 0);
-  dct_body(temp2, temp1);
+  vpx_fdct16x16_body(temp2, temp1);

   // Transpose top left and top right quarters into one contiguous location to
   // process to the top half.
@@ -359,7 +50,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
   transpose_8x8(&temp1[0], &temp2[8]);
   partial_round_shift(temp2);
   cross_input(temp2, temp3, 1);
-  dct_body(temp3, temp2);
+  vpx_fdct16x16_body(temp3, temp2);
   transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
                     &temp2[5], &temp2[6], &temp2[7]);
   transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -375,7 +66,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
                     &temp1[13], &temp1[14], &temp1[15]);
   partial_round_shift(temp1);
   cross_input(temp1, temp0, 1);
-  dct_body(temp0, temp1);
+  vpx_fdct16x16_body(temp0, temp1);
   transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
                     &temp1[5], &temp1[6], &temp1[7]);
   transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
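The butterfly helpers moved out of this file carry all of the transform's
fixed-point arithmetic: per lane, butterfly_one_coeff() computes
fdct_round_shift((a +/- b) * c), with vqrshrn_n_s32(x, 14) doing a saturating
rounding narrow by DCT_CONST_BITS. A scalar model of one lane, assuming
DCT_CONST_BITS == 14 as in vpx_dsp/txfm_common.h (saturation omitted for
brevity):

    #include <stdint.h>

    /* Rounding right shift by 14: (x + 2^13) >> 14, i.e. fdct_round_shift(). */
    static int16_t fdct_round_shift14(int32_t x) {
      return (int16_t)((x + (1 << 13)) >> 14);
    }

    static void butterfly_one_coeff_scalar(int16_t a, int16_t b, int32_t c,
                                           int16_t *add, int16_t *sub) {
      *add = fdct_round_shift14((int32_t)a * c + (int32_t)b * c);
      *sub = fdct_round_shift14((int32_t)a * c - (int32_t)b * c);
    }

butterfly_two_coeff() has the same shape, with two constants feeding the
widening multiply-accumulates.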
diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h
new file mode 100644
index 000000000..839123899
--- /dev/null
+++ b/vpx_dsp/arm/fdct16x16_neon.h
@@ -0,0 +1,327 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
+  b[0] = vld1q_s16(a);
+  a += stride;
+  b[1] = vld1q_s16(a);
+  a += stride;
+  b[2] = vld1q_s16(a);
+  a += stride;
+  b[3] = vld1q_s16(a);
+  a += stride;
+  b[4] = vld1q_s16(a);
+  a += stride;
+  b[5] = vld1q_s16(a);
+  a += stride;
+  b[6] = vld1q_s16(a);
+  a += stride;
+  b[7] = vld1q_s16(a);
+  a += stride;
+  b[8] = vld1q_s16(a);
+  a += stride;
+  b[9] = vld1q_s16(a);
+  a += stride;
+  b[10] = vld1q_s16(a);
+  a += stride;
+  b[11] = vld1q_s16(a);
+  a += stride;
+  b[12] = vld1q_s16(a);
+  a += stride;
+  b[13] = vld1q_s16(a);
+  a += stride;
+  b[14] = vld1q_s16(a);
+  a += stride;
+  b[15] = vld1q_s16(a);
+}
+
+// Store 8 16x8 values, assuming stride == 16.
+static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
+  store_s16q_to_tran_low(a, b[0]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[1]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[2]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[3]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[4]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[5]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[6]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[7]);
+}
+
+// Load step of each pass. Add and subtract clear across the input, requiring
+// all 16 values to be loaded. For the first pass it also multiplies by 4.
+
+// To maybe reduce register usage this could be combined with the load() step to
+// get the first 4 and last 4 values, cross those, then load the middle 8 values
+// and cross them.
+static INLINE void cross_input(const int16x8_t *a /*[16]*/,
+                               int16x8_t *b /*[16]*/, const int pass) {
+  if (pass == 0) {
+    b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
+    b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
+    b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
+    b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
+    b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
+    b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
+    b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
+    b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
+
+    b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
+    b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
+    b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
+    b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
+    b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
+    b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
+    b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
+    b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
+  } else {
+    b[0] = vaddq_s16(a[0], a[15]);
+    b[1] = vaddq_s16(a[1], a[14]);
+    b[2] = vaddq_s16(a[2], a[13]);
+    b[3] = vaddq_s16(a[3], a[12]);
+    b[4] = vaddq_s16(a[4], a[11]);
+    b[5] = vaddq_s16(a[5], a[10]);
+    b[6] = vaddq_s16(a[6], a[9]);
+    b[7] = vaddq_s16(a[7], a[8]);
+
+    b[8] = vsubq_s16(a[7], a[8]);
+    b[9] = vsubq_s16(a[6], a[9]);
+    b[10] = vsubq_s16(a[5], a[10]);
+    b[11] = vsubq_s16(a[4], a[11]);
+    b[12] = vsubq_s16(a[3], a[12]);
+    b[13] = vsubq_s16(a[2], a[13]);
+    b[14] = vsubq_s16(a[1], a[14]);
+    b[15] = vsubq_s16(a[0], a[15]);
+  }
+}
+
+// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
+// because this only adds 1, not 1 << 2.
+static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
+  const int16x8_t one = vdupq_n_s16(1);
+  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
+  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
+  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
+  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
+  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
+  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
+  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
+  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
+  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
+  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
+  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
+  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
+  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
+  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
+  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
+  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
+}
+
+// fdct_round_shift((a +/- b) * c)
+static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_high_t c, int16x8_t *add,
+                                       int16x8_t *sub) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
+  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
+  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c0 +/- b * c1)
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_coef_t c0,
+                                       const tran_coef_t c1, int16x8_t *add,
+                                       int16x8_t *sub) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
+  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
+  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
+  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
+  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
+// are all in-place.
+static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
+                                 int16x8_t *b /*[8]*/) {
+  // Swap 16 bit elements.
+  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+  // Swap 32 bit elements.
+  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
+  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+                                   vreinterpretq_s32_s16(c3.val[0]));
+  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+                                   vreinterpretq_s32_s16(c3.val[1]));
+
+  // Swap 64 bit elements
+  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+  b[0] = e0.val[0];
+  b[1] = e1.val[0];
+  b[2] = e2.val[0];
+  b[3] = e3.val[0];
+  b[4] = e0.val[1];
+  b[5] = e1.val[1];
+  b[6] = e2.val[1];
+  b[7] = e3.val[1];
+}
+
+// Main body of fdct16x16.
+static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
+                               int16x8_t *out /*[16]*/) {
+  int16x8_t s[8];
+  int16x8_t x[4];
+  int16x8_t step[8];
+
+  // stage 1
+  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
+  // even_results);"
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
+  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+  butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+
+  // Stage 2
+  // Re-using source s5/s6
+  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+  butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], s[5]);
+  x[1] = vsubq_s16(s[4], s[5]);
+  x[2] = vsubq_s16(s[7], s[6]);
+  x[3] = vaddq_s16(s[7], s[6]);
+
+  // Stage 4
+  // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
+  // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+
+  // step 2
+  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
+  // That file distinguished between "in_high" and "step1" but the only
+  // difference is that "in_high" is the first 8 values and "step 1" is the
+  // second. Here, since they are all in one array, "step1" values are += 8.
+
+  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+  butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+  butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+
+  // step 3
+  s[0] = vaddq_s16(in[8], s[3]);
+  s[1] = vaddq_s16(in[9], s[2]);
+  x[0] = vsubq_s16(in[9], s[2]);
+  x[1] = vsubq_s16(in[8], s[3]);
+  x[2] = vsubq_s16(in[15], s[4]);
+  x[3] = vsubq_s16(in[14], s[5]);
+  s[6] = vaddq_s16(in[14], s[5]);
+  s[7] = vaddq_s16(in[15], s[4]);
+
+  // step 4
+  // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
+  // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
+  butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+
+  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
+  butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+
+  // step 5
+  step[0] = vaddq_s16(s[0], s[1]);
+  step[1] = vsubq_s16(s[0], s[1]);
+  step[2] = vaddq_s16(x[1], s[2]);
+  step[3] = vsubq_s16(x[1], s[2]);
+  step[4] = vsubq_s16(x[2], s[5]);
+  step[5] = vaddq_s16(x[2], s[5]);
+  step[6] = vsubq_s16(s[7], s[6]);
+  step[7] = vaddq_s16(s[7], s[6]);
+
+  // step 6
+  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
+  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
+  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
+  // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
+  // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
+  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
+  // cospi_22_64)
+  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
+  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
+  butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+                      &out[7]);
+  butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+                      &out[15]);
+  butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+                      &out[3]);
+  butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+                      &out[11]);
+}
+
+#endif  // VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_
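One detail in the header above deserves a note: between the two passes,
partial_round_shift() needs (a + 1) >> 2, while the NEON rounding shift
vrshrq_n_s16(a, 2) adds half the shift divisor before shifting, so the add of
1 has to be written out explicitly. A scalar sketch of the two behaviours,
assuming the usual VRSHR rounding semantics:

    #include <stdint.h>

    /* What partial_round_shift() computes per element. */
    static int16_t partial_round_shift_scalar(int16_t a) {
      return (int16_t)((a + 1) >> 2);
    }

    /* What vrshrq_n_s16(a, 2) would compute instead. */
    static int16_t vrshr2_scalar(int16_t a) {
      return (int16_t)((a + 2) >> 2);
    }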
diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c
index 3708cbb11..2827791f1 100644
--- a/vpx_dsp/arm/fdct_neon.c
+++ b/vpx_dsp/arm/fdct_neon.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"

@@ -22,67 +23,25 @@
 void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
                       int stride) {
   int i;
   // input[M * stride] * 16
-  int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
-  int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
-  int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
-  int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+  int16x4_t in[4];
+  in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4);

   // If the very first value != 0, then add 1.
   if (input[0] != 0) {
     const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
-    input_0 = vadd_s16(input_0, one);
+    in[0] = vadd_s16(in[0], one);
   }
-
   for (i = 0; i < 2; ++i) {
-    const int16x8_t input_01 = vcombine_s16(input_0, input_1);
-    const int16x8_t input_32 = vcombine_s16(input_3, input_2);
-
-    // in_0 +/- in_3, in_1 +/- in_2
-    const int16x8_t s_01 = vaddq_s16(input_01, input_32);
-    const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
-    // step_0 +/- step_1, step_2 +/- step_3
-    const int16x4_t s_0 = vget_low_s16(s_01);
-    const int16x4_t s_1 = vget_high_s16(s_01);
-    const int16x4_t s_2 = vget_high_s16(s_32);
-    const int16x4_t s_3 = vget_low_s16(s_32);
-
-    // (s_0 +/- s_1) * cospi_16_64
-    // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
-    const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
-    const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
-    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
-    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
-    // fdct_round_shift
-    int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
-    int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
-    // s_3 * cospi_8_64 + s_2 * cospi_24_64
-    // s_3 * cospi_24_64 - s_2 * cospi_8_64
-    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
-    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
-    const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
-    const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
-    // fdct_round_shift
-    int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
-    int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
-    transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
-    input_0 = out_0;
-    input_1 = out_1;
-    input_2 = out_2;
-    input_3 = out_3;
+    vpx_fdct4x4_pass1_neon(in);
   }
-
   {
     // Not quite a rounding shift. Only add 1 despite shifting by 2.
     const int16x8_t one = vdupq_n_s16(1);
-    int16x8_t out_01 = vcombine_s16(input_0, input_1);
-    int16x8_t out_23 = vcombine_s16(input_2, input_3);
+    int16x8_t out_01 = vcombine_s16(in[0], in[1]);
+    int16x8_t out_23 = vcombine_s16(in[2], in[3]);
     out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
     out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
     store_s16q_to_tran_low(final_output + 0 * 8, out_01);
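The column pass that vpx_fdct4x4_neon() now delegates to is the standard
4-point fDCT butterfly, vectorized four columns at a time. A scalar model of
one column, using the Q14 constants from vpx_dsp/txfm_common.h (values quoted
from memory, so treat them as assumptions: cospi_16_64 = 11585,
cospi_8_64 = 15137, cospi_24_64 = 6270):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define ROUND_POWER_OF_TWO(v, n) (((v) + (1 << ((n)-1))) >> (n))

    static void fdct4_1d_scalar(const int16_t in[4], int16_t out[4]) {
      const int32_t s0 = in[0] + in[3], s1 = in[1] + in[2];
      const int32_t s2 = in[1] - in[2], s3 = in[0] - in[3];
      out[0] = (int16_t)ROUND_POWER_OF_TWO((s0 + s1) * 11585, DCT_CONST_BITS);
      out[2] = (int16_t)ROUND_POWER_OF_TWO((s0 - s1) * 11585, DCT_CONST_BITS);
      out[1] = (int16_t)ROUND_POWER_OF_TWO(s3 * 15137 + s2 * 6270, DCT_CONST_BITS);
      out[3] = (int16_t)ROUND_POWER_OF_TWO(s3 * 6270 - s2 * 15137, DCT_CONST_BITS);
    }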
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
new file mode 100644
index 000000000..28d7d86bf
--- /dev/null
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -0,0 +1,213 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // (s_0 +/- s_1) * cospi_16_64
+  // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+  const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+  const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+  const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+  const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+
+  // fdct_round_shift
+  int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+  int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+  const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+
+  const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+  const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+
+  // fdct_round_shift
+  int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+  int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+  transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+  in[0] = out_0;
+  in[1] = out_1;
+  in[2] = out_2;
+  in[3] = out_3;
+}
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
+  const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
+  const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
+  const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
+  const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
+  const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
+  const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
+  const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+  int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+  int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+  int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+  // fdct4(step, step);
+  int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+  int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+  int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+  int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+  int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
+  int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
+  int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
+  int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
+  v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
+  v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
+  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
+  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
+  v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+  v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+  v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+  v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+    out[0] = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
+    out[2] = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
+    out[4] = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
+    out[6] = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
+  }
+  // Stage 2
+  v_x0 = vsubq_s16(v_s6, v_s5);
+  v_x1 = vaddq_s16(v_s6, v_s5);
+  v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
+  v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
+  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
+  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+    const int16x8_t ab = vcombine_s16(a, b);
+    const int16x8_t cd = vcombine_s16(c, d);
+    // Stage 3
+    v_x0 = vaddq_s16(v_s4, ab);
+    v_x1 = vsubq_s16(v_s4, ab);
+    v_x2 = vsubq_s16(v_s7, cd);
+    v_x3 = vaddq_s16(v_s7, cd);
+  }
+  // Stage 4
+  v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
+  v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
+  v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
+  v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
+  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
+  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
+  v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
+  v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
+  v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
+  v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
+  v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
+  v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
+  v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
+  v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
+  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
+  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+    out[1] = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
+    out[3] = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
+    out[5] = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
+    out[7] = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
+  }
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass1_notranspose_neon(in, out);
+  // transpose 8x8
+  // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
+  // columns.
+  {
+    // 00 01 02 03 40 41 42 43
+    // 10 11 12 13 50 51 52 53
+    // 20 21 22 23 60 61 62 63
+    // 30 31 32 33 70 71 72 73
+    // 04 05 06 07 44 45 46 47
+    // 14 15 16 17 54 55 56 57
+    // 24 25 26 27 64 65 66 67
+    // 34 35 36 37 74 75 76 77
+    const int32x4x2_t r02_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
+    const int32x4x2_t r13_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
+    const int32x4x2_t r46_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
+    const int32x4x2_t r57_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
+    const int16x8x2_t r01_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+                  vreinterpretq_s16_s32(r13_s32.val[0]));
+    const int16x8x2_t r23_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+                  vreinterpretq_s16_s32(r13_s32.val[1]));
+    const int16x8x2_t r45_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+                  vreinterpretq_s16_s32(r57_s32.val[0]));
+    const int16x8x2_t r67_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+                  vreinterpretq_s16_s32(r57_s32.val[1]));
+    in[0] = r01_s16.val[0];
+    in[1] = r01_s16.val[1];
+    in[2] = r23_s16.val[0];
+    in[3] = r23_s16.val[1];
+    in[4] = r45_s16.val[0];
+    in[5] = r45_s16.val[1];
+    in[6] = r67_s16.val[0];
+    in[7] = r67_s16.val[1];
+    // 00 10 20 30 40 50 60 70
+    // 01 11 21 31 41 51 61 71
+    // 02 12 22 32 42 52 62 72
+    // 03 13 23 33 43 53 63 73
+    // 04 14 24 34 44 54 64 74
+    // 05 15 25 35 45 55 65 75
+    // 06 16 26 36 46 56 66 76
+    // 07 17 27 37 47 57 67 77
+  }
+}
+#endif  // VPX_VPX_DSP_ARM_FDCT_NEON_H_
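Because vpx_fdct8x8_pass1_neon() finishes with a transpose, applying it twice
performs the row pass and then the column pass; that is exactly how the
refactored vpx_fdct8x8_neon() below uses it. A minimal sketch mirroring that
loop:

    #include <arm_neon.h>
    #include "vpx_dsp/arm/fdct_neon.h"

    /* Two identical passes: the transpose at the end of pass 1 turns the
       second application into the column transform. */
    static void fdct8x8_two_pass(int16x8_t in[8]) {
      int i;
      for (i = 0; i < 2; ++i) vpx_fdct8x8_pass1_neon(in);
    }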
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c
index 374a262b9..d9161c6d3 100644
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -15,196 +15,54 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/mem_neon.h"

 void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
                       int stride) {
   int i;
   // stage 1
-  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
-  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
-  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
-  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
-  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
-  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
-  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
-  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+  int16x8_t in[8];
+  in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
   for (i = 0; i < 2; ++i) {
-    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
-    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
-    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
-    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
-    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
-    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
-    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
-    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
-    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
-    // fdct4(step, step);
-    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-    // fdct4(step, step);
-    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-    }
-    // Stage 2
-    v_x0 = vsubq_s16(v_s6, v_s5);
-    v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x8_t ab = vcombine_s16(a, b);
-      const int16x8_t cd = vcombine_s16(c, d);
-      // Stage 3
-      v_x0 = vaddq_s16(v_s4, ab);
-      v_x1 = vsubq_s16(v_s4, ab);
-      v_x2 = vsubq_s16(v_s7, cd);
-      v_x3 = vaddq_s16(v_s7, cd);
-    }
-    // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-    }
-    // transpose 8x8
-    // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
-    // columns.
-    {
-      // 00 01 02 03 40 41 42 43
-      // 10 11 12 13 50 51 52 53
-      // 20 21 22 23 60 61 62 63
-      // 30 31 32 33 70 71 72 73
-      // 04 05 06 07 44 45 46 47
-      // 14 15 16 17 54 55 56 57
-      // 24 25 26 27 64 65 66 67
-      // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
-      const int16x8x2_t r01_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                    vreinterpretq_s16_s32(r13_s32.val[0]));
-      const int16x8x2_t r23_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                    vreinterpretq_s16_s32(r13_s32.val[1]));
-      const int16x8x2_t r45_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                    vreinterpretq_s16_s32(r57_s32.val[0]));
-      const int16x8x2_t r67_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                    vreinterpretq_s16_s32(r57_s32.val[1]));
-      input_0 = r01_s16.val[0];
-      input_1 = r01_s16.val[1];
-      input_2 = r23_s16.val[0];
-      input_3 = r23_s16.val[1];
-      input_4 = r45_s16.val[0];
-      input_5 = r45_s16.val[1];
-      input_6 = r67_s16.val[0];
-      input_7 = r67_s16.val[1];
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
+    vpx_fdct8x8_pass1_neon(in);
   }  // for
   {
     // from vpx_dct_sse2.c
     // Post-condition (division by two)
     //    division of two 16 bits signed numbers using shifts
     //    n / 2 = (n - (n >> 15)) >> 1
-    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
-    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
-    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
-    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
-    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
-    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
-    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
-    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
-    input_0 = vhsubq_s16(input_0, sign_in0);
-    input_1 = vhsubq_s16(input_1, sign_in1);
-    input_2 = vhsubq_s16(input_2, sign_in2);
-    input_3 = vhsubq_s16(input_3, sign_in3);
-    input_4 = vhsubq_s16(input_4, sign_in4);
-    input_5 = vhsubq_s16(input_5, sign_in5);
-    input_6 = vhsubq_s16(input_6, sign_in6);
-    input_7 = vhsubq_s16(input_7, sign_in7);
+    const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+    in[0] = vhsubq_s16(in[0], sign_in0);
+    in[1] = vhsubq_s16(in[1], sign_in1);
+    in[2] = vhsubq_s16(in[2], sign_in2);
+    in[3] = vhsubq_s16(in[3], sign_in3);
+    in[4] = vhsubq_s16(in[4], sign_in4);
+    in[5] = vhsubq_s16(in[5], sign_in5);
+    in[6] = vhsubq_s16(in[6], sign_in6);
+    in[7] = vhsubq_s16(in[7], sign_in7);
     // store results
-    store_s16q_to_tran_low(final_output + 0 * 8, input_0);
-    store_s16q_to_tran_low(final_output + 1 * 8, input_1);
-    store_s16q_to_tran_low(final_output + 2 * 8, input_2);
-    store_s16q_to_tran_low(final_output + 3 * 8, input_3);
-    store_s16q_to_tran_low(final_output + 4 * 8, input_4);
-    store_s16q_to_tran_low(final_output + 5 * 8, input_5);
-    store_s16q_to_tran_low(final_output + 6 * 8, input_6);
-    store_s16q_to_tran_low(final_output + 7 * 8, input_7);
+    store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+    store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+    store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+    store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+    store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+    store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+    store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+    store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
   }
 }
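The post-condition block above halves every coefficient with the identity
n / 2 == (n - (n >> 15)) >> 1, which vhsubq_s16() implements in one
instruction: subtracting the sign bit (n >> 15 is 0 or -1) first makes
negative values round toward zero like C division, rather than toward minus
infinity like a plain arithmetic shift. A small self-check over all int16_t
values (assumes arithmetic right shift of negative values, as on the targets
this code runs on):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int32_t n;
      for (n = -32768; n <= 32767; ++n) {
        const int16_t v = (int16_t)n;
        const int16_t halved = (int16_t)((v - (v >> 15)) >> 1);
        assert(halved == v / 2);
      }
      printf("n/2 identity holds for all int16_t values\n");
      return 0;
    }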
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 752308160..c098ad31b 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -1184,6 +1184,45 @@ static INLINE void transpose_u8_16x16(
   *o15 = e7.val[1];
 }

+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+  int16x8_t t[8];
+
+  // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3.
+  t[0] = in0[8];
+  t[1] = in0[9];
+  t[2] = in0[10];
+  t[3] = in0[11];
+  t[4] = in0[12];
+  t[5] = in0[13];
+  t[6] = in0[14];
+  t[7] = in0[15];
+  in0[8] = in1[0];
+  in0[9] = in1[1];
+  in0[10] = in1[2];
+  in0[11] = in1[3];
+  in0[12] = in1[4];
+  in0[13] = in1[5];
+  in0[14] = in1[6];
+  in0[15] = in1[7];
+  in1[0] = t[0];
+  in1[1] = t[1];
+  in1[2] = t[2];
+  in1[3] = t[3];
+  in1[4] = t[4];
+  in1[5] = t[5];
+  in1[6] = t[6];
+  in1[7] = t[7];
+
+  transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
+                    &in0[6], &in0[7]);
+  transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
+                    &in0[14], &in0[15]);
+  transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
+                    &in1[6], &in1[7]);
+  transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
+                    &in1[14], &in1[15]);
+}
+
 static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                              const int a_stride, uint8x8_t *a0,
                                              uint8x8_t *a1, uint8x8_t *a2,
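
transpose_s16_16x16() above is built on the block-matrix identity for a 16x16
matrix split into 8x8 quadrants:

    [ A  B ]^T   [ A^T  C^T ]
    [ C  D ]   = [ B^T  D^T ]

in0 holds the left 16x8 half (A above C) and in1 the right half (B above D),
so swapping the off-diagonal quadrants (in0[8..15] with in1[0..7]) and then
transposing each 8x8 quadrant in place with transpose_s16_8x8() produces the
full 16x16 transpose.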