diff options
author | Gerda Zsejke More <gerdazsejke.more@arm.com> | 2023-01-26 16:12:55 +0100 |
---|---|---|
committer | Gerda Zsejke More <gerdazsejke.more@arm.com> | 2023-01-27 17:13:30 +0100 |
commit | 5e92d6d103e923a28cd56ee2c7efd6b48a0611b4 (patch) | |
tree | 9c4cbc429117e550b3db41c8deb487cab8cbfb07 | |
parent | 72cfcdd95ab0d17c4b8b13d9da00d1458105bf80 (diff) | |
download | libvpx-5e92d6d103e923a28cd56ee2c7efd6b48a0611b4.tar libvpx-5e92d6d103e923a28cd56ee2c7efd6b48a0611b4.tar.gz libvpx-5e92d6d103e923a28cd56ee2c7efd6b48a0611b4.tar.bz2 libvpx-5e92d6d103e923a28cd56ee2c7efd6b48a0611b4.zip |
Refactor 8x8 16-bit Neon transpose functions
Refactor the Neon implementation of transpose_s16_8x8(q) and
transpose_u16_8x8 so that the final step compiles to 8 ZIP1/ZIP2
instructions as opposed to 8 EXT, MOV pairs. This change removes 8
instructions per call to transpose_s16_8x8(q) and transpose_u16_8x8
in cases where the result stays in registers for further processing
- rather than being stored to memory - as in vpx_hadamard_8x8_neon,
for example.
This is a backport of this libaom patch[1].
[1] https://aomedia-review.googlesource.com/c/aom/+/169426
Change-Id: Icef3e51d40efeca7008e1c4fc701bf39bd319c88
-rw-r--r-- | vpx_dsp/arm/fdct16x16_neon.c | 6 | ||||
-rw-r--r-- | vpx_dsp/arm/fdct32x32_neon.c | 64 | ||||
-rw-r--r-- | vpx_dsp/arm/transpose_neon.h | 114 |
3 files changed, 118 insertions, 66 deletions
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index a458ecaa4..0628acb75 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -47,8 +47,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose top left and top right quarters into one contiguous location to // process to the top half. - transpose_s16_8x8_new(&temp0[0], &temp2[0]); - transpose_s16_8x8_new(&temp1[0], &temp2[8]); + transpose_s16_8x8q(&temp0[0], &temp2[0]); + transpose_s16_8x8q(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); vpx_fdct8x16_body(temp3, temp2); @@ -62,7 +62,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. - transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8q(&temp0[8], &temp1[0]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index d6818d2ec..a91730ce8 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -60,10 +60,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. 
- transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -78,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. - transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -96,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -114,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. 
- transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -159,10 +159,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -177,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. 
- transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -195,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -213,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. 
- transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 41d44f2b1..9d1313250 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -23,10 +23,17 @@ // b0.val[1]: 04 05 06 07 20 21 22 23 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_s16_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s16_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), vreinterpret_s16_s32(vget_high_s32(a1))); +#endif return b0; } @@ -57,10 +64,17 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), vreinterpret_u16_u32(vget_high_u32(a1))); +#endif return b0; } @@ -569,37 +583,73 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t 
*a2, } // Transpose 8x8 to a new location. -static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. - const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; +static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = 
vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; } static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, @@ -658,6 +708,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); @@ -729,6 +780,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); |