[NEON] move transpose_8x8 to reuse

Change-Id: I3915b6c9971aedaac9c23f21fdb88bc271216208
author: Konstantinos Margaritis <konma@vectorcamp.gr> 2022-10-06 10:58:27 +0000
committer: Konstantinos Margaritis <konma@vectorcamp.gr> 2022-10-10 18:43:27 +0000
commit: 6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba (patch)
tree: 6965d09d05ea574a757f8e7debd3945d7ec4df86
parent: 46bd6574aa3fe43c8ca4f846514bbfb39d04440c (diff)
download: libvpx-6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba.tar
libvpx-6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba.tar.gz
libvpx-6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba.tar.bz2
libvpx-6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba.zip
3 files changed, 37 insertions, 39 deletions
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c
index 67f43246a..5cccb6a64 100644
--- a/vpx_dsp/arm/fdct16x16_neon.c
+++ b/vpx_dsp/arm/fdct16x16_neon.c
@@ -46,8 +46,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
 
   // Transpose top left and top right quarters into one contiguous location to
   // process to the top half.
-  transpose_8x8(&temp0[0], &temp2[0]);
-  transpose_8x8(&temp1[0], &temp2[8]);
+  transpose_s16_8x8_new(&temp0[0], &temp2[0]);
+  transpose_s16_8x8_new(&temp1[0], &temp2[8]);
   partial_round_shift(temp2);
   cross_input(temp2, temp3, 1);
   vpx_fdct16x16_body(temp3, temp2);
@@ -61,7 +61,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
 
   // Transpose bottom left and bottom right quarters into one contiguous
   // location to process to the bottom half.
-  transpose_8x8(&temp0[8], &temp1[0]);
+  transpose_s16_8x8_new(&temp0[8], &temp1[0]);
   transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
                     &temp1[13], &temp1[14], &temp1[15]);
   partial_round_shift(temp1);
diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h
index 0dd21153f..5ce74cdf4 100644
--- a/vpx_dsp/arm/fdct16x16_neon.h
+++ b/vpx_dsp/arm/fdct16x16_neon.h
@@ -174,42 +174,6 @@ static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
   *sub = vcombine_s16(rounded2, rounded3);
 }
 
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
-                                 int16x8_t *b /*[8]*/) {
-  // Swap 16 bit elements.
-  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
-  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
-  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
-  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
-  // Swap 32 bit elements.
-  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
-                                   vreinterpretq_s32_s16(c1.val[0]));
-  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
-                                   vreinterpretq_s32_s16(c1.val[1]));
-  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
-                                   vreinterpretq_s32_s16(c3.val[0]));
-  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
-                                   vreinterpretq_s32_s16(c3.val[1]));
-
-  // Swap 64 bit elements
-  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
-  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
-  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
-  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
-  b[0] = e0.val[0];
-  b[1] = e1.val[0];
-  b[2] = e2.val[0];
-  b[3] = e3.val[0];
-  b[4] = e0.val[1];
-  b[5] = e1.val[1];
-  b[6] = e2.val[1];
-  b[7] = e3.val[1];
-}
-
 // Main body of fdct16x16.
 static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
                                int16x8_t *out /*[16]*/) {
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index c098ad31b..bf06d6abe 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -568,6 +568,40 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
   *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
 }
 
+// Transpose the 8x8 int16 block in a[0..7] into b[0..7] (not in place).
+static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) {
+  // Swap 16 bit elements within each pair of adjacent input rows.
+  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+  // Swap 32 bit elements, pairing c0 with c1 and c2 with c3.
+  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
+  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+                                   vreinterpretq_s32_s16(c3.val[0]));
+  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+                                   vreinterpretq_s32_s16(c3.val[1]));
+
+  // Swap 64 bit elements, pairing d0/d1 with d2/d3.
+  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+  b[0] = e0.val[0];
+  b[1] = e1.val[0];
+  b[2] = e2.val[0];
+  b[3] = e3.val[0];
+  b[4] = e0.val[1];
+  b[5] = e1.val[1];
+  b[6] = e2.val[1];
+  b[7] = e3.val[1];
+}
+
 static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                      int16x8_t *a2, int16x8_t *a3,
                                      int16x8_t *a4, int16x8_t *a5,
author	Konstantinos Margaritis <konma@vectorcamp.gr>	2022-10-06 10:58:27 +0000
committer	Konstantinos Margaritis <konma@vectorcamp.gr>	2022-10-10 18:43:27 +0000
commit	6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba (patch)
tree	6965d09d05ea574a757f8e7debd3945d7ec4df86
parent	46bd6574aa3fe43c8ca4f846514bbfb39d04440c (diff)
download	libvpx-6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba.tar libvpx-6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba.tar.gz libvpx-6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba.tar.bz2 libvpx-6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba.zip