[NEON] Optimize and homogenize Butterfly DCT functions

Provide a set of commonly used Butterfly DCT functions for use in the DCT 4x4, 8x8, 16x16 and 32x32 functions. These are provided in various forms, using vqrdmulh_s16/vqrdmulh_s32 for the _fast variants, which unfortunately are only usable in pass1 of most DCTs, as they do not provide the necessary precision in pass2. This gave a performance gain ranging from 5% to 15% in the 16x16 case. Also, for 32x32, the loads were rearranged; together with the butterfly optimizations, this gave a 10% gain in the 32x32_rd function. This refactoring was necessary to allow easier porting of the highbd 32x32 functions, which follows this patchset. Change-Id: I6282e640b95a95938faff76c3b2bace3dc298bc3
author: Konstantinos Margaritis <konstantinos@vectorcamp.gr> 2022-10-26 21:37:31 +0000
committer: Konstantinos Margaritis <konstantinos@vectorcamp.gr> 2022-11-01 23:07:27 +0000
commit: 3121783fec60d0ce4551d472d1acbd1f1a8253be (patch)
tree: 95a53f73adccf711003346860021ddf9ed1a2e0f /vpx_dsp/arm/transpose_neon.h
parent: dcb566e69f03eb046180dabf41c4118b249af96f (diff)
download: libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar
libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.gz
libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.bz2
libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.zip
1 files changed, 45 insertions, 0 deletions
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index bf06d6abe..41d44f2b1 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -821,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
   a7->val[1] = c7.val[1];
 }
 
+// Helper transpose function for highbd FDCT variants: wraps transpose_s32_8x8
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+                                       int32x4_t *right /*[8]*/,
+                                       int32x4_t *out_left /*[8]*/,
+                                       int32x4_t *out_right /*[8]*/) {
+  int32x4x2_t out[8];  // out[i] pairs left[i] (val[0]) with right[i] (val[1])
+
+  out[0].val[0] = left[0];  // pack the two input arrays into out[]
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);  // results are read back from out[]
+
+  out_left[0] = out[0].val[0];  // val[0] of each element goes to out_left
+  out_left[1] = out[1].val[0];
+  out_left[2] = out[2].val[0];
+  out_left[3] = out[3].val[0];
+  out_left[4] = out[4].val[0];
+  out_left[5] = out[5].val[0];
+  out_left[6] = out[6].val[0];
+  out_left[7] = out[7].val[0];
+  out_right[0] = out[0].val[1];  // val[1] of each element goes to out_right
+  out_right[1] = out[1].val[1];
+  out_right[2] = out[2].val[1];
+  out_right[3] = out[3].val[1];
+  out_right[4] = out[4].val[1];
+  out_right[5] = out[5].val[1];
+  out_right[6] = out[6].val[1];
+  out_right[7] = out[7].val[1];
+}
+
 static INLINE void transpose_u8_16x8(
     const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
     const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
author	Konstantinos Margaritis <konstantinos@vectorcamp.gr>	2022-10-26 21:37:31 +0000
committer	Konstantinos Margaritis <konstantinos@vectorcamp.gr>	2022-11-01 23:07:27 +0000
commit	3121783fec60d0ce4551d472d1acbd1f1a8253be (patch)
tree	95a53f73adccf711003346860021ddf9ed1a2e0f /vpx_dsp/arm/transpose_neon.h
parent	dcb566e69f03eb046180dabf41c4118b249af96f (diff)
download	libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.gz libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.bz2 libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.zip