Make sure only NEON FDCT functions are called.

[NEON]
Added vpx_fdct4x4_pass1_neon(),
Added vpx_fdct8x8_pass1_notranspose_neon(),
Added vpx_fdct8x8_pass1_neon() to avoid code duplication
Refactored vpx_fdct4x4_neon() and vpx_fdct8x8_neon() to use the above
Rename dct_body to vpx_fdct16x16_body to reuse later
Add transpose_s16_16x16()

I have run make test and all tests/configurations seem to pass.

Profiled using this command on an Ampere Altra VM:

sudo perf record -g ./vpxenc --codec=vp9 --height=1080 --width=1920 \
  --fps=25/1 --limit=20 -o output.mkv \
  ../original_videos_Sports_1080P_Sports_1080P-0063.mkv --debug --rt

Before this optimization:

1.32% 1.32% vpxenc vpxenc [.] vpx_fdct4x4_neon
0.16% 0.16% vpxenc vpxenc [.] vpx_fdct4x4_c
0.79% 0.79% vpxenc vpxenc [.] vpx_fdct8x8_c
0.52% 0.52% vpxenc vpxenc [.] vpx_fdct8x8_neon
1.23% 1.23% vpxenc vpxenc [.] vpx_fdct16x16_c
0.54% 0.54% vpxenc vpxenc [.] vpx_fdct16x16_neon

So, even though a _neon() version exists, the C version was called as well.

After this patch:

1.42% 1.36% vpxenc vpxenc [.] vpx_fdct4x4_neon
0.87% 0.82% vpxenc vpxenc [.] vpx_fdct8x8_neon
0.74% 0.74% vpxenc vpxenc [.] vpx_fdct16x16_neon

Change-Id: Id4e1dd315c67b4355fe4e5a1b59e181a349f16d0
author: Konstantinos Margaritis <konma@vectorcamp.gr> 2022-03-11 20:19:25 +0200
committer: Konstantinos Margaritis <konma@vectorcamp.gr> 2022-03-17 13:07:12 +0200
commit: f79d256cb28a4228df66a7a6d1cebbd9071e0639 (patch)
tree: af6e7e70ddb165208e3a7ad22132ac48f69e1b21 /vpx_dsp/arm/transpose_neon.h
parent: 8a50f70ffc5eea6c2392a5c176bfe43e450ecebc (diff)
download: libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.tar
libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.tar.gz
libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.tar.bz2
libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.zip
1 files changed, 39 insertions, 0 deletions
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 752308160..c098ad31b 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -1184,6 +1184,45 @@ static INLINE void transpose_u8_16x16(
   *o15 = e7.val[1];
 }
 
+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+  int16x8_t t[8];  // holds the old in0[8..15] while it is overwritten
+  // Transposes in place: in0[0..15] and in1[0..15] are handled as four 8x8
+  // quadrants. First swap quadrants 2 and 3, i.e. in0[8..15] <-> in1[0..7].
+  t[0] = in0[8];
+  t[1] = in0[9];
+  t[2] = in0[10];
+  t[3] = in0[11];
+  t[4] = in0[12];
+  t[5] = in0[13];
+  t[6] = in0[14];
+  t[7] = in0[15];
+  in0[8] = in1[0];
+  in0[9] = in1[1];
+  in0[10] = in1[2];
+  in0[11] = in1[3];
+  in0[12] = in1[4];
+  in0[13] = in1[5];
+  in0[14] = in1[6];
+  in0[15] = in1[7];
+  in1[0] = t[0];
+  in1[1] = t[1];
+  in1[2] = t[2];
+  in1[3] = t[3];
+  in1[4] = t[4];
+  in1[5] = t[5];
+  in1[6] = t[6];
+  in1[7] = t[7];
+  // Then transpose each of the four 8x8 quadrants separately.
+  transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
+                    &in0[6], &in0[7]);
+  transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
+                    &in0[14], &in0[15]);
+  transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
+                    &in1[6], &in1[7]);
+  transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
+                    &in1[14], &in1[15]);
+}
+
 static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                              const int a_stride, uint8x8_t *a0,
                                              uint8x8_t *a1, uint8x8_t *a2,
author	Konstantinos Margaritis <konma@vectorcamp.gr>	2022-03-11 20:19:25 +0200
committer	Konstantinos Margaritis <konma@vectorcamp.gr>	2022-03-17 13:07:12 +0200
commit	f79d256cb28a4228df66a7a6d1cebbd9071e0639 (patch)
tree	af6e7e70ddb165208e3a7ad22132ac48f69e1b21 /vpx_dsp/arm/transpose_neon.h
parent	8a50f70ffc5eea6c2392a5c176bfe43e450ecebc (diff)
download	libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.tar libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.tar.gz libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.tar.bz2 libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.zip