summaryrefslogtreecommitdiff
path: root/vpx_dsp/arm/transpose_neon.h
diff options
context:
space:
mode:
author: Konstantinos Margaritis <konma@vectorcamp.gr> 2022-03-11 20:19:25 +0200
committer: Konstantinos Margaritis <konma@vectorcamp.gr> 2022-03-17 13:07:12 +0200
commit: f79d256cb28a4228df66a7a6d1cebbd9071e0639 (patch)
tree: af6e7e70ddb165208e3a7ad22132ac48f69e1b21 /vpx_dsp/arm/transpose_neon.h
parent: 8a50f70ffc5eea6c2392a5c176bfe43e450ecebc (diff)
downloadlibvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.tar
libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.tar.gz
libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.tar.bz2
libvpx-f79d256cb28a4228df66a7a6d1cebbd9071e0639.zip
Make sure only NEON FDCT functions are called.
[NEON]
Added vpx_fdct4x4_pass1_neon(),
Added vpx_fdct8x8_pass1_notranspose_neon(),
Added vpx_fdct8x8_pass1_neon() to avoid code duplication
Refactored vpx_fdct4x4_neon() and vpx_fdct8x8_neon() to use the above
Rename dct_body to vpx_fdct16x16_body to reuse later
Add transpose_s16_16x16()

I have run make test and all tests/configurations seem to pass.

Profiled using this command on an Ampere Altra VM:

sudo perf record -g ./vpxenc --codec=vp9 --height=1080 --width=1920 \
    --fps=25/1 --limit=20 -o output.mkv \
    ../original_videos_Sports_1080P_Sports_1080P-0063.mkv --debug --rt

Before this optimization:

1.32% 1.32% vpxenc vpxenc [.] vpx_fdct4x4_neon
0.16% 0.16% vpxenc vpxenc [.] vpx_fdct4x4_c
0.79% 0.79% vpxenc vpxenc [.] vpx_fdct8x8_c
0.52% 0.52% vpxenc vpxenc [.] vpx_fdct8x8_neon
1.23% 1.23% vpxenc vpxenc [.] vpx_fdct16x16_c
0.54% 0.54% vpxenc vpxenc [.] vpx_fdct16x16_neon

So, even though a _neon() version exists, the C version was called as well.

After this patch:

1.42% 1.36% vpxenc vpxenc [.] vpx_fdct4x4_neon
0.87% 0.82% vpxenc vpxenc [.] vpx_fdct8x8_neon
0.74% 0.74% vpxenc vpxenc [.] vpx_fdct16x16_neon

Change-Id: Id4e1dd315c67b4355fe4e5a1b59e181a349f16d0
Diffstat (limited to 'vpx_dsp/arm/transpose_neon.h')
-rw-r--r-- vpx_dsp/arm/transpose_neon.h 39
1 files changed, 39 insertions, 0 deletions
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 752308160..c098ad31b 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -1184,6 +1184,45 @@ static INLINE void transpose_u8_16x16(
*o15 = e7.val[1];
}
+// In-place transpose of the 16x16 int16 block stored as in0[0..15] and
+// in1[0..15]; both arrays are overwritten with the result.
+// NOTE(review): presumably in0[r] / in1[r] hold the left / right eight values
+// of row r -- confirm against callers.
+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+  int16x8_t saved[8];
+  int i;
+
+  // Swap quadrants 2 and 3 first: in0[8..15] trades places with in1[0..7].
+  // The copy is done in three passes (save, overwrite, restore).
+  for (i = 0; i < 8; ++i) {
+    saved[i] = in0[8 + i];
+  }
+  for (i = 0; i < 8; ++i) {
+    in0[8 + i] = in1[i];
+  }
+  for (i = 0; i < 8; ++i) {
+    in1[i] = saved[i];
+  }
+
+  // With the quadrants in place, transpose each 8x8 quadrant separately.
+  transpose_s16_8x8(in0 + 0, in0 + 1, in0 + 2, in0 + 3, in0 + 4, in0 + 5,
+                    in0 + 6, in0 + 7);
+  transpose_s16_8x8(in0 + 8, in0 + 9, in0 + 10, in0 + 11, in0 + 12, in0 + 13,
+                    in0 + 14, in0 + 15);
+  transpose_s16_8x8(in1 + 0, in1 + 1, in1 + 2, in1 + 3, in1 + 4, in1 + 5,
+                    in1 + 6, in1 + 7);
+  transpose_s16_8x8(in1 + 8, in1 + 9, in1 + 10, in1 + 11, in1 + 12, in1 + 13,
+                    in1 + 14, in1 + 15);
+}
+
static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
const int a_stride, uint8x8_t *a0,
uint8x8_t *a1, uint8x8_t *a2,