summary | refs | log | tree | commit | diff
path: root/vpx_dsp/arm/fdct16x16_neon.c
diff options
context:
space:
mode:
author: Konstantinos Margaritis <konstantinos@vectorcamp.gr> 2022-10-26 21:37:31 +0000
committer: Konstantinos Margaritis <konstantinos@vectorcamp.gr> 2022-11-01 23:07:27 +0000
commit: 3121783fec60d0ce4551d472d1acbd1f1a8253be (patch)
tree: 95a53f73adccf711003346860021ddf9ed1a2e0f /vpx_dsp/arm/fdct16x16_neon.c
parent: dcb566e69f03eb046180dabf41c4118b249af96f (diff)
downloadlibvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar
libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.gz
libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.bz2
libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.zip
[NEON] Optimize and homogenize Butterfly DCT functions
Provide a set of commonly used butterfly DCT functions for use in the 4x4, 8x8, 16x16, and 32x32 DCT functions. They are provided in several forms; the _fast variants use vqrdmulh_s16/vqrdmulh_s32, which unfortunately are only usable in pass 1 of most DCTs because they do not provide the precision required in pass 2. This gave a performance gain ranging from 5% to 15% in the 16x16 case. For 32x32, the loads were also rearranged; together with the butterfly optimizations, this gave a 10% gain in the 32x32_rd function. This refactoring was necessary to allow easier porting of the highbd 32x32 functions, which follows this patchset. Change-Id: I6282e640b95a95938faff76c3b2bace3dc298bc3
Diffstat (limited to 'vpx_dsp/arm/fdct16x16_neon.c')
-rw-r--r-- vpx_dsp/arm/fdct16x16_neon.c 18
1 files changed, 10 insertions, 8 deletions
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c
index d0c07d429..a458ecaa4 100644
--- a/vpx_dsp/arm/fdct16x16_neon.c
+++ b/vpx_dsp/arm/fdct16x16_neon.c
@@ -37,20 +37,21 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Left half.
load_cross(input, stride, temp0);
scale_input(temp0, temp1);
- vpx_fdct16x16_body(temp1, temp0);
+ vpx_fdct8x16_body(temp1, temp0);
// Right half.
load_cross(input + 8, stride, temp1);
scale_input(temp1, temp2);
- vpx_fdct16x16_body(temp2, temp1);
+ vpx_fdct8x16_body(temp2, temp1);
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
+
transpose_s16_8x8_new(&temp0[0], &temp2[0]);
transpose_s16_8x8_new(&temp1[0], &temp2[8]);
partial_round_shift(temp2);
cross_input(temp2, temp3);
- vpx_fdct16x16_body(temp3, temp2);
+ vpx_fdct8x16_body(temp3, temp2);
transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
&temp2[5], &temp2[6], &temp2[7]);
transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -62,11 +63,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Transpose bottom left and bottom right quarters into one contiguous
// location to process to the bottom half.
transpose_s16_8x8_new(&temp0[8], &temp1[0]);
+
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
&temp1[13], &temp1[14], &temp1[15]);
partial_round_shift(temp1);
cross_input(temp1, temp0);
- vpx_fdct16x16_body(temp0, temp1);
+ vpx_fdct8x16_body(temp0, temp1);
transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
&temp1[5], &temp1[6], &temp1[7]);
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
@@ -86,12 +88,12 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
// Left half.
load_cross(input, stride, temp0);
highbd_scale_input(temp0, left1, right1);
- vpx_highbd_fdct16x16_body(left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
// right half.
load_cross(input + 8, stride, temp0);
highbd_scale_input(temp0, left2, right2);
- vpx_highbd_fdct16x16_body(left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
@@ -103,14 +105,14 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
highbd_partial_round_shift(left3, right3);
highbd_cross_input(left3, right3, left1, right1);
- vpx_highbd_fdct16x16_body(left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
// Transpose bottom left and bottom right quarters into one contiguous
// location to process to the bottom half.
highbd_partial_round_shift(left4, right4);
highbd_cross_input(left4, right4, left2, right2);
- vpx_highbd_fdct16x16_body(left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
transpose_s32_8x8_2(left1, right1, left3, right3);
transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);