diff options
author | Konstantinos Margaritis <konstantinos@vectorcamp.gr> | 2022-10-26 21:37:31 +0000 |
---|---|---|
committer | Konstantinos Margaritis <konstantinos@vectorcamp.gr> | 2022-11-01 23:07:27 +0000 |
commit | 3121783fec60d0ce4551d472d1acbd1f1a8253be (patch) | |
tree | 95a53f73adccf711003346860021ddf9ed1a2e0f /vpx_dsp/arm/fdct16x16_neon.c | |
parent | dcb566e69f03eb046180dabf41c4118b249af96f (diff) | |
download | libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.gz libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.bz2 libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.zip |
[NEON] Optimize and homogenize Butterfly DCT functions
Provide a set of commonly used Butterfly DCT functions for use in
DCT 4x4, 8x8, 16x16, 32x32 functions. These are provided in various
forms, using vqrdmulh_s16/vqrdmulh_s32 for _fast variants, which
unfortunately are only usable in pass1 of most DCTs, as they do not
provide the necessary precision in pass2.
This gave a performance gain ranging from 5% to 15% in the 16x16 case.
Also, for 32x32, the loads were rearranged; together with the butterfly
optimizations, this gave a 10% gain in the 32x32_rd function.
This refactoring was necessary to allow easier porting of the highbd
32x32 functions, which will follow this patchset.
Change-Id: I6282e640b95a95938faff76c3b2bace3dc298bc3
Diffstat (limited to 'vpx_dsp/arm/fdct16x16_neon.c')
-rw-r--r-- | vpx_dsp/arm/fdct16x16_neon.c | 18 |
1 files changed, 10 insertions, 8 deletions
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index d0c07d429..a458ecaa4 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -37,20 +37,21 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Left half. load_cross(input, stride, temp0); scale_input(temp0, temp1); - vpx_fdct16x16_body(temp1, temp0); + vpx_fdct8x16_body(temp1, temp0); // Right half. load_cross(input + 8, stride, temp1); scale_input(temp1, temp2); - vpx_fdct16x16_body(temp2, temp1); + vpx_fdct8x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to // process to the top half. + transpose_s16_8x8_new(&temp0[0], &temp2[0]); transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); - vpx_fdct16x16_body(temp3, temp2); + vpx_fdct8x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], @@ -62,11 +63,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); cross_input(temp1, temp0); - vpx_fdct16x16_body(temp0, temp1); + vpx_fdct8x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], @@ -86,12 +88,12 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, // Left half. 
load_cross(input, stride, temp0); highbd_scale_input(temp0, left1, right1); - vpx_highbd_fdct16x16_body(left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); // right half. load_cross(input + 8, stride, temp0); highbd_scale_input(temp0, left2, right2); - vpx_highbd_fdct16x16_body(left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); // Transpose top left and top right quarters into one contiguous location to // process to the top half. @@ -103,14 +105,14 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, highbd_partial_round_shift(left3, right3); highbd_cross_input(left3, right3, left1, right1); - vpx_highbd_fdct16x16_body(left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. highbd_partial_round_shift(left4, right4); highbd_cross_input(left4, right4, left2, right2); - vpx_highbd_fdct16x16_body(left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); transpose_s32_8x8_2(left1, right1, left3, right3); transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); |