diff options
author | Konstantinos Margaritis <konstantinos@vectorcamp.gr> | 2022-10-26 21:37:31 +0000 |
---|---|---|
committer | Konstantinos Margaritis <konstantinos@vectorcamp.gr> | 2022-11-01 23:07:27 +0000 |
commit | 3121783fec60d0ce4551d472d1acbd1f1a8253be (patch) | |
tree | 95a53f73adccf711003346860021ddf9ed1a2e0f /vp9/encoder/arm/neon | |
parent | dcb566e69f03eb046180dabf41c4118b249af96f (diff) | |
download | libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.gz libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.tar.bz2 libvpx-3121783fec60d0ce4551d472d1acbd1f1a8253be.zip |
[NEON] Optimize and homogenize Butterfly DCT functions
Provide a set of commonly used Butterfly DCT functions for use in
DCT 4x4, 8x8, 16x16, 32x32 functions. These are provided in various
forms, using vqrdmulh_s16/vqrdmulh_s32 for _fast variants, which
unfortunately are only usable in pass1 of most DCTs, as they do not
provide the necessary precision in pass2.
This gave a performance gain ranging from 5% to 15% in the 16x16 case.
Also, for 32x32, the loads were rearranged; together with the butterfly
optimizations, this gave a 10% gain in the 32x32_rd function.
This refactoring was necessary to allow easier porting of the highbd
32x32 functions, which follows this patchset.
Change-Id: I6282e640b95a95938faff76c3b2bace3dc298bc3
Diffstat (limited to 'vp9/encoder/arm/neon')
-rw-r--r-- | vp9/encoder/arm/neon/vp9_dct_neon.c | 17 |
1 files changed, 12 insertions, 5 deletions
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index a07a1608d..b8286a8dd 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -18,6 +18,8 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, int stride) { @@ -130,12 +132,14 @@ void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, case ADST_DCT: load_buffer_4x4(input, in, stride); fadst4x4_neon(in); - vpx_fdct4x4_pass1_neon((int16x4_t *)in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); write_buffer_4x4(output, in); break; case DCT_ADST: load_buffer_4x4(input, in, stride); - vpx_fdct4x4_pass1_neon((int16x4_t *)in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); fadst4x4_neon(in); write_buffer_4x4(output, in); break; @@ -488,13 +492,15 @@ void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, case ADST_DCT: load_buffer_8x8(input, in, stride); fadst8x8_neon(in); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; case DCT_ADST: load_buffer_8x8(input, in, stride); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); fadst8x8_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); @@ -559,7 +565,8 @@ static void fdct16_8col(int16x8_t *in) { i[6] = vaddq_s16(in[6], in[9]); i[7] = vaddq_s16(in[7], in[8]); - vpx_fdct8x8_pass1_neon(i); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(i); transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]); // step 2 |