diff options
author | James Zern <jzern@google.com> | 2016-12-07 20:26:43 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2016-12-07 20:26:44 +0000 |
commit | f16a0a1aa446930cf7e5e3c5b6a4c9721b1654cb (patch) | |
tree | 688f230a4ea03b85e63c5d1569f7f5d18e720531 /vpx_dsp | |
parent | 17c403d0ab58fc66072150b94f45420b2e7ddafa (diff) | |
parent | 2d3d95f7ac379b3927eed7519568d8bee63f56e4 (diff) | |
download | libvpx-f16a0a1aa446930cf7e5e3c5b6a4c9721b1654cb.tar libvpx-f16a0a1aa446930cf7e5e3c5b6a4c9721b1654cb.tar.gz libvpx-f16a0a1aa446930cf7e5e3c5b6a4c9721b1654cb.tar.bz2 libvpx-f16a0a1aa446930cf7e5e3c5b6a4c9721b1654cb.zip |
Merge "enable vpx_idct16x16_256_add_neon in hbd builds"
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/idct16x16_add_neon.asm | 64 | ||||
-rw-r--r-- | vpx_dsp/arm/idct16x16_add_neon.c | 221 | ||||
-rw-r--r-- | vpx_dsp/arm/idct16x16_neon.c | 30 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 |
4 files changed, 254 insertions, 63 deletions
diff --git a/vpx_dsp/arm/idct16x16_add_neon.asm b/vpx_dsp/arm/idct16x16_add_neon.asm index 05fce054c..9d3627a90 100644 --- a/vpx_dsp/arm/idct16x16_add_neon.asm +++ b/vpx_dsp/arm/idct16x16_add_neon.asm @@ -8,8 +8,14 @@ ; be found in the AUTHORS file in the root of the source tree. ; + INCLUDE ./vpx_config.asm + EXPORT |vpx_idct16x16_256_add_neon_pass1| EXPORT |vpx_idct16x16_256_add_neon_pass2| + IF CONFIG_VP9_HIGHBITDEPTH + EXPORT |vpx_idct16x16_256_add_neon_pass1_tran_low| + EXPORT |vpx_idct16x16_256_add_neon_pass2_tran_low| + ENDIF EXPORT |vpx_idct16x16_10_add_neon_pass1| EXPORT |vpx_idct16x16_10_add_neon_pass2| ARM @@ -60,6 +66,7 @@ vld2.s16 {q1,q2}, [r0]! vmov.s16 q15, q1 +idct16x16_256_add_neon_pass1 ; cospi_28_64 = 3196 movw r3, #0x0c7c @@ -255,6 +262,28 @@ bx lr ENDP ; |vpx_idct16x16_256_add_neon_pass1| +IF CONFIG_VP9_HIGHBITDEPTH +;void |vpx_idct16x16_256_add_neon_pass1_tran_low|(const tran_low_t *input, +; int16_t *output) +; +; r0 const tran_low_t *input +; r1 int16_t *output + +|vpx_idct16x16_256_add_neon_pass1_tran_low| PROC + LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 + vmov.s16 q15, q1 + + b idct16x16_256_add_neon_pass1 + ENDP ; |vpx_idct16x16_256_add_neon_pass1_tran_low| +ENDIF ; CONFIG_VP9_HIGHBITDEPTH + ;void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, ; int16_t *output, ; int16_t *pass1_output, @@ -273,8 +302,6 @@ ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. |vpx_idct16x16_256_add_neon_pass2| PROC - push {r3-r9} - ; TODO(hkuang): Find a better way to load the elements. ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 vld2.s16 {q8,q9}, [r0]! @@ -287,6 +314,9 @@ vld2.s16 {q0,q1}, [r0]! vmov.s16 q15, q0; +idct16x16_256_add_neon_pass2 + push {r3-r9} + ; cospi_30_64 = 1606 movw r3, #0x0646 @@ -755,6 +785,36 @@ end_idct16x16_pass2 bx lr ENDP ; |vpx_idct16x16_256_add_neon_pass2| +IF CONFIG_VP9_HIGHBITDEPTH +;void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, +; int16_t *output, +; int16_t *pass1_output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 const tran_low_t *src +; r1 int16_t *output +; r2 int16_t *pass1_output +; r3 int16_t skip_adding +; r4 uint8_t *dest +; r5 int dest_stride + +|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC + LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 + vmov.s16 q15, q0 + + b idct16x16_256_add_neon_pass2 + ENDP ; |vpx_idct16x16_256_add_neon_pass2_tran_low| +ENDIF ; CONFIG_VP9_HIGHBITDEPTH + ;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input, ; int16_t *output) ; diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c index d101bba41..b37063443 100644 --- a/vpx_dsp/arm/idct16x16_add_neon.c +++ b/vpx_dsp/arm/idct16x16_add_neon.c @@ -13,7 +13,11 @@ #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" -void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) { +static void idct16x16_256_add_neon_pass1(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + int16_t *out) { int16x4_t d0s16, d1s16, d2s16, d3s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; @@ -22,31 +26,15 @@ void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) { int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; + q8s16 = s0; + q9s16 = s1; + q10s16 = s2; + q11s16 = s3; + q12s16 = s4; + q13s16 = s5; + q14s16 = s6; + q15s16 = s7; transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); @@ -211,10 +199,78 @@ void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) { vst1q_s16(out, q15s16); } -void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out, - int16_t *pass1_output, - int16_t skip_adding, uint8_t *dest, - int dest_stride) { +void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8x2_t v; + + v = vld2q_s16(in); + s0 = v.val[0]; + in += 16; + v = vld2q_s16(in); + s1 = v.val[0]; + in += 16; + v = vld2q_s16(in); + s2 = v.val[0]; + in += 16; + v = vld2q_s16(in); + s3 = v.val[0]; + in += 16; + v = vld2q_s16(in); + s4 = v.val[0]; + in += 16; + v = vld2q_s16(in); + s5 = v.val[0]; + in += 16; + v = vld2q_s16(in); + s6 = v.val[0]; + in += 16; + v = vld2q_s16(in); + s7 = v.val[0]; + + idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *in, + int16_t *out) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8x2_t v; + + v = load_tran_low_to_s16x2q(in); + s0 = v.val[0]; + in += 16; + v = load_tran_low_to_s16x2q(in); + s1 = v.val[0]; + in += 16; + v = load_tran_low_to_s16x2q(in); + s2 = v.val[0]; + in += 16; + v = load_tran_low_to_s16x2q(in); + s3 = v.val[0]; + in += 16; + v = load_tran_low_to_s16x2q(in); + s4 = v.val[0]; + in += 16; + v = load_tran_low_to_s16x2q(in); + s5 = v.val[0]; + in += 16; + v = load_tran_low_to_s16x2q(in); + s6 = v.val[0]; + in += 16; + v = load_tran_low_to_s16x2q(in); + s7 = v.val[0]; + + idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void idct16x16_256_add_neon_pass2(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + int16_t *out, int16_t *pass1_output, + int16_t skip_adding, uint8_t *dest, + int dest_stride) { uint8_t *d; uint8x8_t d12u8, d13u8; int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; @@ -229,31 +285,15 @@ void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out, int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; + q8s16 = s0; + q9s16 = s1; + q10s16 = s2; + q11s16 = s3; + q12s16 = s4; + q13s16 = s5; + q14s16 = s6; + q15s16 = s7; transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); @@ -760,6 +800,81 @@ void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out, } } +void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out, + int16_t *pass1_output, + int16_t skip_adding, uint8_t *dest, + int dest_stride) { + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, + q14s16, q15s16, out, pass1_output, skip_adding, + dest, dest_stride); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, + int16_t *out, + int16_t *pass1_output, + int16_t skip_adding, + uint8_t *dest, int dest_stride) { + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int16x8x2_t q0x2s16; + + q0x2s16 = load_tran_low_to_s16x2q(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = load_tran_low_to_s16x2q(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = load_tran_low_to_s16x2q(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = load_tran_low_to_s16x2q(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = load_tran_low_to_s16x2q(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = load_tran_low_to_s16x2q(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = load_tran_low_to_s16x2q(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = load_tran_low_to_s16x2q(src); + q15s16 = q0x2s16.val[0]; + + idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, + q14s16, q15s16, out, pass1_output, skip_adding, + dest, dest_stride); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *in, int16_t *out) { int16x4_t d4s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; diff --git a/vpx_dsp/arm/idct16x16_neon.c b/vpx_dsp/arm/idct16x16_neon.c index 8eae549bb..bcbbf4b6d 100644 --- a/vpx_dsp/arm/idct16x16_neon.c +++ b/vpx_dsp/arm/idct16x16_neon.c @@ -16,6 +16,21 @@ void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, int16_t *pass1_output, int16_t skip_adding, uint8_t *dest, int dest_stride); +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input, + int16_t *output); +void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, + int16_t *output, + int16_t *pass1_output, + int16_t skip_adding, + uint8_t *dest, int dest_stride); +#else +#define vpx_idct16x16_256_add_neon_pass1_tran_low \ + vpx_idct16x16_256_add_neon_pass1 +#define vpx_idct16x16_256_add_neon_pass2_tran_low \ + vpx_idct16x16_256_add_neon_pass2 +#endif + void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output); void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, int16_t *pass1_output); @@ -26,7 +41,7 @@ extern void vpx_push_neon(int64_t *store); extern void vpx_pop_neon(int64_t *store); #endif // HAVE_NEON_ASM -void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; @@ -42,24 +57,25 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input, pass1_output); + vpx_idct16x16_256_add_neon_pass1_tran_low(input, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); + vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output, + pass1_output, 0, dest, dest_stride); /* Parallel idct on the lower 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output); + vpx_idct16x16_256_add_neon_pass1_tran_low(input + 8 * 16, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, - pass1_output, 0, dest, dest_stride); + vpx_idct16x16_256_add_neon_pass2_tran_low(input + 8 * 16 + 1, + row_idct_output + 8, pass1_output, + 0, dest, dest_stride); /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index fd5d6b48f..6d116559d 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -687,7 +687,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct8x8_1_add neon sse2/; add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct16x16_256_add sse2/; + specialize qw/vpx_idct16x16_256_add neon sse2/; add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct16x16_10_add neon sse2/; |