diff options
author | Johann <johannkoenig@google.com> | 2017-06-22 18:36:08 -0700 |
---|---|---|
committer | Johann <johannkoenig@google.com> | 2017-06-28 15:37:44 -0700 |
commit | 9fe510c12ab51c2fb5d4d916f5729e19b8f5ccf4 (patch) | |
tree | 056bf9bb43214743975bd5d106415c9645405cad /vpx_dsp | |
parent | f310ddc4704e8b1cd5ec72472495ee8c3b13a486 (diff) | |
download | libvpx-9fe510c12ab51c2fb5d4d916f5729e19b8f5ccf4.tar libvpx-9fe510c12ab51c2fb5d4d916f5729e19b8f5ccf4.tar.gz libvpx-9fe510c12ab51c2fb5d4d916f5729e19b8f5ccf4.tar.bz2 libvpx-9fe510c12ab51c2fb5d4d916f5729e19b8f5ccf4.zip |
partial fdct neon: add 32x32_1
Always return an int32_t. Since it needs to be moved to a register for
shifting, this doesn't really penalize the smaller transforms.
The values could potentially be summed and shifted in place.
BUG=webm:1424
Change-Id: Id5beb35d79c7574ebd99285fc4182788cf2bb972
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/fdct_partial_neon.c | 39 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 |
2 files changed, 33 insertions, 10 deletions
diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index 4e1a6dfda..3c4b292d7 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -14,16 +14,12 @@ #include "./vpx_config.h" #include "vpx_dsp/arm/mem_neon.h" -static INLINE tran_low_t sum_int16x8(const int16x8_t a) { +static INLINE int32_t sum_int16x8(const int16x8_t a) { const int32x4_t b = vpaddlq_s16(a); const int64x2_t c = vpaddlq_s32(b); const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), vreinterpret_s32_s64(vget_high_s64(c))); -#if CONFIG_VP9_HIGHBITDEPTH return vget_lane_s32(d, 0); -#else - return vget_lane_s16(vreinterpret_s16_s32(d), 0); -#endif } void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { @@ -44,7 +40,7 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { c = vaddq_s16(b0, b1); - output[0] = sum_int16x8(c) << 1; + output[0] = (tran_low_t)(sum_int16x8(c) << 1); output[1] = 0; } @@ -56,7 +52,7 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { sum = vaddq_s16(sum, input_00); } - output[0] = sum_int16x8(sum); + output[0] = (tran_low_t)sum_int16x8(sum); output[1] = 0; } @@ -74,6 +70,33 @@ void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, right = vaddq_s16(right, b); } - output[0] = (sum_int16x8(left) + sum_int16x8(right)) >> 1; + output[0] = (tran_low_t)((sum_int16x8(left) + sum_int16x8(right)) >> 1); + output[1] = 0; +} + +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int r; + int16x8_t a0 = vld1q_s16(input); + int16x8_t a1 = vld1q_s16(input + 8); + int16x8_t a2 = vld1q_s16(input + 16); + int16x8_t a3 = vld1q_s16(input + 24); + input += stride; + for (r = 1; r < 32; ++r) { + const int16x8_t b0 = vld1q_s16(input); + const int16x8_t b1 = vld1q_s16(input + 8); + const int16x8_t b2 = vld1q_s16(input + 16); + const int16x8_t b3 = vld1q_s16(input + 24); + input += stride; + a0 = vaddq_s16(a0, b0); + a1 = vaddq_s16(a1, b1); + a2 = vaddq_s16(a2, b2); + a3 = vaddq_s16(a3, b3); + } + + // TODO(johannkoenig): sum and shift the values in neon registers. + output[0] = (tran_low_t)( + (sum_int16x8(a0) + sum_int16x8(a1) + sum_int16x8(a2) + sum_int16x8(a3)) >> + 3); output[1] = 0; } diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 734f6e1e2..bb83021d7 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -508,7 +508,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct32x32_rd sse2/; add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_1 sse2/; + specialize qw/vpx_fdct32x32_1 sse2 neon/; add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct4x4 sse2/; @@ -558,7 +558,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/; add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_1 sse2 msa/; + specialize qw/vpx_fdct32x32_1 sse2 neon msa/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER |