diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/arm/neon/vp9_iht16x16_add_neon.c | 28 | ||||
-rw-r--r-- | vp9/common/arm/neon/vp9_iht_neon.h | 42 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 2 |
3 files changed, 35 insertions, 37 deletions
diff --git a/vp9/common/arm/neon/vp9_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_iht16x16_add_neon.c
index fc9824ae9..a7d5a53c7 100644
--- a/vp9/common/arm/neon/vp9_iht16x16_add_neon.c
+++ b/vp9/common/arm/neon/vp9_iht16x16_add_neon.c
@@ -221,30 +221,10 @@ static void iadst16x16_256_add_half1d(const void *const input, int16_t *output,
   x[15] = sub_dct_const_round_shift_low_8(s13, s15);
 
   // stage 4
-  {
-    const int16x8_t sum = vaddq_s16(x[2], x[3]);
-    const int16x8_t sub = vsubq_s16(x[2], x[3]);
-    x[2] = iadst_half_butterfly_neg_neon(sum, c_16_n16_8_24);
-    x[3] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
-  }
-  {
-    const int16x8_t sum = vaddq_s16(x[7], x[6]);
-    const int16x8_t sub = vsubq_s16(x[7], x[6]);
-    x[6] = iadst_half_butterfly_pos_neon(sum, c_16_n16_8_24);
-    x[7] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
-  }
-  {
-    const int16x8_t sum = vaddq_s16(x[11], x[10]);
-    const int16x8_t sub = vsubq_s16(x[11], x[10]);
-    x[10] = iadst_half_butterfly_pos_neon(sum, c_16_n16_8_24);
-    x[11] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
-  }
-  {
-    const int16x8_t sum = vaddq_s16(x[14], x[15]);
-    const int16x8_t sub = vsubq_s16(x[14], x[15]);
-    x[14] = iadst_half_butterfly_neg_neon(sum, c_16_n16_8_24);
-    x[15] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
-  }
+  iadst_half_butterfly_neg_neon(&x[3], &x[2], c_16_n16_8_24);
+  iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24);
+  iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24);
+  iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24);
 
   out[0] = x[0];
   out[1] = vnegq_s16(x[8]);
diff --git a/vp9/common/arm/neon/vp9_iht_neon.h b/vp9/common/arm/neon/vp9_iht_neon.h
index 965eff36b..b09b96a4d 100644
--- a/vp9/common/arm/neon/vp9_iht_neon.h
+++ b/vp9/common/arm/neon/vp9_iht_neon.h
@@ -74,22 +74,40 @@ static INLINE void iadst_half_butterfly_neon(int16x8_t *const x,
   x[1] = dct_const_round_shift_low_8(t1);
 }
 
-static INLINE int16x8_t iadst_half_butterfly_neg_neon(const int16x8_t in,
-                                                      const int16x4_t c) {
-  int32x4_t t[2];
+static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0,
+                                                 int16x8_t *const x1,
+                                                 const int16x4_t c) {
+  // Don't add/sub before multiply, which will overflow in iadst8.
+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1);
+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1);
+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1);
+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1);
+  int32x4_t t0[2], t1[2];
 
-  t[0] = vmull_lane_s16(vget_low_s16(in), c, 1);
-  t[1] = vmull_lane_s16(vget_high_s16(in), c, 1);
-  return dct_const_round_shift_low_8(t);
+  t0[0] = vaddq_s32(x0_lo, x1_lo);
+  t0[1] = vaddq_s32(x0_hi, x1_hi);
+  t1[0] = vsubq_s32(x0_lo, x1_lo);
+  t1[1] = vsubq_s32(x0_hi, x1_hi);
+  *x1 = dct_const_round_shift_low_8(t0);
+  *x0 = dct_const_round_shift_low_8(t1);
 }
 
-static INLINE int16x8_t iadst_half_butterfly_pos_neon(const int16x8_t in,
-                                                      const int16x4_t c) {
-  int32x4_t t[2];
+static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0,
+                                                 int16x8_t *const x1,
+                                                 const int16x4_t c) {
+  // Don't add/sub before multiply, which will overflow in iadst8.
+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0);
+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0);
+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0);
+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0);
+  int32x4_t t0[2], t1[2];
 
-  t[0] = vmull_lane_s16(vget_low_s16(in), c, 0);
-  t[1] = vmull_lane_s16(vget_high_s16(in), c, 0);
-  return dct_const_round_shift_low_8(t);
+  t0[0] = vaddq_s32(x0_lo, x1_lo);
+  t0[1] = vaddq_s32(x0_hi, x1_hi);
+  t1[0] = vsubq_s32(x0_lo, x1_lo);
+  t1[1] = vsubq_s32(x0_hi, x1_hi);
+  *x1 = dct_const_round_shift_low_8(t0);
+  *x0 = dct_const_round_shift_low_8(t1);
 }
 
 static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 2b15b661c..23732e214 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -69,7 +69,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
   # CONFIG_VP9_HIGHBITDEPTH is off.
   specialize qw/vp9_iht4x4_16_add neon sse2/;
   specialize qw/vp9_iht8x8_64_add neon sse2/;
-  specialize qw/vp9_iht16x16_256_add sse2/;
+  specialize qw/vp9_iht16x16_256_add neon sse2/;
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
     # Note that these specializations are appended to the above ones.
     specialize qw/vp9_iht4x4_16_add dspr2 msa/;