summary | refs | log | tree | commit | diff
path: root/vp9/common/arm
diff options
context:
space:
mode:
Diffstat (limited to 'vp9/common/arm')
-rw-r--r-- vp9/common/arm/neon/vp9_iht16x16_add_neon.c 28
-rw-r--r-- vp9/common/arm/neon/vp9_iht_neon.h 42
2 files changed, 34 insertions, 36 deletions
diff --git a/vp9/common/arm/neon/vp9_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_iht16x16_add_neon.c
index fc9824ae9..a7d5a53c7 100644
--- a/vp9/common/arm/neon/vp9_iht16x16_add_neon.c
+++ b/vp9/common/arm/neon/vp9_iht16x16_add_neon.c
@@ -221,30 +221,10 @@ static void iadst16x16_256_add_half1d(const void *const input, int16_t *output,
x[15] = sub_dct_const_round_shift_low_8(s13, s15);
// stage 4
- {
- const int16x8_t sum = vaddq_s16(x[2], x[3]);
- const int16x8_t sub = vsubq_s16(x[2], x[3]);
- x[2] = iadst_half_butterfly_neg_neon(sum, c_16_n16_8_24);
- x[3] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
- }
- {
- const int16x8_t sum = vaddq_s16(x[7], x[6]);
- const int16x8_t sub = vsubq_s16(x[7], x[6]);
- x[6] = iadst_half_butterfly_pos_neon(sum, c_16_n16_8_24);
- x[7] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
- }
- {
- const int16x8_t sum = vaddq_s16(x[11], x[10]);
- const int16x8_t sub = vsubq_s16(x[11], x[10]);
- x[10] = iadst_half_butterfly_pos_neon(sum, c_16_n16_8_24);
- x[11] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
- }
- {
- const int16x8_t sum = vaddq_s16(x[14], x[15]);
- const int16x8_t sub = vsubq_s16(x[14], x[15]);
- x[14] = iadst_half_butterfly_neg_neon(sum, c_16_n16_8_24);
- x[15] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
- }
+ iadst_half_butterfly_neg_neon(&x[3], &x[2], c_16_n16_8_24);
+ iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24);
+ iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24);
+ iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24);
out[0] = x[0];
out[1] = vnegq_s16(x[8]);
diff --git a/vp9/common/arm/neon/vp9_iht_neon.h b/vp9/common/arm/neon/vp9_iht_neon.h
index 965eff36b..b09b96a4d 100644
--- a/vp9/common/arm/neon/vp9_iht_neon.h
+++ b/vp9/common/arm/neon/vp9_iht_neon.h
@@ -74,22 +74,40 @@ static INLINE void iadst_half_butterfly_neon(int16x8_t *const x,
x[1] = dct_const_round_shift_low_8(t1);
}
-static INLINE int16x8_t iadst_half_butterfly_neg_neon(const int16x8_t in,
- const int16x4_t c) {
- int32x4_t t[2];
+static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0,
+ int16x8_t *const x1,
+ const int16x4_t c) {
+ // Don't add/sub before multiply, which will overflow in iadst8.
+ const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1);
+ const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1);
+ const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1);
+ const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1);
+ int32x4_t t0[2], t1[2];
- t[0] = vmull_lane_s16(vget_low_s16(in), c, 1);
- t[1] = vmull_lane_s16(vget_high_s16(in), c, 1);
- return dct_const_round_shift_low_8(t);
+ t0[0] = vaddq_s32(x0_lo, x1_lo);
+ t0[1] = vaddq_s32(x0_hi, x1_hi);
+ t1[0] = vsubq_s32(x0_lo, x1_lo);
+ t1[1] = vsubq_s32(x0_hi, x1_hi);
+ *x1 = dct_const_round_shift_low_8(t0);
+ *x0 = dct_const_round_shift_low_8(t1);
}
-static INLINE int16x8_t iadst_half_butterfly_pos_neon(const int16x8_t in,
- const int16x4_t c) {
- int32x4_t t[2];
+static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0,
+ int16x8_t *const x1,
+ const int16x4_t c) {
+ // Don't add/sub before multiply, which will overflow in iadst8.
+ const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0);
+ const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0);
+ const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0);
+ const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0);
+ int32x4_t t0[2], t1[2];
- t[0] = vmull_lane_s16(vget_low_s16(in), c, 0);
- t[1] = vmull_lane_s16(vget_high_s16(in), c, 0);
- return dct_const_round_shift_low_8(t);
+ t0[0] = vaddq_s32(x0_lo, x1_lo);
+ t0[1] = vaddq_s32(x0_hi, x1_hi);
+ t1[0] = vsubq_s32(x0_lo, x1_lo);
+ t1[1] = vsubq_s32(x0_hi, x1_hi);
+ *x1 = dct_const_round_shift_low_8(t0);
+ *x0 = dct_const_round_shift_low_8(t1);
}
static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0,