summaryrefslogtreecommitdiff
path: root/vp9
diff options
context:
space:
mode:
authorLinfeng Zhang <linfengz@google.com>2018-03-13 16:10:00 -0700
committerLinfeng Zhang <linfengz@google.com>2018-03-13 16:10:00 -0700
commit88dc0d606255a5cba721c40b078b5802e891df57 (patch)
treedaca30f169d4eec050af0b176540d8698ccbb896 /vp9
parent7b278e30723d7c9e769ed8ac54daf45a721172da (diff)
downloadlibvpx-88dc0d606255a5cba721c40b078b5802e891df57.tar
libvpx-88dc0d606255a5cba721c40b078b5802e891df57.tar.gz
libvpx-88dc0d606255a5cba721c40b078b5802e891df57.tar.bz2
libvpx-88dc0d606255a5cba721c40b078b5802e891df57.zip
Fix a bug in vp9_highbd_iht4x4_16_add_neon()
This bug was introduced in 36363304. BUG=webm:1403 Change-Id: I695b409047e41ab7e0460981524310d78753751a
Diffstat (limited to 'vp9')
-rw-r--r--vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c75
-rw-r--r--vp9/common/vp9_rtcd_defs.pl2
2 files changed, 49 insertions, 28 deletions
diff --git a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
index 46284238d..52c4f1937 100644
--- a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
@@ -23,34 +23,55 @@
static INLINE void highbd_iadst4(int32x4_t *const io) {
const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
const int32x4_t sinpi = vld1q_s32(sinpis);
- int32x4_t s[8];
-
- s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0);
- s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1);
- s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0);
- s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1);
- s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0);
- s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1);
- s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1);
- s[7] = vsubq_s32(io[0], io[2]);
- s[7] = vaddq_s32(s[7], io[3]);
-
- s[0] = vaddq_s32(s[0], s[3]);
- s[0] = vaddq_s32(s[0], s[5]);
- s[1] = vsubq_s32(s[1], s[4]);
- s[1] = vsubq_s32(s[1], s[6]);
+ int64x2x2_t s[7], t[4];
+ int32x4_t s7;
+
+ s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0);
+ s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0);
+ s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1);
+ s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1);
+ s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0);
+ s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0);
+ s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1);
+ s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1);
+ s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0);
+ s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0);
+ s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1);
+ s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1);
+ s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1);
+ s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1);
+ s7 = vsubq_s32(io[0], io[2]);
+ s7 = vaddq_s32(s7, io[3]);
+
+ s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+ s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+ s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]);
+ s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]);
+ s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]);
+ s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]);
+ s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]);
+ s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]);
s[3] = s[2];
- s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0);
-
- io[0] = vaddq_s32(s[0], s[3]);
- io[1] = vaddq_s32(s[1], s[3]);
- io[2] = s[2];
- io[3] = vaddq_s32(s[0], s[1]);
- io[3] = vsubq_s32(io[3], s[3]);
- io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS);
- io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS);
- io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS);
- io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS);
+ s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0);
+ s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0);
+
+ t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+ t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+ t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]);
+ t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]);
+ t[2] = s[2];
+ t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]);
+ t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]);
+ t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]);
+ t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]);
+ io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS));
+ io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS));
+ io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS));
+ io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS));
}
void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 23732e214..fcd8f7e62 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -103,7 +103,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
- specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
+ specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
}