diff options
author | Linfeng Zhang <linfengz@google.com> | 2018-03-13 16:10:00 -0700 |
---|---|---|
committer | Linfeng Zhang <linfengz@google.com> | 2018-03-13 16:10:00 -0700 |
commit | 88dc0d606255a5cba721c40b078b5802e891df57 (patch) | |
tree | daca30f169d4eec050af0b176540d8698ccbb896 /vp9 | |
parent | 7b278e30723d7c9e769ed8ac54daf45a721172da (diff) | |
download | libvpx-88dc0d606255a5cba721c40b078b5802e891df57.tar libvpx-88dc0d606255a5cba721c40b078b5802e891df57.tar.gz libvpx-88dc0d606255a5cba721c40b078b5802e891df57.tar.bz2 libvpx-88dc0d606255a5cba721c40b078b5802e891df57.zip |
Fix a bug in vp9_highbd_iht4x4_16_add_neon()
This bug was introduced in 36363304.
BUG=webm:1403
Change-Id: I695b409047e41ab7e0460981524310d78753751a
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c | 75 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 2 |
2 files changed, 49 insertions, 28 deletions
diff --git a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c index 46284238d..52c4f1937 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c @@ -23,34 +23,55 @@ static INLINE void highbd_iadst4(int32x4_t *const io) { const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 }; const int32x4_t sinpi = vld1q_s32(sinpis); - int32x4_t s[8]; - - s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0); - s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1); - s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0); - s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1); - s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0); - s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1); - s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1); - s[7] = vsubq_s32(io[0], io[2]); - s[7] = vaddq_s32(s[7], io[3]); - - s[0] = vaddq_s32(s[0], s[3]); - s[0] = vaddq_s32(s[0], s[5]); - s[1] = vsubq_s32(s[1], s[4]); - s[1] = vsubq_s32(s[1], s[6]); + int64x2x2_t s[7], t[4]; + int32x4_t s7; + + s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0); + s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0); + s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1); + s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1); + s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0); + s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1); + s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1); + s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0); + s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0); + s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1); + s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1); + s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1); + s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1); + s7 = vsubq_s32(io[0], io[2]); + s7 = vaddq_s32(s7, io[3]); + + s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]); s[3] = s[2]; - s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0); - - io[0] = vaddq_s32(s[0], s[3]); - io[1] = vaddq_s32(s[1], s[3]); - io[2] = s[2]; - io[3] = vaddq_s32(s[0], s[1]); - io[3] = vsubq_s32(io[3], s[3]); - io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS); - io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS); - io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS); - io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS); + s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0); + + t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]); + t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]); + t[2] = s[2]; + t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]); + t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]); + t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]); + t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]); + io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS)); + io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS)); + io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS)); + io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS)); } void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 23732e214..fcd8f7e62 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -103,7 +103,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { - specialize qw/vp9_highbd_iht4x4_16_add sse4_1/; + specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/; specialize qw/vp9_highbd_iht8x8_64_add sse4_1/; specialize qw/vp9_highbd_iht16x16_256_add sse4_1/; } |