summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Linfeng Zhang <linfengz@google.com> 2018-03-05 16:25:08 -0800
committer Linfeng Zhang <linfengz@google.com> 2018-03-13 17:38:29 -0700
commit d8424d2890ddce0d92778e055eee145655cf034e (patch)
tree d1dcb9073db0a70c35b885e30916d92514f07318
parent 88dc0d606255a5cba721c40b078b5802e891df57 (diff)
download libvpx-d8424d2890ddce0d92778e055eee145655cf034e.tar
libvpx-d8424d2890ddce0d92778e055eee145655cf034e.tar.gz
libvpx-d8424d2890ddce0d92778e055eee145655cf034e.tar.bz2
libvpx-d8424d2890ddce0d92778e055eee145655cf034e.zip
Fix a bug in vp9_highbd_iht8x8_64_add_neon
This bug was introduced in 29b6a30c.

BUG=webm:1403

Change-Id: I9e0bf2c7a01d8ff1c714c12236f7985b772b0540
-rw-r--r-- test/dct_test.cc 4
-rw-r--r-- vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c 191
-rw-r--r-- vp9/common/vp9_rtcd_defs.pl 2
3 files changed, 37 insertions, 160 deletions
diff --git a/test/dct_test.cc b/test/dct_test.cc
index bfa05e5d8..66b2add03 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -631,13 +631,9 @@ static const FuncInfo ht_neon_func_info[] = {
#if CONFIG_VP9_HIGHBITDEPTH
{ &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
2 },
-// TODO(linfengz): reenable these functions once test vector failures are
-// addressed.
-#if 0
{ &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
2 },
#endif
-#endif
{ &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
{ &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },
{ &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 }
diff --git a/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
index 498f03530..9b74d270c 100644
--- a/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
@@ -132,84 +132,10 @@ static INLINE int32x4_t sub_dct_const_round_shift_low_8_bd12(
return vcombine_s32(out_lo, out_hi);
}
-static INLINE void iadst8_bd10(int32x4_t *const io0, int32x4_t *const io1,
- int32x4_t *const io2, int32x4_t *const io3,
- int32x4_t *const io4, int32x4_t *const io5,
- int32x4_t *const io6, int32x4_t *const io7) {
- const int32x4_t c0 =
- create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
- const int32x4_t c1 =
- create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
- const int32x4_t c2 =
- create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
- int32x4_t x[8], t[4];
- int32x4_t s[8];
-
- x[0] = *io7;
- x[1] = *io0;
- x[2] = *io5;
- x[3] = *io2;
- x[4] = *io3;
- x[5] = *io4;
- x[6] = *io1;
- x[7] = *io6;
-
- // stage 1
- iadst_butterfly_lane_0_1_bd10_neon(x[0], x[1], vget_low_s32(c0), &s[0],
- &s[1]);
- iadst_butterfly_lane_0_1_bd10_neon(x[2], x[3], vget_high_s32(c0), &s[2],
- &s[3]);
- iadst_butterfly_lane_0_1_bd10_neon(x[4], x[5], vget_low_s32(c1), &s[4],
- &s[5]);
- iadst_butterfly_lane_0_1_bd10_neon(x[6], x[7], vget_high_s32(c1), &s[6],
- &s[7]);
-
- x[0] = add_dct_const_round_shift_low_8_bd10(s[0], s[4]);
- x[1] = add_dct_const_round_shift_low_8_bd10(s[1], s[5]);
- x[2] = add_dct_const_round_shift_low_8_bd10(s[2], s[6]);
- x[3] = add_dct_const_round_shift_low_8_bd10(s[3], s[7]);
- x[4] = sub_dct_const_round_shift_low_8_bd10(s[0], s[4]);
- x[5] = sub_dct_const_round_shift_low_8_bd10(s[1], s[5]);
- x[6] = sub_dct_const_round_shift_low_8_bd10(s[2], s[6]);
- x[7] = sub_dct_const_round_shift_low_8_bd10(s[3], s[7]);
-
- // stage 2
- t[0] = x[0];
- t[1] = x[1];
- t[2] = x[2];
- t[3] = x[3];
- iadst_butterfly_lane_0_1_bd10_neon(x[4], x[5], vget_high_s32(c2), &s[4],
- &s[5]);
- iadst_butterfly_lane_1_0_bd10_neon(x[7], x[6], vget_high_s32(c2), &s[7],
- &s[6]);
-
- x[0] = vaddq_s32(t[0], t[2]);
- x[1] = vaddq_s32(t[1], t[3]);
- x[2] = vsubq_s32(t[0], t[2]);
- x[3] = vsubq_s32(t[1], t[3]);
- x[4] = add_dct_const_round_shift_low_8_bd10(s[4], s[6]);
- x[5] = add_dct_const_round_shift_low_8_bd10(s[5], s[7]);
- x[6] = sub_dct_const_round_shift_low_8_bd10(s[4], s[6]);
- x[7] = sub_dct_const_round_shift_low_8_bd10(s[5], s[7]);
-
- // stage 3
- iadst_half_butterfly_bd10_neon(x + 2, vget_low_s32(c2));
- iadst_half_butterfly_bd10_neon(x + 6, vget_low_s32(c2));
-
- *io0 = x[0];
- *io1 = vnegq_s32(x[4]);
- *io2 = x[6];
- *io3 = vnegq_s32(x[2]);
- *io4 = x[3];
- *io5 = vnegq_s32(x[7]);
- *io6 = x[5];
- *io7 = vnegq_s32(x[1]);
-}
-
-static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1,
- int32x4_t *const io2, int32x4_t *const io3,
- int32x4_t *const io4, int32x4_t *const io5,
- int32x4_t *const io6, int32x4_t *const io7) {
+static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1,
+ int32x4_t *const io2, int32x4_t *const io3,
+ int32x4_t *const io4, int32x4_t *const io5,
+ int32x4_t *const io6, int32x4_t *const io7) {
const int32x4_t c0 =
create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
const int32x4_t c1 =
@@ -394,31 +320,17 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
const int32x4_t cospis1 =
vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28
- if (bd == 10) {
- idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
- &a[4], &a[5], &a[6], &a[7]);
- idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
- &a[12], &a[13], &a[14], &a[15]);
- transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
- &a[11]);
- iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
- transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
- &a[15]);
- iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+ &a[4], &a[5], &a[6], &a[7]);
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+ &a[12], &a[13], &a[14], &a[15]);
+ transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
+ &a[11]);
+ highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
+ transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+ &a[15]);
+ highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
&a[15]);
- } else {
- idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
- &a[4], &a[5], &a[6], &a[7]);
- idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
- &a[12], &a[13], &a[14], &a[15]);
- transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
- &a[11]);
- iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
- transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
- &a[15]);
- iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
- &a[15]);
- }
break;
}
@@ -427,67 +339,36 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
const int32x4_t cospis1 =
vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28
- if (bd == 10) {
- transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
- &a[7]);
- iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
- transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
- &a[14], &a[15]);
- iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+ transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
+ &a[7]);
+ highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+ transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+ &a[15]);
+ highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
&a[15]);
- idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
- &a[2], &a[10], &a[3], &a[11]);
- idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
- &a[6], &a[14], &a[7], &a[15]);
- } else {
- transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
- &a[7]);
- iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
- transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
- &a[14], &a[15]);
- iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
- &a[15]);
- idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
- &a[2], &a[10], &a[3], &a[11]);
- idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
- &a[6], &a[14], &a[7], &a[15]);
- }
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+ &a[2], &a[10], &a[3], &a[11]);
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+ &a[6], &a[14], &a[7], &a[15]);
break;
}
default: {
assert(tx_type == ADST_ADST);
- if (bd == 10) {
- transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
- &a[7]);
- iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
- transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
- &a[14], &a[15]);
- iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+ transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
+ &a[7]);
+ highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+ transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+ &a[15]);
+ highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
&a[15]);
- transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
- &a[11]);
- iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
- transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
- &a[15]);
- iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+ transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
+ &a[11]);
+ highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
+ transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+ &a[15]);
+ highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
&a[15]);
- } else {
- transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
- &a[7]);
- iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
- transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
- &a[14], &a[15]);
- iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
- &a[15]);
- transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
- &a[11]);
- iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
- transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
- &a[15]);
- iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
- &a[15]);
- }
break;
}
}
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index fcd8f7e62..c705c5ef3 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -104,7 +104,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
- specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
+ specialize qw/vp9_highbd_iht8x8_64_add neon sse4_1/;
specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
}
}