diff options
-rw-r--r-- | test/vp9_avg_test.cc | 8 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 2 | ||||
-rw-r--r-- | vp9/encoder/arm/neon/vp9_avg_neon.c | 27 |
3 files changed, 36 insertions, 1 deletions
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc index 1a9b43062..290bdc75e 100644 --- a/test/vp9_avg_test.cc +++ b/test/vp9_avg_test.cc @@ -385,6 +385,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c), make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c), make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c))); + +INSTANTIATE_TEST_CASE_P( + NEON, SatdTest, + ::testing::Values( + make_tuple(16, &vp9_satd_neon), + make_tuple(64, &vp9_satd_neon), + make_tuple(256, &vp9_satd_neon), + make_tuple(1024, &vp9_satd_neon))); #endif #if HAVE_MSA diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 7a2883aba..8fe6503aa 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -210,7 +210,7 @@ add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, specialize qw/vp9_hadamard_16x16 sse2/; add_proto qw/int vp9_satd/, "const int16_t *coeff, int length"; -specialize qw/vp9_satd sse2/; +specialize qw/vp9_satd sse2 neon/; add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height"; specialize qw/vp9_int_pro_row sse2 neon/; diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c index d569ec95d..5996bd426 100644 --- a/vp9/encoder/arm/neon/vp9_avg_neon.c +++ b/vp9/encoder/arm/neon/vp9_avg_neon.c @@ -50,6 +50,33 @@ unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) { return (horizontal_add_u16x8(v_sum) + 32) >> 6; } +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. +int vp9_satd_neon(const int16_t *coeff, int length) { + const int16x4_t zero = vdup_n_s16(0); + int32x4_t accum = vdupq_n_s32(0); + + do { + const int16x8_t src0 = vld1q_s16(coeff); + const int16x8_t src8 = vld1q_s16(coeff + 8); + accum = vabal_s16(accum, vget_low_s16(src0), zero); + accum = vabal_s16(accum, vget_high_s16(src0), zero); + accum = vabal_s16(accum, vget_low_s16(src8), zero); + accum = vabal_s16(accum, vget_high_s16(src8), zero); + length -= 16; + coeff += 16; + } while (length != 0); + + { + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. + const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), + vreinterpret_s32_s64(vget_high_s64(s0))); + const int satd = vget_lane_s32(s1, 0); + return satd; + } +} + void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; |