diff options
author | Linfeng Zhang <linfengz@google.com> | 2016-08-03 11:42:33 -0700 |
---|---|---|
committer | Linfeng Zhang <linfengz@google.com> | 2016-08-12 09:58:17 -0700 |
commit | f09b5a33285762a0a6ab5d81d5804aeb52523276 (patch) | |
tree | 50bfe3794dc6acd66f79b7201fd227a4cb31eb9c | |
parent | f1e12c1bf3b24e67c8c1542c87c0cd84f85b88fc (diff) | |
download | libvpx-f09b5a33285762a0a6ab5d81d5804aeb52523276.tar libvpx-f09b5a33285762a0a6ab5d81d5804aeb52523276.tar.gz libvpx-f09b5a33285762a0a6ab5d81d5804aeb52523276.tar.bz2 libvpx-f09b5a33285762a0a6ab5d81d5804aeb52523276.zip |
NEON intrinsics for 4 loopfilter functions
New NEON intrinsics functions:
vpx_lpf_horizontal_edge_8_neon()
vpx_lpf_horizontal_edge_16_neon()
vpx_lpf_vertical_16_neon()
vpx_lpf_vertical_16_dual_neon()
BUG=webm:1262, webm:1263, webm:1264, webm:1265.
Change-Id: I7a2aff2a358b22277429329adec606e08efbc8cb
-rw-r--r-- | test/lpf_8_test.cc | 6 | ||||
-rw-r--r-- | vpx_dsp/arm/loopfilter_mb_neon.asm | 65 | ||||
-rw-r--r-- | vpx_dsp/arm/loopfilter_mb_neon.c | 446 | ||||
-rw-r--r-- | vpx_dsp/arm/loopfilter_neon.c | 7 | ||||
-rw-r--r-- | vpx_dsp/arm/transpose_neon.h | 215 | ||||
-rw-r--r-- | vpx_dsp/loopfilter.c | 4 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 12 |
8 files changed, 718 insertions, 38 deletions
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index 1ab04d935..27bedb8c7 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc @@ -520,9 +520,6 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON, Loop8Test6Param, ::testing::Values( -// Using #if inside the macro is unsupported on MSVS but the tests are not -// currently built for MSVS with ARM and NEON. -#if HAVE_NEON_ASM make_tuple(&vpx_lpf_horizontal_edge_8_neon, &vpx_lpf_horizontal_edge_8_c, 8), make_tuple(&vpx_lpf_horizontal_edge_16_neon, @@ -530,13 +527,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_lpf_vertical_16_neon, &vpx_lpf_vertical_16_c, 8), make_tuple(&vpx_lpf_vertical_16_dual_neon, &vpx_lpf_vertical_16_dual_c, 8), -#endif // HAVE_NEON_ASM make_tuple(&vpx_lpf_horizontal_8_neon, &vpx_lpf_horizontal_8_c, 8), make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8))); INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param, ::testing::Values( +// Using #if inside the macro is unsupported on MSVS but the tests are not +// currently built for MSVS with ARM and NEON. #if HAVE_NEON_ASM make_tuple(&vpx_lpf_horizontal_8_dual_neon, &vpx_lpf_horizontal_8_dual_c, 8), diff --git a/vpx_dsp/arm/loopfilter_mb_neon.asm b/vpx_dsp/arm/loopfilter_mb_neon.asm index d5da7a840..5279ecfb7 100644 --- a/vpx_dsp/arm/loopfilter_mb_neon.asm +++ b/vpx_dsp/arm/loopfilter_mb_neon.asm @@ -11,6 +11,7 @@ EXPORT |vpx_lpf_horizontal_edge_8_neon| EXPORT |vpx_lpf_horizontal_edge_16_neon| EXPORT |vpx_lpf_vertical_16_neon| + EXPORT |vpx_lpf_vertical_16_dual_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 @@ -146,20 +147,21 @@ h_next b mb_lpf_horizontal_edge ENDP ; |vpx_lpf_horizontal_edge_16_neon| -; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) +; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, const uint8_t *thresh, +; int count) { ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -|vpx_lpf_vertical_16_neon| PROC +; r12 int count +|mb_lpf_vertical_edge_w| PROC push {r4-r8, lr} vpush {d8-d15} ldr r4, [sp, #88] ; load thresh +v_count vld1.8 {d16[]}, [r2] ; load *blimit vld1.8 {d17[]}, [r3] ; load *limit vld1.8 {d18[]}, [r4] ; load *thresh @@ -212,20 +214,21 @@ h_next ; flat && mask were not set for any of the channels. Just store the values ; from filter. - sub r8, r0, #2 + sub r0, #2 vswp d23, d25 - vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 - vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 - vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 - vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 - vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 - vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 - vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 - vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1 + add r0, #2 - b v_end + b v_next v_mbfilter tst r7, #2 @@ -252,7 +255,7 @@ v_mbfilter vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 - b v_end + b v_next v_wide_mbfilter sub r8, r0, #8 @@ -304,12 +307,40 @@ v_wide_mbfilter vst1.8 {d19}, [r8@64], r1 vst1.8 {d15}, [r0@64], r1 -v_end +v_next + subs r12, #1 + bne v_count + vpop {d8-d15} pop {r4-r8, pc} + ENDP ; |mb_lpf_vertical_edge_w| + +; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_vertical_16_neon| PROC + mov r12, #1 + b mb_lpf_vertical_edge_w ENDP ; |vpx_lpf_vertical_16_neon| +; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_vertical_16_dual_neon| PROC + mov r12, #2 + b mb_lpf_vertical_edge_w + ENDP ; |vpx_lpf_vertical_16_dual_neon| + ; void vpx_wide_mbfilter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the ; necessary load, transpose (if necessary) and store. diff --git a/vpx_dsp/arm/loopfilter_mb_neon.c b/vpx_dsp/arm/loopfilter_mb_neon.c new file mode 100644 index 000000000..6d5d8e71a --- /dev/null +++ b/vpx_dsp/arm/loopfilter_mb_neon.c @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// Should we apply any filter at all: 11111111 yes, 00000000 no +static INLINE uint8x8_t filter_mask( + const uint8x8_t limit, const uint8x8_t blimit, const uint8x8_t thresh, + const uint8x8_t p3, const uint8x8_t p2, const uint8x8_t p1, + const uint8x8_t p0, const uint8x8_t q0, const uint8x8_t q1, + const uint8x8_t q2, const uint8x8_t q3, uint8x8_t *flat, uint8x8_t *hev) { + uint8x8_t t0, t1; + uint8x8_t max = vabd_u8(p1, p0); + max = vmax_u8(max, vabd_u8(q1, q0)); + + // Is there high edge variance internal edge: 11111111 yes, 00000000 no + *hev = vcgt_u8(max, thresh); + *flat = vmax_u8(max, vabd_u8(p2, p0)); + max = vmax_u8(max, vabd_u8(p3, p2)); + max = vmax_u8(max, vabd_u8(p2, p1)); + max = vmax_u8(max, vabd_u8(q2, q1)); + max = vmax_u8(max, vabd_u8(q3, q2)); + t0 = vabd_u8(p0, q0); + t1 = vabd_u8(p1, q1); + t0 = vqshl_n_u8(t0, 1); + t1 = vshr_n_u8(t1, 1); + t0 = vqadd_u8(t0, t1); + max = vcle_u8(max, limit); + t0 = vcle_u8(t0, blimit); + max = vand_u8(max, t0); + + *flat = vmax_u8(*flat, vabd_u8(q2, q0)); + *flat = vmax_u8(*flat, vabd_u8(p3, p0)); + *flat = vmax_u8(*flat, vabd_u8(q3, q0)); + *flat = vcle_u8(*flat, vdup_n_u8(1)); // flat_mask4() + + return max; +} + +static INLINE uint8x8_t flat_mask5(const uint8x8_t p4, const uint8x8_t p3, + const uint8x8_t p2, const uint8x8_t p1, + const uint8x8_t p0, const uint8x8_t q0, + const uint8x8_t q1, const uint8x8_t q2, + const uint8x8_t q3, const uint8x8_t q4) { + uint8x8_t max = vabd_u8(p4, p0); + max = vmax_u8(max, vabd_u8(p3, p0)); + max = vmax_u8(max, vabd_u8(p2, p0)); + max = vmax_u8(max, vabd_u8(p1, p0)); + max = vmax_u8(max, vabd_u8(q1, q0)); + max = vmax_u8(max, vabd_u8(q2, q0)); + max = vmax_u8(max, vabd_u8(q3, q0)); + max = vmax_u8(max, vabd_u8(q4, q0)); + max = vcle_u8(max, vdup_n_u8(1)); + + return max; +} + +static INLINE int8x8_t flip_sign(const uint8x8_t v) { + const uint8x8_t sign_bit = vdup_n_u8(0x80); + return vreinterpret_s8_u8(veor_u8(v, sign_bit)); +} + +static INLINE uint8x8_t flip_sign_back(const int8x8_t v) { + const int8x8_t sign_bit = vdup_n_s8(0x80); + return vreinterpret_u8_s8(veor_s8(v, sign_bit)); +} + +static INLINE uint8x8_t filter_tap7(const uint8x8_t flat, const uint8x8_t sub0, + const uint8x8_t sub1, const uint8x8_t add0, + const uint8x8_t add1, const uint8x8_t in, + uint16x8_t *sum) { + *sum = vsubw_u8(*sum, sub0); + *sum = vsubw_u8(*sum, sub1); + *sum = vaddw_u8(*sum, add0); + *sum = vaddw_u8(*sum, add1); + return vbsl_u8(flat, vrshrn_n_u16(*sum, 3), in); +} + +static INLINE uint8x8_t filter_tap15(const uint8x8_t flat, const uint8x8_t sub0, + const uint8x8_t sub1, const uint8x8_t add0, + const uint8x8_t add1, const uint8x8_t in, + uint16x8_t *sum) { + *sum = vsubw_u8(*sum, sub0); + *sum = vsubw_u8(*sum, sub1); + *sum = vaddw_u8(*sum, add0); + *sum = vaddw_u8(*sum, add1); + return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in); +} + +// 7-tap filter [1, 1, 1, 2, 1, 1, 1] +static INLINE void apply_7_tap_filter(const uint8x8_t flat, const uint8x8_t p3, + const uint8x8_t p2, const uint8x8_t p1, + const uint8x8_t p0, const uint8x8_t q0, + const uint8x8_t q1, const uint8x8_t q2, + const uint8x8_t q3, uint8x8_t *op2, + uint8x8_t *op1, uint8x8_t *op0, + uint8x8_t *oq0, uint8x8_t *oq1, + uint8x8_t *oq2) { + uint16x8_t sum; + sum = vaddl_u8(p3, p3); // 2*p3 + sum = vaddw_u8(sum, p3); // 3*p3 + sum = vaddw_u8(sum, p2); // 3*p3+p2 + sum = vaddw_u8(sum, p2); // 3*p3+2*p2 + sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1 + sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0 + sum = vaddw_u8(sum, q0); // 3*p3+2*p2+p1+p0+q0 + *op2 = vbsl_u8(flat, vrshrn_n_u16(sum, 3), p2); + *op1 = filter_tap7(flat, p3, p2, p1, q1, *op1, &sum); + *op0 = filter_tap7(flat, p3, p1, p0, q2, *op0, &sum); + *oq0 = filter_tap7(flat, p3, p0, q0, q3, *oq0, &sum); + *oq1 = filter_tap7(flat, p2, q0, q1, q3, *oq1, &sum); + *oq2 = filter_tap7(flat, p1, q1, q2, q3, q2, &sum); +} + +// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] +static INLINE void apply_15_tap_filter( + const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6, + const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3, + const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0, + const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2, + const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5, + const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5, + uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1, + uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2, + uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) { + uint16x8_t sum; + sum = vshll_n_u8(p7, 3); // 8*p7 + sum = vsubw_u8(sum, p7); // 7*p7 + sum = vaddw_u8(sum, p6); // 7*p7+p6 + sum = vaddw_u8(sum, p6); // 7*p7+2*p6 + sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5 + sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4 + sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3 + sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 + sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6); + *op5 = filter_tap15(flat2, p7, p6, p5, q1, p5, &sum); + *op4 = filter_tap15(flat2, p7, p5, p4, q2, p4, &sum); + *op3 = filter_tap15(flat2, p7, p4, p3, q3, p3, &sum); + *op2 = filter_tap15(flat2, p7, p3, p2, q4, *op2, &sum); + *op1 = filter_tap15(flat2, p7, p2, p1, q5, *op1, &sum); + *op0 = filter_tap15(flat2, p7, p1, p0, q6, *op0, &sum); + *oq0 = filter_tap15(flat2, p7, p0, q0, q7, *oq0, &sum); + *oq1 = filter_tap15(flat2, p6, q0, q1, q7, *oq1, &sum); + *oq2 = filter_tap15(flat2, p5, q1, q2, q7, *oq2, &sum); + *oq3 = filter_tap15(flat2, p4, q2, q3, q7, q3, &sum); + *oq4 = filter_tap15(flat2, p3, q3, q4, q7, q4, &sum); + *oq5 = filter_tap15(flat2, p2, q4, q5, q7, q5, &sum); + *oq6 = filter_tap15(flat2, p1, q5, q6, q7, q6, &sum); +} + +static INLINE void filter16( + const uint8x8_t mask, const uint8x8_t flat, const uint64_t flat_u64, + const uint8x8_t flat2, const uint64_t flat2_u64, const uint8x8_t hev, + const uint8x8_t p7, const uint8x8_t p6, const uint8x8_t p5, + const uint8x8_t p4, const uint8x8_t p3, const uint8x8_t p2, + const uint8x8_t p1, const uint8x8_t p0, const uint8x8_t q0, + const uint8x8_t q1, const uint8x8_t q2, const uint8x8_t q3, + const uint8x8_t q4, const uint8x8_t q5, const uint8x8_t q6, + const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5, uint8x8_t *op4, + uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1, uint8x8_t *op0, + uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2, uint8x8_t *oq3, + uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) { + // add outer taps if we have high edge variance + if (flat_u64 != (uint64_t)-1) { + int8x8_t filter, filter1, filter2, t; + int8x8_t ps1 = flip_sign(p1); + int8x8_t ps0 = flip_sign(p0); + int8x8_t qs0 = flip_sign(q0); + int8x8_t qs1 = flip_sign(q1); + + filter = vqsub_s8(ps1, qs1); + filter = vand_s8(filter, vreinterpret_s8_u8(hev)); + t = vqsub_s8(qs0, ps0); + + // inner taps + filter = vqadd_s8(filter, t); + filter = vqadd_s8(filter, t); + filter = vqadd_s8(filter, t); + filter = vand_s8(filter, vreinterpret_s8_u8(mask)); + + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way + filter1 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(4)), 3); + filter2 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(3)), 3); + + qs0 = vqsub_s8(qs0, filter1); + ps0 = vqadd_s8(ps0, filter2); + *oq0 = flip_sign_back(qs0); + *op0 = flip_sign_back(ps0); + + // outer tap adjustments + filter = vrshr_n_s8(filter1, 1); + filter = vbic_s8(filter, vreinterpret_s8_u8(hev)); + + qs1 = vqsub_s8(qs1, filter); + ps1 = vqadd_s8(ps1, filter); + *oq1 = flip_sign_back(qs1); + *op1 = flip_sign_back(ps1); + } + + if (flat_u64) { + *op2 = p2; + *oq2 = q2; + if (flat2_u64 != (uint64_t)-1) { + apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, + oq0, oq1, oq2); + } + if (flat2_u64) { + apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, + q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, + oq0, oq1, oq2, oq3, oq4, oq5, oq6); + } + } +} + +static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count) { + const uint8x8_t blimit_u8x8 = vld1_dup_u8(blimit); + const uint8x8_t limit_u8x8 = vld1_dup_u8(limit); + const uint8x8_t thresh_u8x8 = vld1_dup_u8(thresh); + + do { + const uint8x8_t p7 = vld1_u8(s - 8 * p); + const uint8x8_t p6 = vld1_u8(s - 7 * p); + const uint8x8_t p5 = vld1_u8(s - 6 * p); + const uint8x8_t p4 = vld1_u8(s - 5 * p); + const uint8x8_t p3 = vld1_u8(s - 4 * p); + const uint8x8_t p2 = vld1_u8(s - 3 * p); + const uint8x8_t p1 = vld1_u8(s - 2 * p); + const uint8x8_t p0 = vld1_u8(s - 1 * p); + const uint8x8_t q0 = vld1_u8(s + 0 * p); + const uint8x8_t q1 = vld1_u8(s + 1 * p); + const uint8x8_t q2 = vld1_u8(s + 2 * p); + const uint8x8_t q3 = vld1_u8(s + 3 * p); + const uint8x8_t q4 = vld1_u8(s + 4 * p); + const uint8x8_t q5 = vld1_u8(s + 5 * p); + const uint8x8_t q6 = vld1_u8(s + 6 * p); + const uint8x8_t q7 = vld1_u8(s + 7 * p); + uint8x8_t op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, + oq6, flat, hev; + const uint8x8_t mask = filter_mask(limit_u8x8, blimit_u8x8, thresh_u8x8, p3, + p2, p1, p0, q0, q1, q2, q3, &flat, &hev); + uint8x8_t flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7); + uint64_t flat_u64, flat2_u64; + + flat = vand_u8(flat, mask); + flat2 = vand_u8(flat2, flat); + flat_u64 = vget_lane_u64(vreinterpret_u64_u8(flat), 0); + flat2_u64 = vget_lane_u64(vreinterpret_u64_u8(flat2), 0); + + filter16(mask, flat, flat_u64, flat2, flat2_u64, hev, p7, p6, p5, p4, p3, + p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, + &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6); + + if (flat_u64) { + if (flat2_u64) { + vst1_u8(s - 7 * p, op6); + vst1_u8(s - 6 * p, op5); + vst1_u8(s - 5 * p, op4); + vst1_u8(s - 4 * p, op3); + vst1_u8(s + 3 * p, oq3); + vst1_u8(s + 4 * p, oq4); + vst1_u8(s + 5 * p, oq5); + vst1_u8(s + 6 * p, oq6); + } + vst1_u8(s - 3 * p, op2); + vst1_u8(s + 2 * p, oq2); + } + vst1_u8(s - 2 * p, op1); + vst1_u8(s - 1 * p, op0); + vst1_u8(s + 0 * p, oq0); + vst1_u8(s + 1 * p, oq1); + s += 8; + } while (--count); +} + +void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +} + +void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); +} + +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + const uint8x8_t blimit_u8x8 = vld1_dup_u8(blimit); + const uint8x8_t limit_u8x8 = vld1_dup_u8(limit); + const uint8x8_t thresh_u8x8 = vld1_dup_u8(thresh); + uint8_t *d; + + s -= 8; + d = s; + do { + uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; + uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, + op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6, + flat, hev, mask, flat2; + uint64_t flat_u64, flat2_u64; + + t0 = vld1q_u8(s); + s += p; + t1 = vld1q_u8(s); + s += p; + t2 = vld1q_u8(s); + s += p; + t3 = vld1q_u8(s); + s += p; + t4 = vld1q_u8(s); + s += p; + t5 = vld1q_u8(s); + s += p; + t6 = vld1q_u8(s); + s += p; + t7 = vld1q_u8(s); + s += p; + + transpose_u8_16x8(t0, t1, t2, t3, t4, t5, t6, t7, &p7, &p6, &p5, &p4, &p3, + &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + + mask = filter_mask(limit_u8x8, blimit_u8x8, thresh_u8x8, p3, p2, p1, p0, q0, + q1, q2, q3, &flat, &hev); + flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7); + flat = vand_u8(flat, mask); + flat2 = vand_u8(flat2, flat); + flat_u64 = vget_lane_u64(vreinterpret_u64_u8(flat), 0); + flat2_u64 = vget_lane_u64(vreinterpret_u64_u8(flat2), 0); + + filter16(mask, flat, flat_u64, flat2, flat2_u64, hev, p7, p6, p5, p4, p3, + p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, + &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6); + + if (flat_u64) { + if (flat2_u64) { + uint8x16_t o0, o1, o2, o3, o4, o5, o6, o7; + transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, + oq3, oq4, oq5, oq6, q7, &o0, &o1, &o2, &o3, &o4, &o5, + &o6, &o7); + + vst1q_u8(d, o0); + d += p; + vst1q_u8(d, o1); + d += p; + vst1q_u8(d, o2); + d += p; + vst1q_u8(d, o3); + d += p; + vst1q_u8(d, o4); + d += p; + vst1q_u8(d, o5); + d += p; + vst1q_u8(d, o6); + d += p; + vst1q_u8(d, o7); + d += p; + } else { + uint8x8x3_t o0, o1; + d += 8; + o0.val[0] = op2; + o0.val[1] = op1; + o0.val[2] = op0; + o1.val[0] = oq0; + o1.val[1] = oq1; + o1.val[2] = oq2; + vst3_lane_u8(d - 3, o0, 0); + vst3_lane_u8(d + 0, o1, 0); + d += p; + vst3_lane_u8(d - 3, o0, 1); + vst3_lane_u8(d + 0, o1, 1); + d += p; + vst3_lane_u8(d - 3, o0, 2); + vst3_lane_u8(d + 0, o1, 2); + d += p; + vst3_lane_u8(d - 3, o0, 3); + vst3_lane_u8(d + 0, o1, 3); + d += p; + vst3_lane_u8(d - 3, o0, 4); + vst3_lane_u8(d + 0, o1, 4); + d += p; + vst3_lane_u8(d - 3, o0, 5); + vst3_lane_u8(d + 0, o1, 5); + d += p; + vst3_lane_u8(d - 3, o0, 6); + vst3_lane_u8(d + 0, o1, 6); + d += p; + vst3_lane_u8(d - 3, o0, 7); + vst3_lane_u8(d + 0, o1, 7); + d += p - 8; + } + } else { + uint8x8x4_t o; + d += 6; + o.val[0] = op1; + o.val[1] = op0; + o.val[2] = oq0; + o.val[3] = oq1; + vst4_lane_u8(d, o, 0); + d += p; + vst4_lane_u8(d, o, 1); + d += p; + vst4_lane_u8(d, o, 2); + d += p; + vst4_lane_u8(d, o, 3); + d += p; + vst4_lane_u8(d, o, 4); + d += p; + vst4_lane_u8(d, o, 5); + d += p; + vst4_lane_u8(d, o, 6); + d += p; + vst4_lane_u8(d, o, 7); + d += p - 6; + } + } while (--count); +} + +void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 1); +} + +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 2); +} diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index 9129b5d2d..7741b226c 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -38,11 +38,4 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); } - -void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh); - vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); -} #endif // HAVE_NEON_ASM diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 426abe903..3727b6709 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -101,4 +101,219 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, *a7 = d3.val[1]; } +static INLINE void transpose_u8_16x8( + const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, + const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, + const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1, + uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6, + uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11, + uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) { + // Input: + // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F + // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F + // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F + // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F + // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F + // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F + // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F + uint8x16x2_t b0, b1, b2, b3; + uint16x8x2_t c0, c1, c2, c3; + uint32x4x2_t d0, d1, d2, d3; + + // b0: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + // 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + // b1: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + // 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + // b2: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E + // 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F + // b3: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E + // 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F + b0 = vtrnq_u8(i0, i1); + b1 = vtrnq_u8(i2, i3); + b2 = vtrnq_u8(i4, i5); + b3 = vtrnq_u8(i6, i7); + + // c0: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C + // 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E + // c1: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D + // 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F + // c2: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C + // 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E + // c3: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D + // 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F + c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]), + vreinterpretq_u16_u8(b3.val[0])); + c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]), + vreinterpretq_u16_u8(b3.val[1])); + + // d0: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C + // d1: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A + // 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E + // d2: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D + // d3: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B + // 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F + d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c2.val[0])); + d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c2.val[1])); + d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]), + vreinterpretq_u32_u16(c3.val[0])); + d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]), + vreinterpretq_u32_u16(c3.val[1])); + + // Output: + // o0 : 00 10 20 30 40 50 60 70 + // o1 : 01 11 21 31 41 51 61 71 + // o2 : 02 12 22 32 42 52 62 72 + // o3 : 03 13 23 33 43 53 63 73 + // o4 : 04 14 24 34 44 54 64 74 + // o5 : 05 15 25 35 45 55 65 75 + // o6 : 06 16 26 36 46 56 66 76 + // o7 : 07 17 27 37 47 57 67 77 + // o8 : 08 18 28 38 48 58 68 78 + // o9 : 09 19 29 39 49 59 69 79 + // o10: 0A 1A 2A 3A 4A 5A 6A 7A + // o11: 0B 1B 2B 3B 4B 5B 6B 7B + // o12: 0C 1C 2C 3C 4C 5C 6C 7C + // o13: 0D 1D 2D 3D 4D 5D 6D 7D + // o14: 0E 1E 2E 3E 4E 5E 6E 7E + // o15: 0F 1F 2F 3F 4F 5F 6F 7F + *o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0])); + *o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0])); + *o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0])); + *o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0])); + *o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1])); + *o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1])); + *o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1])); + *o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1])); + *o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0])); + *o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0])); + *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0])); + *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0])); + *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1])); + *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1])); + *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1])); + *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1])); +} + +static INLINE void transpose_u8_8x16( + const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2, + const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5, + const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8, + const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11, + const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14, + const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2, + uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6, + uint8x16_t *o7) { + // Input: + // i0 : 00 01 02 03 04 05 06 07 + // i1 : 10 11 12 13 14 15 16 17 + // i2 : 20 21 22 23 24 25 26 27 + // i3 : 30 31 32 33 34 35 36 37 + // i4 : 40 41 42 43 44 45 46 47 + // i5 : 50 51 52 53 54 55 56 57 + // i6 : 60 61 62 63 64 65 66 67 + // i7 : 70 71 72 73 74 75 76 77 + // i8 : 80 81 82 83 84 85 86 87 + // i9 : 90 91 92 93 94 95 96 97 + // i10: A0 A1 A2 A3 A4 A5 A6 A7 + // i11: B0 B1 B2 B3 B4 B5 B6 B7 + // i12: C0 C1 C2 C3 C4 C5 C6 C7 + // i13: D0 D1 D2 D3 D4 D5 D6 D7 + // i14: E0 E1 E2 E3 E4 E5 E6 E7 + // i15: F0 F1 F2 F3 F4 F5 F6 F7 + uint8x16x2_t b0, b1, b2, b3; + uint16x8x2_t c0, c1, c2, c3; + uint32x4x2_t d0, d1, d2, d3; + + // b0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87 + // 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97 + // b1: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7 + // 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7 + // b2: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7 + // 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7 + // b3: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7 + // 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7 + b0.val[0] = vcombine_u8(i0, i8); + b0.val[1] = vcombine_u8(i1, i9); + b1.val[0] = vcombine_u8(i2, i10); + b1.val[1] = vcombine_u8(i3, i11); + b2.val[0] = vcombine_u8(i4, i12); + b2.val[1] = vcombine_u8(i5, i13); + b3.val[0] = vcombine_u8(i6, i14); + b3.val[1] = vcombine_u8(i7, i15); + + // b0: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96 + // 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97 + // b1: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6 + // 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7 + // b2: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6 + // 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7 + // b3: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6 + // 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7 + b0 = vtrnq_u8(b0.val[0], b0.val[1]); + b1 = vtrnq_u8(b1.val[0], b1.val[1]); + b2 = vtrnq_u8(b2.val[0], b2.val[1]); + b3 = vtrnq_u8(b3.val[0], b3.val[1]); + + // c0: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4 + // 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6 + // c1: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5 + // 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7 + // c2: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4 + // 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6 + // c3: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5 + // 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7 + c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]), + vreinterpretq_u16_u8(b3.val[0])); + c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]), + vreinterpretq_u16_u8(b3.val[1])); + + // d0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + // 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4 + // d1: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + // 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6 + // d2: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1 + // 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + // d3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3 + // 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c2.val[0])); + d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c2.val[1])); + d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]), + vreinterpretq_u32_u16(c3.val[0])); + d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]), + vreinterpretq_u32_u16(c3.val[1])); + + // Output: + // o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + // o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1 + // o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + // o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3 + // o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4 + // o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + // o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6 + // o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + *o0 = vreinterpretq_u8_u32(d0.val[0]); + *o1 = vreinterpretq_u8_u32(d2.val[0]); + *o2 = vreinterpretq_u8_u32(d1.val[0]); + *o3 = vreinterpretq_u8_u32(d3.val[0]); + *o4 = vreinterpretq_u8_u32(d0.val[1]); + *o5 = vreinterpretq_u8_u32(d2.val[1]); + *o6 = vreinterpretq_u8_u32(d1.val[1]); + *o7 = vreinterpretq_u8_u32(d3.val[1]); +} + #endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 40f02b46d..2f7eff8b0 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -30,7 +30,7 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) { } #endif -// should we apply any filter at all: 11111111 yes, 00000000 no +// Should we apply any filter at all: 11111111 yes, 00000000 no static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { @@ -68,7 +68,7 @@ static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3, return ~mask; } -// is there high edge variance internal edge: 11111111 yes, 00000000 no +// Is there high edge variance internal edge: 11111111 yes, 00000000 no static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { int8_t hev = 0; diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 21dc95a34..98fda656f 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -144,6 +144,7 @@ DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) else ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/loopfilter_mb_neon.c DSP_SRCS-yes += arm/loopfilter_16_neon.c DSP_SRCS-yes += arm/loopfilter_8_neon.c DSP_SRCS-yes += arm/loopfilter_4_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 5ad154f36..428d5e951 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -505,12 +505,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Loopfilter # add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_vertical_16 sse2 neon_asm dspr2 msa/; -$vpx_lpf_vertical_16_neon_asm=vpx_lpf_vertical_16_neon; +specialize qw/vpx_lpf_vertical_16 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; -$vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon; +specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/; @@ -526,12 +524,10 @@ add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_ specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/; -$vpx_lpf_horizontal_edge_8_neon_asm=vpx_lpf_horizontal_edge_8_neon; +specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/; -$vpx_lpf_horizontal_edge_16_neon_asm=vpx_lpf_horizontal_edge_16_neon; +specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/; |