diff options
-rw-r--r-- | test/lpf_8_test.cc | 13 | ||||
-rw-r--r-- | vp10/common/loopfilter.c | 2 | ||||
-rw-r--r-- | vp9/common/vp9_loopfilter.c | 2 | ||||
-rw-r--r-- | vpx_dsp/arm/loopfilter_8_neon.asm | 16 | ||||
-rw-r--r-- | vpx_dsp/arm/loopfilter_8_neon.c | 8 | ||||
-rw-r--r-- | vpx_dsp/arm/loopfilter_neon.c | 4 | ||||
-rw-r--r-- | vpx_dsp/loopfilter.c | 9 | ||||
-rw-r--r-- | vpx_dsp/mips/loopfilter_8_msa.c | 5 | ||||
-rw-r--r-- | vpx_dsp/mips/loopfilter_filters_dspr2.c | 4 | ||||
-rw-r--r-- | vpx_dsp/mips/loopfilter_mb_dspr2.c | 3 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 | ||||
-rw-r--r-- | vpx_dsp/x86/loopfilter_sse2.c | 6 |
12 files changed, 26 insertions, 48 deletions
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index 5c83f3a1f..394360e2f 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc @@ -520,7 +520,8 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, Loop8Test6Param, ::testing::Values( - make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1), + make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_sse2>, + &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1), make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1), make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2), make_tuple(&wrapper_nc<vpx_lpf_vertical_8_sse2>, @@ -604,8 +605,8 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_neon>, &wrapper_nc<vpx_lpf_vertical_16_dual_c>, 8, 1), #endif // HAVE_NEON_ASM - make_tuple(&vpx_lpf_horizontal_8_neon, - &vpx_lpf_horizontal_8_c, 8, 1), + make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_neon>, + &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1), make_tuple(&wrapper_nc<vpx_lpf_vertical_8_neon>, &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1), make_tuple(&vpx_lpf_horizontal_4_neon, @@ -633,7 +634,8 @@ INSTANTIATE_TEST_CASE_P( DSPR2, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_8_dspr2, &vpx_lpf_horizontal_8_c, 8, 1), + make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_dspr2>, + &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1), make_tuple(&vpx_lpf_horizontal_16_dspr2, &vpx_lpf_horizontal_16_c, 8, 1), make_tuple(&vpx_lpf_horizontal_16_dspr2, @@ -665,7 +667,8 @@ INSTANTIATE_TEST_CASE_P( MSA, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1), + make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_msa>, + &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1), make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1), make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2), make_tuple(&wrapper_nc<vpx_lpf_vertical_4_msa>, diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c index 9f55dc248..6bbf191ac 100644 --- a/vp10/common/loopfilter.c +++ b/vp10/common/loopfilter.c @@ -542,7 +542,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } count = 2; } else { - vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index e892f78d0..d5431c2c2 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -542,7 +542,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } count = 2; } else { - vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm index 61cabe8e8..a2f20e15f 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.asm +++ b/vpx_dsp/arm/loopfilter_8_neon.asm @@ -16,35 +16,26 @@ ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. -; TODO(fgalligan): See about removing the count code as this function is only -; called with a count of 1. ; ; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, ; const uint8_t *blimit, ; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; const uint8_t *thresh) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -; sp+4 int count |vpx_lpf_horizontal_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #16] ; load count ldr r2, [sp, #12] ; load thresh add r1, r1, r1 ; double pitch - cmp r12, #0 - beq end_vpx_mblf_h_edge - vld1.8 {d1[]}, [r3] ; duplicate *limit vld1.8 {d2[]}, [r2] ; duplicate *thresh -count_mblf_h_loop sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines add r2, r3, r1, lsr #1 ; set to 3 lines down @@ -69,11 +60,6 @@ count_mblf_h_loop vst1.u8 {d4}, [r2@64], r1 ; store oq1 vst1.u8 {d5}, [r3@64], r1 ; store oq2 - add r0, r0, #8 - subs r12, r12, #1 - bne count_mblf_h_loop - -end_vpx_mblf_h_edge pop {r4-r5, pc} ENDP ; |vpx_lpf_horizontal_8_neon| diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c index 3c005700f..ec3757380 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.c +++ b/vpx_dsp/arm/loopfilter_8_neon.c @@ -268,23 +268,19 @@ void vpx_lpf_horizontal_8_neon( int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { int i; uint8_t *s, *psrc; uint8x8_t dblimit, dlimit, dthresh; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; uint8x8_t d16u8, d17u8, d18u8; - if (count == 0) // end_vpx_mblf_h_edge - return; - dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); psrc = src - (pitch << 2); - for (i = 0; i < count; i++) { + for (i = 0; i < 1; i++) { s = psrc + i * 8; d3u8 = vld1_u8(s); diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index b01944ebb..aa31f2935 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -33,8 +33,8 @@ void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 1604fdbcf..e8092d912 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -188,13 +188,12 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, } void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { + const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -211,8 +210,8 @@ void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, diff --git a/vpx_dsp/mips/loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c index ec3f5dd22..5b22bd002 100644 --- a/vpx_dsp/mips/loopfilter_8_msa.c +++ b/vpx_dsp/mips/loopfilter_8_msa.c @@ -13,8 +13,7 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr) { uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; v16u8 mask, hev, flat, thresh, b_limit, limit; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; @@ -23,8 +22,6 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; v16i8 zero = { 0 }; - (void)count; - /* load vector elements */ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c index 9924982f1..8a24372cb 100644 --- a/vpx_dsp/mips/loopfilter_filters_dspr2.c +++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c @@ -323,8 +323,8 @@ void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, diff --git a/vpx_dsp/mips/loopfilter_mb_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c index 5bbf091c8..dd0545eed 100644 --- a/vpx_dsp/mips/loopfilter_mb_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c @@ -23,8 +23,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { uint32_t mask; uint32_t hev, flat; uint8_t i; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index eeb03b671..3f63a5f62 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -552,7 +552,7 @@ add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/; $vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon; -add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index 086d075fa..e1236dc4d 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -730,7 +730,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh, int count) { + const unsigned char *_thresh) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -745,8 +745,6 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - (void)count; - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), _mm_loadl_epi64((__m128i *)(s + 3 * p))); q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), @@ -1504,7 +1502,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, transpose(src, p, dst, 8, 1); // Loop filtering - vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1); + vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); src[0] = t_dst; dst[0] = s - 4; |