diff options
-rw-r--r-- | test/pp_filter_test.cc | 98 | ||||
-rw-r--r-- | vpx_dsp/arm/deblock_neon.c | 259 | ||||
-rw-r--r-- | vpx_dsp/arm/transpose_neon.h | 140 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 |
5 files changed, 492 insertions, 8 deletions
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 86d858b0a..cf6c23fdd 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -120,6 +120,94 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { vpx_free(flimits); }; +TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { + // Size of the underlying data block that will be filtered. + // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V + // blocks are always a multiple of 8 wide and exactly 8 high. + const int block_width = 136; + const int block_height = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + const int input_width = block_width; + const int input_height = block_height + 4; + const int input_stride = input_width; + const int input_size = input_stride * input_height; + + // Filter extends output block by 8 samples at left and right edges. + const int output_width = block_width + 16; + const int output_height = block_height; + const int output_stride = output_width; + const int output_size = output_stride * output_height; + + uint8_t *const src_image = new uint8_t[input_size]; + ASSERT_TRUE(src_image != NULL); + + // Though the left padding is only 8 bytes, the assembly code tries to + // read 16 bytes before the pointer. + uint8_t *const dst_image = new uint8_t[output_size + 8]; + ASSERT_TRUE(dst_image != NULL); + uint8_t *const dst_image_ref = new uint8_t[output_size + 8]; + ASSERT_TRUE(dst_image_ref != NULL); + + // Pointers to top-left pixel of block in the input and output images. + uint8_t *const src_image_ptr = src_image + (input_stride << 1); + + // The assembly works in increments of 16. The first read may be offset by + // this amount. + uint8_t *const dst_image_ptr = dst_image + 16; + uint8_t *const dst_image_ref_ptr = dst_image_ref + 16; + + // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock + // can have a different filter. 
+ uint8_t *const flimits = + reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); + + ACMRandom rnd; + rnd.Reset(ACMRandom::DeterministicSeed()); + // Initialize pixels in the input: + // block pixels to random values. + // border pixels to value 10. + (void)memset(src_image, 10, input_size); + uint8_t *pixel_ptr = src_image_ptr; + for (int i = 0; i < block_height; ++i) { + for (int j = 0; j < block_width; ++j) { + pixel_ptr[j] = rnd.Rand8(); + } + pixel_ptr += input_stride; + } + + for (int blocks = 0; blocks < block_width; blocks += 8) { + (void)memset(flimits, 0, sizeof(*flimits) * block_width); + + for (int f = 0; f < 255; f++) { + (void)memset(flimits + blocks, f, sizeof(*flimits) * 8); + + (void)memset(dst_image, 0, output_size); + (void)memset(dst_image_ref, 0, output_size); + + vpx_post_proc_down_and_across_mb_row_c( + src_image_ptr, dst_image_ref_ptr, input_stride, output_stride, + block_width, flimits, block_height); + ASM_REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr, + input_stride, output_stride, + block_width, flimits, 16)); + + for (int i = 0; i < block_height; ++i) { + for (int j = 0; j < block_width; ++j) { + ASSERT_EQ(dst_image_ref_ptr[j + i * output_stride], + dst_image_ptr[j + i * output_stride]) + << "at (" << i << ", " << j << ")"; + } + } + } + } + + delete[] src_image; + delete[] dst_image; + delete[] dst_image_ref; + vpx_free(flimits); +} + class VpxMbPostProcAcrossIpTest : public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> { public: @@ -497,7 +585,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcAcrossIpTest, INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest, ::testing::Values(vpx_mbpost_proc_down_sse2)); -#endif +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon)); +#endif // HAVE_NEON #if HAVE_MSA INSTANTIATE_TEST_CASE_P( @@ -509,6 +603,6 @@ INSTANTIATE_TEST_CASE_P(MSA, 
VpxMbPostProcAcrossIpTest, INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest, ::testing::Values(vpx_mbpost_proc_down_msa)); -#endif +#endif // HAVE_MSA } // namespace diff --git a/vpx_dsp/arm/deblock_neon.c b/vpx_dsp/arm/deblock_neon.c new file mode 100644 index 000000000..8fde332d3 --- /dev/null +++ b/vpx_dsp/arm/deblock_neon.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2) { + const uint8x8_t k1 = vrhadd_u8(a2, a1); + const uint8x8_t k2 = vrhadd_u8(b2, b1); + const uint8x8_t k3 = vrhadd_u8(k1, k2); + return vrhadd_u8(k3, v0); +} + +static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t a2_v0 = vabd_u8(a2, v0); + const uint8x8_t a1_v0 = vabd_u8(a1, v0); + const uint8x8_t b1_v0 = vabd_u8(b1, v0); + const uint8x8_t b2_v0 = vabd_u8(b2, v0); + + uint8x8_t max = vmax_u8(a2_v0, a1_v0); + max = vmax_u8(b1_v0, max); + max = vmax_u8(b2_v0, max); + return vclt_u8(max, filter); +} + +static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2); + const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter); + + return vbsl_u8(mask, 
k_out, v0); +} + +// Same functions but for uint8x16_t. +static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2) { + const uint8x16_t k1 = vrhaddq_u8(a2, a1); + const uint8x16_t k2 = vrhaddq_u8(b2, b1); + const uint8x16_t k3 = vrhaddq_u8(k1, k2); + return vrhaddq_u8(k3, v0); +} + +static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, const uint8x16_t filter) { + const uint8x16_t a2_v0 = vabdq_u8(a2, v0); + const uint8x16_t a1_v0 = vabdq_u8(a1, v0); + const uint8x16_t b1_v0 = vabdq_u8(b1, v0); + const uint8x16_t b2_v0 = vabdq_u8(b2, v0); + + uint8x16_t max = vmaxq_u8(a2_v0, a1_v0); + max = vmaxq_u8(b1_v0, max); + max = vmaxq_u8(b2_v0, max); + return vcltq_u8(max, filter); +} + +static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, + const uint8x16_t filter) { + const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2); + const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter); + + return vbslq_u8(mask, k_out, v0); +} + +void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int dst_stride, int cols, + uint8_t *f, int size) { + uint8_t *src, *dst; + int row; + int col; + + // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for + // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8 + // (for U/V). + assert((size == 8 || size == 16) && cols % 8 == 0); + + // While columns of length 16 can be processed, load them. 
+ for (col = 0; col < cols - 8; col += 16) { + uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1q_u8(src); + src += src_stride; + a1 = vld1q_u8(src); + src += src_stride; + a2 = vld1q_u8(src); + src += src_stride; + a3 = vld1q_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x16_t filterq = vld1q_u8(f + col); + + a4 = vld1q_u8(src); + src += src_stride; + a5 = vld1q_u8(src); + src += src_stride; + a6 = vld1q_u8(src); + src += src_stride; + a7 = vld1q_u8(src); + src += src_stride; + + v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq); + v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq); + v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq); + v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq); + + vst1q_u8(dst, v_out_0); + dst += dst_stride; + vst1q_u8(dst, v_out_1); + dst += dst_stride; + vst1q_u8(dst, v_out_2); + dst += dst_stride; + vst1q_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + src_ptr += 16; + dst_ptr += 16; + } + + // Clean up any left over column of length 8. 
+ if (col != cols) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1_u8(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = vld1_u8(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x8_t filter = vld1_u8(f + col); + + a4 = vld1_u8(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = vld1_u8(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + + v_out_0 = generate_output(a0, a1, a2, a3, a4, filter); + v_out_1 = generate_output(a1, a2, a3, a4, a5, filter); + v_out_2 = generate_output(a2, a3, a4, a5, a6, filter); + v_out_3 = generate_output(a3, a4, a5, a6, a7, filter); + + vst1_u8(dst, v_out_0); + dst += dst_stride; + vst1_u8(dst, v_out_1); + dst += dst_stride; + vst1_u8(dst, v_out_2); + dst += dst_stride; + vst1_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + // Not strictly necessary but makes resetting dst_ptr easier. + dst_ptr += 8; + } + + dst_ptr -= cols; + + for (row = 0; row < size; row += 8) { + uint8x8_t a0, a1, a2, a3; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + + src = dst_ptr; + dst = dst_ptr; + + // Load 8 values, transpose 4 of them, and discard 2 because they will be + // reloaded later. + load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3); + a3 = a1; + a2 = a1 = a0; // Extend left border. + + src += 2; + + for (col = 0; col < cols; col += 8) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6, + v_out_7; + // Although the filter is meant to be applied vertically and is instead + // being applied horizontally here it's OK because it's set in blocks of 8 + // (or 16). 
+ const uint8x8_t filter = vld1_u8(f + col); + + load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5, + &b6, &b7); + + if (col + 8 == cols) { + // Last row. Extend border (b5). + b6 = b7 = b5; + } + + v_out_0 = generate_output(a0, a1, a2, a3, b0, filter); + v_out_1 = generate_output(a1, a2, a3, b0, b1, filter); + v_out_2 = generate_output(a2, a3, b0, b1, b2, filter); + v_out_3 = generate_output(a3, b0, b1, b2, b3, filter); + v_out_4 = generate_output(b0, b1, b2, b3, b4, filter); + v_out_5 = generate_output(b1, b2, b3, b4, b5, filter); + v_out_6 = generate_output(b2, b3, b4, b5, b6, filter); + v_out_7 = generate_output(b3, b4, b5, b6, b7, filter); + + transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2, + v_out_3, v_out_4, v_out_5, v_out_6, v_out_7); + + a0 = b4; + a1 = b5; + a2 = b6; + a3 = b7; + + src += 8; + dst += 8; + } + + dst_ptr += 8 * dst_stride; + } +} diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index fb28c6ea5..e5fe55c1b 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -179,6 +179,62 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { *a1 = d0.val[1]; } +static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, const uint8x8_t a4, + const uint8x8_t a5, const uint8x8_t a6, + const uint8x8_t a7) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; + *a2 = d1.val[0]; + *a3 = d1.val[1]; +} + static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, int32x4_t *a2, int32x4_t *a3) { // Swap 32 bit elements. 
Goes from: @@ -936,11 +992,85 @@ static INLINE void transpose_u8_16x16( *o15 = e7.val[1]; } -static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride, - int16x8_t *a0, int16x8_t *a1, - int16x8_t *a2, int16x8_t *a3, - int16x8_t *a4, int16x8_t *a5, - int16x8_t *a6, int16x8_t *a7) { +static INLINE void load_and_transpose_u8_4x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + uint8x8_t a4, a5, a6, a7; + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + a4 = vld1_u8(a); + a += a_stride; + a5 = vld1_u8(a); + a += a_stride; + a6 = vld1_u8(a); + a += a_stride; + a7 = vld1_u8(a); + + transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void load_and_transpose_u8_8x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, uint8x8_t *a4, + uint8x8_t *a5, uint8x8_t *a6, + uint8x8_t *a7) { + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + *a4 = vld1_u8(a); + a += a_stride; + *a5 = vld1_u8(a); + a += a_stride; + *a6 = vld1_u8(a); + a += a_stride; + *a7 = vld1_u8(a); + + transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride, + uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x8_t a4, uint8x8_t a5, + uint8x8_t a6, uint8x8_t a7) { + transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1_u8(a, a0); + a += a_stride; + vst1_u8(a, a1); + a += a_stride; + vst1_u8(a, a2); + a += a_stride; + vst1_u8(a, a3); + a += a_stride; + vst1_u8(a, a4); + a += a_stride; + vst1_u8(a, a5); + a += a_stride; + vst1_u8(a, a6); + a += a_stride; + vst1_u8(a, a7); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, + const int a_stride, int16x8_t 
*a0, + int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, + int16x8_t *a5, int16x8_t *a6, + int16x8_t *a7) { *a0 = vld1q_s16(a); a += a_stride; *a1 = vld1q_s16(a); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 987ba7077..06c7f9e6e 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -57,6 +57,7 @@ DSP_SRCS-yes += deblock.c DSP_SRCS-yes += postproc.h DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c +DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm endif # CONFIG_POSTPROC diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 5c05a30fc..d3c07002d 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1756,7 +1756,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") specialize qw/vpx_mbpost_proc_across_ip sse2 msa/; add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; - specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/; + specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/; } |