From 4dca92345467fea3c4461152f1847352271ec883 Mon Sep 17 00:00:00 2001
From: Johann
Date: Wed, 21 Dec 2016 14:19:25 -0800
Subject: postproc: vpx_mbpost_proc_across_ip_neon

The speedup is pretty poor. I would be concerned, except the existing
SSE2 version does even worse:

Existing SSE2 improvement: 22%
New neon improvement:      35%

BUG=webm:1320

Change-Id: Ied598a261134aa6cbe69f96f58589d2bae17bf62
---
 vpx_dsp/arm/deblock_neon.c   | 131 +++++++++++++++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl |   2 +-
 2 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/vpx_dsp/arm/deblock_neon.c b/vpx_dsp/arm/deblock_neon.c
index 8fde332d3..ed1a4df25 100644
--- a/vpx_dsp/arm/deblock_neon.c
+++ b/vpx_dsp/arm/deblock_neon.c
@@ -257,3 +257,134 @@ void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
     dst_ptr += 8 * dst_stride;
   }
 }
+
+// sum += x;
+// sumsq += x * y;
+static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy,
+                                 int16x4_t *const sum, int32x4_t *const sumsq) {
+  const int16x4_t zero = vdup_n_s16(0);
+  const int32x4_t zeroq = vdupq_n_s32(0);
+
+  // Add in the first set because vext doesn't work with '0'.
+  *sum = vadd_s16(*sum, x);
+  *sumsq = vaddq_s32(*sumsq, xy);
+
+  // Shift x and xy to the right and sum. vext requires an immediate.
+  *sum = vadd_s16(*sum, vext_s16(zero, x, 1));
+  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1));
+
+  *sum = vadd_s16(*sum, vext_s16(zero, x, 2));
+  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2));
+
+  *sum = vadd_s16(*sum, vext_s16(zero, x, 3));
+  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3));
+}
+
+// Generate mask based on (sumsq * 15 - sum * sum < flimit)
+static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq,
+                                 const int32x4_t f, const int32x4_t fifteen) {
+  const int32x4_t a = vmulq_s32(sumsq, fifteen);
+  const int32x4_t b = vmlsl_s16(a, sum, sum);
+  const uint32x4_t mask32 = vcltq_s32(b, f);
+  return vmovn_u32(mask32);
+}
+
+static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high,
+                              const int32x4_t sumsq_low,
+                              const int32x4_t sumsq_high, const int32x4_t f) {
+  const int32x4_t fifteen = vdupq_n_s32(15);
+  const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen);
+  const uint16x4_t mask16_high =
+      calculate_mask(sum_high, sumsq_high, f, fifteen);
+  return vmovn_u16(vcombine_u16(mask16_low, mask16_high));
+}
+
+// Apply filter of (8 + sum + s[c]) >> 4.
+static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) {
+  const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
+  const int16x8_t sum_s = vaddq_s16(sum, s16);
+
+  return vqrshrun_n_s16(sum_s, 4);
+}
+
+void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,
+                                    int flimit) {
+  int row, col;
+  const int32x4_t f = vdupq_n_s32(flimit);
+
+  assert(cols % 8 == 0);
+
+  for (row = 0; row < rows; ++row) {
+    // Sum 9 copies of src[0]: the 8 extended left-border pixels plus
+    // src[0] itself. sumsq gets primed with +16.
+    int sumsq = src[0] * src[0] * 9 + 16;
+    int sum = src[0] * 9;
+
+    uint8x8_t left_context, s, right_context;
+    int16x4_t sum_low, sum_high;
+    int32x4_t sumsq_low, sumsq_high;
+
+    // Sum (+square) the next 6 elements.
+    // Skip [0] because it's included above.
+    for (col = 1; col <= 6; ++col) {
+      sumsq += src[col] * src[col];
+      sum += src[col];
+    }
+
+    // Prime the sums. Later the loop uses the _high values to prime the new
+    // vectors.
+    sumsq_high = vdupq_n_s32(sumsq);
+    sum_high = vdup_n_s16(sum);
+
+    // Manually extend the left border.
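+    // Each of the 8 virtual pixels left of the row is treated as a copy of
+    // src[0], matching the C version's border extension.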
+    left_context = vdup_n_u8(src[0]);
+
+    for (col = 0; col < cols; col += 8) {
+      uint8x8_t mask, output;
+      int16x8_t x, y;
+      int32x4_t xy_low, xy_high;
+
+      s = vld1_u8(src + col);
+
+      if (col + 8 == cols) {
+        // Last 8 columns in the row. Extend the right border.
+        right_context = vdup_n_u8(src[col + 7]);
+      } else {
+        right_context = vld1_u8(src + col + 7);
+      }
+
+      // x * y = (r - l) * (r + l) = r * r - l * l, the difference of squares
+      // needed to slide the sumsq window by one pixel per lane.
+      x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context));
+      y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context));
+      xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
+      xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
+
+      // Catch up to the last sum'd value.
+      sum_low = vdup_lane_s16(sum_high, 3);
+      sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1);
+
+      accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low);
+
+      // Need to do this sequentially because we need the final running value
+      // from sum_low.
+      sum_high = vdup_lane_s16(sum_low, 3);
+      sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1);
+
+      accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high);
+
+      mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f);
+
+      output = filter_pixels(vcombine_s16(sum_low, sum_high), s);
+      output = vbsl_u8(mask, output, s);
+
+      vst1_u8(src + col, output);
+
+      left_context = s;
+    }
+
+    src += pitch;
+  }
+}
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d3c07002d..0574000c5 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1753,7 +1753,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
 specialize qw/vpx_mbpost_proc_down sse2 msa/;
 
 add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
-specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
+specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/;
 
 add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
 specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;
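
For reference, the per-row scalar logic that the NEON code vectorizes looks
roughly like the sketch below. This is not the actual
vpx_mbpost_proc_across_ip_c implementation: mbpost_across_row_sketch and its
tmp parameter are illustrative names, and the borders are handled here by
clamping reads plus a temporary row rather than the reference's explicit
border extension and delayed in-place writes.

#include <stdint.h>
#include <string.h>

// Scalar sketch of one row of the filter (illustrative; not the C reference).
static void mbpost_across_row_sketch(uint8_t *s, int cols, int flimit,
                                     uint8_t *tmp /* cols bytes */) {
  int sum = 0;
  int sumsq = 16;  // Primed with +16, as in the NEON code above.
  int i, c;

  // Prime the accumulators over s[-8 .. 6], with reads clamped to the row.
  for (i = -8; i <= 6; ++i) {
    const int v = s[i < 0 ? 0 : i];
    sum += v;
    sumsq += v * v;
  }

  for (c = 0; c < cols; ++c) {
    // Slide the 15-tap window centered on c: add s[c + 7], drop s[c - 8].
    const int r = s[c + 7 > cols - 1 ? cols - 1 : c + 7];
    const int l = s[c - 8 < 0 ? 0 : c - 8];
    sum += r - l;
    sumsq += r * r - l * l;

    // Smooth only low-variance pixels: sumsq * 15 - sum * sum < flimit.
    tmp[c] = (sumsq * 15 - sum * sum < flimit) ? (8 + sum + s[c]) >> 4 : s[c];
  }

  // Write back after the whole row so the window always reads unfiltered
  // values, mirroring the reference's delayed in-place writes.
  memcpy(s, tmp, cols);
}

The NEON version performs the same sliding-window update on eight pixels at a
time: (r - l) * (r + l) = r * r - l * l supplies the per-lane sumsq deltas,
and the vext ladder in accumulate_sum_sumsq turns those deltas into running
sums.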