diff options
author | Scott LaVarnway <slavarnway@google.com> | 2014-05-28 13:32:52 -0700 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2014-05-28 13:32:52 -0700 |
commit | 4d9b9fa5082687300122a814bb039f9e3aa65341 (patch) | |
tree | 9c600275964b834cca60825181c16f7169719c51 /vp8/encoder/arm | |
parent | 39b9731876e57aa68335cef6bf5d687a49bee6d9 (diff) | |
download | libvpx-4d9b9fa5082687300122a814bb039f9e3aa65341.tar libvpx-4d9b9fa5082687300122a814bb039f9e3aa65341.tar.gz libvpx-4d9b9fa5082687300122a814bb039f9e3aa65341.tar.bz2 libvpx-4d9b9fa5082687300122a814bb039f9e3aa65341.zip |
Neon match to vp8 temporal denoiser fix
The NEON implementation now matches the "C" version of "Fix to reduce block
artifacts from vp8 temporal denoiser."
(see change id Id9b56e59e33f3c22e79d2f89f763bdde246fdf3f)
Change-Id: I99e569bb6af4ae3532621127e12bf917a48ba08e
Diffstat (limited to 'vp8/encoder/arm')
-rw-r--r-- | vp8/encoder/arm/neon/denoising_neon.c | 87 |
1 files changed, 82 insertions, 5 deletions
diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c index b8e403419..78cc6fa37 100644 --- a/vp8/encoder/arm/neon/denoising_neon.c +++ b/vp8/encoder/arm/neon/denoising_neon.c @@ -68,8 +68,8 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int64x2_t v_sum_diff_total = vdupq_n_s64(0); /* Go over lines. */ - int i; - for (i = 0; i < 16; ++i) { + int r; + for (r = 0; r < 16; ++r) { /* Load inputs. */ const uint8x16_t v_sig = vld1q_u8(sig); const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); @@ -145,14 +145,91 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, /* Too much adjustments => copy block. */ { - const int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), vget_low_s64(v_sum_diff_total)); - const int s0 = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); int sum_diff_thresh = SUM_DIFF_THRESHOLD; if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; - if (s0 > sum_diff_thresh) + if (sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. 
+ if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + sig -= sig_stride * 16; + mc_running_avg_y -= mc_running_avg_y_stride * 16; + running_avg_y -= running_avg_y_stride * 16; + for (r = 0; r < 16; ++r) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, + v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, + v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, + v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = + vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = + vpaddlq_s8(v_sum_diff); + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, + fedcba98_76543210); + } + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_running_avg_y_stride; + running_avg_y += running_avg_y_stride; + } + { + // Update the sum of all pixel differences of this MB. 
+ x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } + } else { return COPY_BLOCK; + } + } } /* Tell above level that block was filtered. */ |