diff options
author | JackyChen <jackychen@google.com> | 2016-05-01 16:15:29 -0700 |
---|---|---|
committer | JackyChen <jackychen@google.com> | 2016-05-02 13:19:40 -0700 |
commit | b29612fbbea5d55726003a3a133d7b66bbb7226b (patch) | |
tree | 263e62dac3d255708260a2c43289c8fd63baa7f3 /vp9/encoder | |
parent | 94806e029d774afb002e80a36007725fd29d40b2 (diff) | |
download | libvpx-b29612fbbea5d55726003a3a133d7b66bbb7226b.tar libvpx-b29612fbbea5d55726003a3a133d7b66bbb7226b.tar.gz libvpx-b29612fbbea5d55726003a3a133d7b66bbb7226b.tar.bz2 libvpx-b29612fbbea5d55726003a3a133d7b66bbb7226b.zip |
vp9: Refactor vp9_denoiser_NxM_sse2.
The denoiser is ~1.5% faster at speeds 6-8.
Change-Id: I7b350f3c50cce6773d9c4eded4c0c1b722d0a5fc
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/x86/vp9_denoiser_sse2.c | 68 |
1 file changed, 32 insertions, 36 deletions
diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c index f4a149d9e..883507af3 100644 --- a/vp9/encoder/x86/vp9_denoiser_sse2.c +++ b/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -147,8 +147,9 @@ static int vp9_denoiser_NxM_sse2_small( const __m128i l32 = _mm_set1_epi8(2); // Difference between level 2 and level 1 is 1. const __m128i l21 = _mm_set1_epi8(1); + const int b_height = (4 << b_height_log2_lookup[bs]) >> 1; - for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) { + for (r = 0; r < b_height; ++r) { memcpy(sig_buffer[r], sig, width); memcpy(sig_buffer[r] + width, sig + sig_stride, width); memcpy(mc_running_buffer[r], mc_running_avg_y, width); @@ -188,8 +189,8 @@ static int vp9_denoiser_NxM_sse2_small( // Only apply the adjustment for max delta up to 3. if (delta < 4) { const __m128i k_delta = _mm_set1_epi8(delta); - running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]); - for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) { + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { acc_diff = vp9_denoiser_adj_16x1_sse2( sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0, k_delta, acc_diff); @@ -235,38 +236,37 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, const __m128i l32 = _mm_set1_epi8(2); // Difference between level 2 and level 1 is 1. 
const __m128i l21 = _mm_set1_epi8(1); + const int b_width = (4 << b_width_log2_lookup[bs]); + const int b_height = (4 << b_height_log2_lookup[bs]); + const int b_width_shift4 = b_width >> 4; - for (c = 0; c < 4; ++c) { - for (r = 0; r < 4; ++r) { + for (r = 0; r < 4; ++r) { + for (c = 0; c < b_width_shift4; ++c) { acc_diff[c][r] = _mm_setzero_si128(); } } - for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { - for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { - acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2( + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r>>4] = vp9_denoiser_16x1_sse2( sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, - &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]); + &k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r>>4]); // Update pointers for next iteration. sig += 16; mc_running_avg_y += 16; running_avg_y += 16; } - if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) { - for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { - sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]); + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r>>4]); } } // Update pointers for next iteration. - sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride; - mc_running_avg_y = mc_running_avg_y - - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + - mc_avg_y_stride; - running_avg_y = running_avg_y - - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + - avg_y_stride; + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; } { @@ -278,33 +278,29 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, // Only apply the adjustment for max delta up to 3. 
if (delta < 4) { const __m128i k_delta = _mm_set1_epi8(delta); - sig -= sig_stride * (4 << b_height_log2_lookup[bs]); - mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]); - running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; sum_diff = 0; - for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { - for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { - acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2( + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r>>4] = vp9_denoiser_adj_16x1_sse2( sig, mc_running_avg_y, running_avg_y, k_0, - k_delta, acc_diff[c>>4][r>>4]); + k_delta, acc_diff[c][r>>4]); // Update pointers for next iteration. sig += 16; mc_running_avg_y += 16; running_avg_y += 16; } - if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) { - for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { - sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]); + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r>>4]); } } - sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride; - mc_running_avg_y = mc_running_avg_y - - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + - mc_avg_y_stride; - running_avg_y = running_avg_y - - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + - avg_y_stride; + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; } if (abs(sum_diff) > sum_diff_thresh) { return COPY_BLOCK; |