diff options
-rw-r--r-- | test/test.mk | 5 | ||||
-rw-r--r-- | test/vp9_denoiser_sse2_test.cc | 102 | ||||
-rw-r--r-- | vp9/common/vp9_common_data.c | 1 | ||||
-rw-r--r-- | vp9/common/vp9_debugmodes.c | 4 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 8 | ||||
-rw-r--r-- | vp9/decoder/vp9_detokenize.c | 4 | ||||
-rw-r--r-- | vp9/decoder/vp9_dthread.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.c | 55 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.h | 4 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 4 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 20 | ||||
-rw-r--r-- | vp9/encoder/vp9_temporal_filter.c | 95 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_denoiser_sse2.c | 474 | ||||
-rw-r--r-- | vp9/vp9cx.mk | 4 |
14 files changed, 704 insertions, 88 deletions
diff --git a/test/test.mk b/test/test.mk index df4969f5c..bdde5064a 100644 --- a/test/test.mk +++ b/test/test.mk @@ -135,13 +135,16 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc endif +ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) +LIBVPX_TEST_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += vp9_denoiser_sse2_test.cc +endif + endif # VP9 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc endif # CONFIG_SHARED - ## ## TEST DATA ## diff --git a/test/vp9_denoiser_sse2_test.cc b/test/vp9_denoiser_sse2_test.cc new file mode 100644 index 000000000..edebc83e8 --- /dev/null +++ b/test/vp9_denoiser_sse2_test.cc @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" + +using libvpx_test::ACMRandom; + +namespace { + +const int kNumPixels = 64 * 64; +class VP9DenoiserTest + : public ::testing::TestWithParam<int> { + public: + virtual ~VP9DenoiserTest() {} + + virtual void SetUp() { + bs = (BLOCK_SIZE)GetParam(); + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + BLOCK_SIZE bs; +}; + +TEST_P(VP9DenoiserTest, BitexactCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 4000; + + // Allocate the space for input and output, + // where sig_block is the block to be denoised, + // mc_avg_block is the denoised reference block, + // avg_block_c is the denoised result from C code, + // avg_block_sse2 is the denoised result from SSE2 code. + DECLARE_ALIGNED_ARRAY(16, uint8_t, sig_block, kNumPixels); + DECLARE_ALIGNED_ARRAY(16, uint8_t, mc_avg_block, kNumPixels); + DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_c, kNumPixels); + DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_sse2, kNumPixels); + + for (int i = 0; i < count_test_block; ++i) { + // Generate random motion magnitude, 20% of which exceed the threshold. + uint8_t motion_magnitude_random + = rnd.Rand8() % (uint8_t)(MOTION_MAGNITUDE_THRESHOLD * 1.2); + + // Initialize a test block with random number in range [0, 255]. + for (int j = 0; j < kNumPixels; ++j) { + int temp = 0; + sig_block[j] = rnd.Rand8(); + // The pixels in mc_avg_block are generated by adding a random + // number in range [-19, 19] to corresponding pixels in sig_block. 
+ temp = sig_block[j] + (rnd.Rand8() % 2 == 0? -1 : 1) * + (rnd.Rand8()%20); + // Clip. + mc_avg_block[j] = (temp < 0? 0 : (temp > 255? 255 : temp)); + } + + ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_c(sig_block, 64, + mc_avg_block, 64, avg_block_c, 64, + 0, bs, motion_magnitude_random)); + + ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2(sig_block, 64, + mc_avg_block, 64, avg_block_sse2, 64, + 0, bs, motion_magnitude_random)); + + // Test bitexactness. + for (int h = 0; h < (4 << b_height_log2_lookup[bs]); ++h) { + for (int w = 0; w < (4 << b_width_log2_lookup[bs]); ++w) { + EXPECT_EQ(avg_block_c[h * 64 + w], avg_block_sse2[h * 64 + w]); + } + } + } +} + +// Test for all block size. +INSTANTIATE_TEST_CASE_P( + SSE2, VP9DenoiserTest, + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, + BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, + BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, + BLOCK_64X64)); +} // namespace diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c index d4c1b7124..8685c0f8e 100644 --- a/vp9/common/vp9_common_data.c +++ b/vp9/common/vp9_common_data.c @@ -36,7 +36,6 @@ const int size_group_lookup[BLOCK_SIZES] = const int num_pels_log2_lookup[BLOCK_SIZES] = {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12}; - const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = { { // 4X4 // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c index 3f1684104..d9dace6ac 100644 --- a/vp9/common/vp9_debugmodes.c +++ b/vp9/common/vp9_debugmodes.c @@ -27,7 +27,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, int mi_row, mi_col; int mi_index = 0; // TODO(hkuang): Fix this debug function. 
- MODE_INFO **mi = NULL; + MODE_INFO **mi = &cm->mi; int rows = cm->mi_rows; int cols = cm->mi_cols; char prefix = descriptor[0]; @@ -53,7 +53,7 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) { int mi_index = 0; FILE *mvs = fopen(file, "a"); // TODO(hkuang): Fix this debug function. - MODE_INFO **mi = NULL; + MODE_INFO **mi = &cm->mi; int rows = cm->mi_rows; int cols = cm->mi_cols; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 1d9a20670..27ccf03e7 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1115,6 +1115,14 @@ specialize qw/vp9_get_mb_ss/, "$sse2_x86inc"; add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/vp9_subtract_block neon/, "$sse2_x86inc"; +# +# Denoiser +# +if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { + add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude"; + specialize qw/vp9_denoiser_filter sse2/; +} + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # the transform coefficients are held in 32-bit # values, so the assembler code for vp9_block_error can no longer be used. diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 577874882..421229a28 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -190,7 +190,11 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, } } v = (val * dqv) >> dq_shift; +#if CONFIG_COEFFICIENT_RANGE_CHECKING + dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v); +#else dqcoeff[scan[c]] = vp9_read_bit(r) ? 
-v : v; +#endif token_cache[scan[c]] = vp9_pt_energy_class[token]; ++c; ctx = get_coef_context(nb, token_cache, c); diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index 62ea6c14d..69e4fde85 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -223,14 +223,18 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->mutex_, vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); - for (i = 0; i < rows; ++i) { - pthread_mutex_init(&lf_sync->mutex_[i], NULL); + if (lf_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[i], NULL); + } } CHECK_MEM_ERROR(cm, lf_sync->cond_, vpx_malloc(sizeof(*lf_sync->cond_) * rows)); - for (i = 0; i < rows; ++i) { - pthread_cond_init(&lf_sync->cond_[i], NULL); + if (lf_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond_[i], NULL); + } } } #endif // CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 11cb27f43..4deeed217 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -31,9 +31,6 @@ static void make_grayscale(YV12_BUFFER_CONFIG *yuv); #endif -static const int widths[] = {4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64}; -static const int heights[] = {4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64}; - static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) { (void)bs; return 3 + (increase_denoising ? 1 : 0); @@ -52,7 +49,9 @@ static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) { } static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { - return widths[bs] * heights[bs] * (increase_denoising ? 60 : 40); + return (4 << b_width_log2_lookup[bs]) * + (4 << b_height_log2_lookup[bs]) * + (increase_denoising ? 
60 : 40); } static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, @@ -61,25 +60,31 @@ static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, noise_motion_thresh(bs, increase_denoising)) { return 0; } else { - return widths[bs] * heights[bs] * 20; + return (4 << b_width_log2_lookup[bs]) * + (4 << b_height_log2_lookup[bs]) * 20; } } -static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) { - return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2); +int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) { + return (4 << b_width_log2_lookup[bs]) * + (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2); } static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) { - return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2); + return (4 << b_width_log2_lookup[bs]) * + (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2); } -static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride, - const uint8_t *mc_avg, - int mc_avg_stride, - uint8_t *avg, int avg_stride, - int increase_denoising, - BLOCK_SIZE bs, - int motion_magnitude) { +// TODO(jackychen): If increase_denoising is enabled in the future, +// we might need to update the code for calculating 'total_adj' in +// case the C code is not bit-exact with corresponding sse2 code. +int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, + int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, + BLOCK_SIZE bs, + int motion_magnitude) { int r, c; const uint8_t *sig_start = sig; const uint8_t *mc_avg_start = mc_avg; @@ -102,8 +107,8 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride, } // First attempt to apply a strong temporal denoising filter. 
- for (r = 0; r < heights[bs]; ++r) { - for (c = 0; c < widths[bs]; ++c) { + for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { + for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) { diff = mc_avg[c] - sig[c]; absdiff = abs(diff); @@ -152,8 +157,8 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride, mc_avg = mc_avg_start; avg = avg_start; sig = sig_start; - for (r = 0; r < heights[bs]; ++r) { - for (c = 0; c < widths[bs]; ++c) { + for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { + for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) { diff = mc_avg[c] - sig[c]; adj = abs(diff); if (adj > delta) { @@ -193,8 +198,8 @@ static uint8_t *block_start(uint8_t *framebuf, int stride, static void copy_block(uint8_t *dest, int dest_stride, const uint8_t *src, int src_stride, BLOCK_SIZE bs) { int r; - for (r = 0; r < heights[bs]; ++r) { - vpx_memcpy(dest, src, widths[bs]); + for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { + vpx_memcpy(dest, src, (4 << b_width_log2_lookup[bs])); dest += dest_stride; src += src_stride; } @@ -336,10 +341,10 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, &motion_magnitude); if (decision == FILTER_BLOCK) { - decision = denoiser_filter(src.buf, src.stride, - mc_avg_start, mc_avg.y_stride, - avg_start, avg.y_stride, - 0, bs, motion_magnitude); + decision = vp9_denoiser_filter(src.buf, src.stride, + mc_avg_start, mc_avg.y_stride, + avg_start, avg.y_stride, + 0, bs, motion_magnitude); } if (decision == FILTER_BLOCK) { diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h index fa714b132..421dfcd0c 100644 --- a/vp9/encoder/vp9_denoiser.h +++ b/vp9/encoder/vp9_denoiser.h @@ -55,6 +55,10 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, #endif int border); +#if CONFIG_VP9_TEMPORAL_DENOISING +int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising); +#endif + void vp9_denoiser_free(VP9_DENOISER *denoiser); #ifdef __cplusplus diff --git 
a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 822a45e5e..dbc206ec4 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1224,9 +1224,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->oxcf = *oxcf; #if CONFIG_VP9_HIGHBITDEPTH - if (cpi->oxcf.use_highbitdepth) { - cpi->mb.e_mbd.bd = (int)cm->bit_depth; - } + cpi->mb.e_mbd.bd = (int)cm->bit_depth; #endif // CONFIG_VP9_HIGHBITDEPTH rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index ac1c9d4c2..1af57339a 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -211,6 +211,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, x->bsse[(i << 2) + block_idx] = sse; sum_sse += sse; + x->skip_txfm[(i << 2) + block_idx] = 0; if (!x->select_tx_size) { // Check if all ac coefficients can be quantized to zero. if (var < p->quant_thred[1] >> shift) { @@ -219,8 +220,6 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, // Check if dc coefficient can be quantized to zero. if (sse - var < p->quant_thred[0] >> shift) x->skip_txfm[(i << 2) + block_idx] = 1; - } else { - x->skip_txfm[(i << 2) + block_idx] = 0; } } @@ -232,7 +231,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, // Fast approximate the modelling function. 
if (cpi->oxcf.speed > 4) { int64_t rate; - int64_t square_error = sse; + const int64_t square_error = sum_sse; int quantizer = (pd->dequant[1] >> 3); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -497,7 +496,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, if (tx_size != TX_32X32) dc_correct >>= 2; - args->dist = args->sse - dc_correct; + args->dist = MAX(0, args->sse - dc_correct); } } else { // skip forward transform @@ -1050,6 +1049,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < TX_MODES; i++) tx_cache[i] = INT64_MAX; + vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); /* Y Search for intra prediction mode */ for (mode = DC_PRED; mode <= TM_PRED; mode++) { int64_t local_tx_cache[TX_MODES]; @@ -1149,6 +1149,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int this_rate_tokenonly, this_rate, s; int64_t this_distortion, this_sse; + vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); for (mode = DC_PRED; mode <= TM_PRED; ++mode) { if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode))) continue; @@ -1187,6 +1188,7 @@ static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, int64_t unused; x->e_mbd.mi[0].src_mi->mbmi.uv_mode = DC_PRED; + vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_uvrd(cpi, x, rate_tokenonly, distortion, skippable, &unused, bsize, INT64_MAX); *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED]; @@ -2458,7 +2460,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64); DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64); - uint8_t *tmp_buf = tmp_buf8; + uint8_t *tmp_buf; #else DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -2659,8 +2661,6 @@ static int64_t handle_inter_mode(VP9_COMP 
*cpi, MACROBLOCK *x, best_filter = mbmi->interp_filter; if (cm->interp_filter == SWITCHABLE && i && !intpel_mv) best_needs_copy = !best_needs_copy; - vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); - vpx_memcpy(bsse, x->bsse, sizeof(bsse)); } if ((cm->interp_filter == SWITCHABLE && newbest) || @@ -2668,6 +2668,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, cm->interp_filter == mbmi->interp_filter)) { pred_exists = 1; tmp_rd = best_rd; + + vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(bsse, x->bsse, sizeof(bsse)); } } restore_dst_buf(xd, orig_dst, orig_dst_stride); @@ -3222,6 +3225,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; + + vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, tx_cache, best_rd); @@ -4000,6 +4005,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // then dont bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); + vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu); if (rate_uv == INT_MAX) diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index eeb1ce929..2d594dd09 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -676,61 +676,66 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { frames[frames_to_blur - 1 - frame] = &buf->img; } - // Setup scaling factors. Scaling on each of the arnr frames is not supported - if (is_two_pass_svc(cpi)) { - // In spatial svc the scaling factors might be less then 1/2. So we will use - // non-normative scaling. - int frame_used = 0; + if (frames_to_blur > 0) { + // Setup scaling factors. Scaling on each of the arnr frames is not + // supported. 
+ if (is_two_pass_svc(cpi)) { + // In spatial svc the scaling factors might be less then 1/2. + // So we will use non-normative scaling. + int frame_used = 0; #if CONFIG_VP9_HIGHBITDEPTH - vp9_setup_scale_factors_for_frame(&sf, - get_frame_new_buffer(cm)->y_crop_width, - get_frame_new_buffer(cm)->y_crop_height, - get_frame_new_buffer(cm)->y_crop_width, - get_frame_new_buffer(cm)->y_crop_height, - cm->use_highbitdepth); + vp9_setup_scale_factors_for_frame( + &sf, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + cm->use_highbitdepth); #else - vp9_setup_scale_factors_for_frame(&sf, - get_frame_new_buffer(cm)->y_crop_width, - get_frame_new_buffer(cm)->y_crop_height, - get_frame_new_buffer(cm)->y_crop_width, - get_frame_new_buffer(cm)->y_crop_height); + vp9_setup_scale_factors_for_frame( + &sf, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height); #endif // CONFIG_VP9_HIGHBITDEPTH - for (frame = 0; frame < frames_to_blur; ++frame) { - if (cm->mi_cols * MI_SIZE != frames[frame]->y_width || - cm->mi_rows * MI_SIZE != frames[frame]->y_height) { - if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used], - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, + for (frame = 0; frame < frames_to_blur; ++frame) { + if (cm->mi_cols * MI_SIZE != frames[frame]->y_width || + cm->mi_rows * MI_SIZE != frames[frame]->y_height) { + if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used], + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, + cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, - NULL)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate alt_ref_buffer"); - - frames[frame] = vp9_scale_if_required(cm, 
frames[frame], - &cpi->svc.scaled_frames[frame_used]); - ++frame_used; + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, + NULL)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to reallocate alt_ref_buffer"); + } + frames[frame] = vp9_scale_if_required( + cm, frames[frame], &cpi->svc.scaled_frames[frame_used]); + ++frame_used; + } } - } - } else { - // ARF is produced at the native frame size and resized when coded. + } else { + // ARF is produced at the native frame size and resized when coded. #if CONFIG_VP9_HIGHBITDEPTH - vp9_setup_scale_factors_for_frame(&sf, - frames[0]->y_crop_width, - frames[0]->y_crop_height, - frames[0]->y_crop_width, - frames[0]->y_crop_height, - cm->use_highbitdepth); + vp9_setup_scale_factors_for_frame(&sf, + frames[0]->y_crop_width, + frames[0]->y_crop_height, + frames[0]->y_crop_width, + frames[0]->y_crop_height, + cm->use_highbitdepth); #else - vp9_setup_scale_factors_for_frame(&sf, - frames[0]->y_crop_width, - frames[0]->y_crop_height, - frames[0]->y_crop_width, - frames[0]->y_crop_height); + vp9_setup_scale_factors_for_frame(&sf, + frames[0]->y_crop_width, + frames[0]->y_crop_height, + frames[0]->y_crop_width, + frames[0]->y_crop_height); #endif // CONFIG_VP9_HIGHBITDEPTH + } } temporal_filter_iterate_c(cpi, frames, frames_to_blur, diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c new file mode 100644 index 000000000..bf400d38b --- /dev/null +++ b/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <emmintrin.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" + +#include "vpx_ports/emmintrin_compat.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vpx_mem/vpx_mem.h" + +// Compute the sum of all pixel differences of this MB. +static INLINE int sum_diff_16x1(__m128i acc_diff) { + const __m128i k_1 = _mm_set1_epi16(1); + const __m128i acc_diff_lo = _mm_srai_epi16( + _mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = _mm_srai_epi16( + _mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); + const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); + const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, + _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, + _mm_srli_si128(hgfe_dcba, 4)); + int sum_diff = _mm_cvtsi128_si32(hgfedcba); + return sum_diff; +} + +// Denoise a 16x1 vector. +static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig, + const uint8_t *mc_running_avg_y, + uint8_t *running_avg_y, + const __m128i k_0, + const __m128i k_4, + const __m128i k_8, + const __m128i k_16, + const __m128i l3, + const __m128i l32, + const __m128i l21, + __m128i acc_diff) { + // Calculate differences + const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = _mm_loadu_si128( + (__m128i *)(&mc_running_avg_y[0])); + __m128i v_running_avg_y; + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to 16 to be used to get mask. Doing this + // allows us to use _mm_cmpgt_epi8, which operates on signed byte. 
+ const __m128i clamped_absdiff = _mm_min_epu8( + _mm_or_si128(pdiff, ndiff), k_16); + // Get masks for l2 l1 and l0 adjustments. + const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); + // Get adjustments for l2, l1, and l0. + __m128i adj2 = _mm_and_si128(mask2, l32); + const __m128i adj1 = _mm_and_si128(mask1, l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + + // Combine the adjustments and get absolute adjustments. + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + // Restore the sign and get positive and negative adjustments. + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + // Calculate filtered value. + v_running_avg_y = _mm_adds_epu8(v_sig, padj); + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Adjustments <=7, and each element in acc_diff can fit in signed + // char. + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + return acc_diff; +} + +// Denoise a 16x1 vector with a weaker filter. +static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig, + const uint8_t *mc_running_avg_y, + uint8_t *running_avg_y, + const __m128i k_0, + const __m128i k_delta, + __m128i acc_diff) { + __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); + // Calculate differences. + const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. 
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); + v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + return acc_diff; +} + +static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, + BLOCK_SIZE bs, + int motion_magnitude) { + int sum_diff_thresh; + int r; + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; + unsigned char sig_buffer[2][16], mc_running_buffer[2][16], + running_buffer[2][16]; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? + 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. 
+ const __m128i l21 = _mm_set1_epi8(1); + int sum_diff = 0; + + for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) { + vpx_memcpy(sig_buffer[r], sig, 4); + vpx_memcpy(sig_buffer[r] + 4, sig + sig_stride, 4); + vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride * 2, 4); + vpx_memcpy(sig_buffer[r] + 12, sig + sig_stride * 3, 4); + vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 4); + vpx_memcpy(mc_running_buffer[r] + 4, mc_running_avg_y + + mc_avg_y_stride, 4); + vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y + + mc_avg_y_stride * 2, 4); + vpx_memcpy(mc_running_buffer[r] + 12, mc_running_avg_y + + mc_avg_y_stride * 3, 4); + vpx_memcpy(running_buffer[r], running_avg_y, 4); + vpx_memcpy(running_buffer[r] + 4, running_avg_y + + avg_y_stride, 4); + vpx_memcpy(running_buffer[r] + 8, running_avg_y + + avg_y_stride * 2, 4); + vpx_memcpy(running_buffer[r] + 12, running_avg_y + + avg_y_stride * 3, 4); + acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], + mc_running_buffer[r], + running_buffer[r], + k_0, k_4, k_8, k_16, + l3, l32, l21, acc_diff); + vpx_memcpy(running_avg_y, running_buffer[r], 4); + vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4); + vpx_memcpy(running_avg_y + avg_y_stride * 2, + running_buffer[r] + 8, 4); + vpx_memcpy(running_avg_y + avg_y_stride * 3, + running_buffer[r] + 12, 4); + // Update pointers for next iteration. + sig += (sig_stride << 2); + mc_running_avg_y += (mc_avg_y_stride << 2); + running_avg_y += (avg_y_stride << 2); + } + + { + sum_diff = sum_diff_16x1(acc_diff); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // checK if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. 
The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the accceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((abs(sum_diff) - sum_diff_thresh) + >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]); + sum_diff = 0; + for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) { + acc_diff = vp9_denoiser_adj_16x1_sse2( + sig_buffer[r], mc_running_buffer[r], + running_buffer[r], k_0, k_delta, + acc_diff); + vpx_memcpy(running_avg_y, running_buffer[r], 4); + vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4); + vpx_memcpy(running_avg_y + avg_y_stride * 2, + running_buffer[r] + 8, 4); + vpx_memcpy(running_avg_y + avg_y_stride * 3, + running_buffer[r] + 12, 4); + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 2); + } + sum_diff = sum_diff_16x1(acc_diff); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, + BLOCK_SIZE bs, + int motion_magnitude) { + int sum_diff_thresh; + int r; + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 
1 : 0; + unsigned char sig_buffer[8][16], mc_running_buffer[8][16], + running_buffer[8][16]; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? + 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. + const __m128i l21 = _mm_set1_epi8(1); + int sum_diff = 0; + + for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) { + vpx_memcpy(sig_buffer[r], sig, 8); + vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride, 8); + vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 8); + vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y + + mc_avg_y_stride, 8); + vpx_memcpy(running_buffer[r], running_avg_y, 8); + vpx_memcpy(running_buffer[r] + 8, running_avg_y + + avg_y_stride, 8); + acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], + mc_running_buffer[r], + running_buffer[r], + k_0, k_4, k_8, k_16, + l3, l32, l21, acc_diff); + vpx_memcpy(running_avg_y, running_buffer[r], 8); + vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8); + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = sum_diff_16x1(acc_diff); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // checK if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. 
The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((abs(sum_diff) - sum_diff_thresh) + >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]); + for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) { + acc_diff = vp9_denoiser_adj_16x1_sse2( + sig_buffer[r], mc_running_buffer[r], + running_buffer[r], k_0, k_delta, + acc_diff); + vpx_memcpy(running_avg_y, running_buffer[r], 8); + vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8); + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = sum_diff_16x1(acc_diff); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, + int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + int sum_diff_thresh; + int r, c; + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; + __m128i acc_diff[4][4]; + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? + 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. 
+ const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. + const __m128i l21 = _mm_set1_epi8(1); + int sum_diff = 0; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + acc_diff[i][j] = _mm_setzero_si128(); + } + } + + for (r = 0; r < (4 << b_height_log2_lookup[bs]); r++) { + for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { + acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2( + sig, mc_running_avg_y, + running_avg_y, + k_0, k_4, k_8, k_16, + l3, l32, l21, acc_diff[c>>4][r>>4]); + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { + sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]); + } + } + + // Update pointers for next iteration. + sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride; + mc_running_avg_y = mc_running_avg_y - + 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + + mc_avg_y_stride; + running_avg_y = running_avg_y - + 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + + avg_y_stride; + } + + { + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + int delta = ((abs(sum_diff) - sum_diff_thresh) + >> num_pels_log2_lookup[bs]) + 1; + + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * (4 << b_height_log2_lookup[bs]); + mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]); + running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]); + sum_diff = 0; + for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { + for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { + acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2( + sig, mc_running_avg_y, + running_avg_y, k_0, + k_delta, acc_diff[c>>4][r>>4]); + // Update pointers for next iteration. 
+ sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { + sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]); + } + } + sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride; + mc_running_avg_y = mc_running_avg_y - + 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + + mc_avg_y_stride; + running_avg_y = running_avg_y - + 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + + avg_y_stride; + } + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, + int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, + BLOCK_SIZE bs, + int motion_magnitude) { + if (bs == BLOCK_4X4 || bs == BLOCK_4X8) { + return vp9_denoiser_4xM_sse2(sig, sig_stride, + mc_avg, mc_avg_stride, + avg, avg_stride, + increase_denoising, + bs, motion_magnitude); + } else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return vp9_denoiser_8xM_sse2(sig, sig_stride, + mc_avg, mc_avg_stride, + avg, avg_stride, + increase_denoising, + bs, motion_magnitude); + } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 || + bs == BLOCK_32X16|| bs == BLOCK_32X32 || bs == BLOCK_32X64 || + bs == BLOCK_64X32 || bs == BLOCK_64X64) { + return vp9_denoiser_64_32_16xM_sse2(sig, sig_stride, + mc_avg, mc_avg_stride, + avg, avg_stride, + increase_denoising, + bs, motion_magnitude); + } else { + return COPY_BLOCK; + } +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index e450f7b7f..869737137 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -120,6 +120,10 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c +ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) 
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c +endif + VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c |