diff options
-rw-r--r-- | vp9/encoder/vp9_encoder.c        | 122
-rw-r--r-- | vp9/encoder/vp9_encoder.h        |   3
-rw-r--r-- | vp9/encoder/vp9_speed_features.c |   4
-rw-r--r-- | vp9/encoder/vp9_speed_features.h |   3
4 files changed, 128 insertions(+), 4 deletions(-)
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 57900b8fc..08d9e29e6 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -116,6 +116,13 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif +#if CONFIG_VP9_HIGHBITDEPTH +void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif +void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); + // compute adaptive threshold for skip recoding static int compute_context_model_thresh(const VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; @@ -977,6 +984,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->consec_zero_mv); cpi->consec_zero_mv = NULL; + vpx_free(cpi->stack_rank_buffer); + cpi->stack_rank_buffer = NULL; + + vpx_free(cpi->mb_wiener_variance); + cpi->mb_wiener_variance = NULL; + vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC vp9_free_postproc_buffers(cm); @@ -2367,6 +2380,14 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_set_speed_features_framesize_independent(cpi); vp9_set_speed_features_framesize_dependent(cpi); + if (cpi->sf.enable_wiener_variance) { + CHECK_MEM_ERROR(cm, cpi->stack_rank_buffer, + vpx_calloc(UINT16_MAX, sizeof(*cpi->stack_rank_buffer))); + CHECK_MEM_ERROR(cm, cpi->mb_wiener_variance, + vpx_calloc(cm->mb_rows * cm->mb_cols, + sizeof(*cpi->mb_wiener_variance))); + } + #if CONFIG_NON_GREEDY_MV cpi->feature_score_loc_alloc = 0; cpi->tpl_ready = 0; @@ -4691,6 +4712,97 @@ static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) { } } +// Process the wiener variance in 16x16 block basis. 
+static void set_mb_wiener_variance(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + uint8_t *buffer = cpi->Source->y_buffer; + int buf_stride = cpi->Source->y_stride; + +#if CONFIG_VP9_HIGHBITDEPTH + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + DECLARE_ALIGNED(16, uint16_t, zero_pred16[32 * 32]); + DECLARE_ALIGNED(16, uint8_t, zero_pred8[32 * 32]); + uint8_t *zero_pred; +#else + DECLARE_ALIGNED(16, uint8_t, zero_pred[32 * 32]); +#endif + + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + + int mb_row, mb_col; + // Hard coded operating block size + const int block_size = 16; + const int coeff_count = block_size * block_size; + const TX_SIZE tx_size = TX_16X16; + + if (cpi->sf.enable_wiener_variance == 0) return; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + zero_pred = CONVERT_TO_BYTEPTR(zero_pred16); + else + zero_pred = zero_pred8; +#endif + + memset(zero_pred, 0, sizeof(*zero_pred) * coeff_count); + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int idx, hist_count = 0; + int16_t median_val = 0; + uint8_t *mb_buffer = + buffer + mb_row * block_size * buf_stride + mb_col * block_size; + int64_t wiener_variance = 0; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size, + xd->bd); + highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } else { + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } +#else + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + wht_fwd_txfm(src_diff, block_size, coeff, tx_size); +#endif // 
CONFIG_VP9_HIGHBITDEPTH + + for (idx = 0; idx < UINT16_MAX; ++idx) cpi->stack_rank_buffer[idx] = 0; + + for (idx = 0; idx < coeff_count; ++idx) + ++cpi->stack_rank_buffer[abs(coeff[idx])]; + + for (idx = 0; idx < UINT16_MAX; ++idx) { + hist_count += cpi->stack_rank_buffer[idx]; + if (hist_count >= coeff_count / 2) break; + } + + // Noise level estimation + median_val = idx; + + // Wiener filter + for (idx = 1; idx < coeff_count; ++idx) { + int sign = coeff[idx] < 0; + int64_t sqr_coeff = (int64_t)coeff[idx] * coeff[idx]; + coeff[idx] = (int16_t)((sqr_coeff * coeff[idx]) / + (sqr_coeff + (int64_t)median_val * median_val)); + if (sign) coeff[idx] = -coeff[idx]; + + wiener_variance += (int64_t)coeff[idx] * coeff[idx]; + } + cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col] = + wiener_variance / coeff_count; + } + } +} + static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { @@ -4777,6 +4889,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, } } + set_mb_wiener_variance(cpi); + vpx_clear_system_state(); #if CONFIG_INTERNAL_STATS @@ -5827,8 +5941,8 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { +void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. 
switch (tx_size) { case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; @@ -5839,8 +5953,8 @@ static void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, } #endif // CONFIG_VP9_HIGHBITDEPTH -static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { +void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { switch (tx_size) { case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index d1a782bfb..847290556 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -628,6 +628,9 @@ typedef struct VP9_COMP { int ext_refresh_frame_context_pending; int ext_refresh_frame_context; + int64_t *mb_wiener_variance; + int *stack_rank_buffer; + YV12_BUFFER_CONFIG last_frame_uf; TOKENEXTRA *tile_tok[4][1 << 6]; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 8c59662c3..c8e5795ee 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -939,6 +939,10 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->tx_size_search_breakout = 1; sf->tx_size_search_depth = 2; + // Manually turn this on during experimentation. Off by default to disable its + // effect on the baseline encoder. + sf->enable_wiener_variance = 0; + sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) : INT_MAX; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 8609c9837..4261fca14 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -288,6 +288,9 @@ typedef struct SPEED_FEATURES { // level within a frame. int allow_skip_recode; + // Enable Wiener filter based block complexity analysis. 
+ int enable_wiener_variance; + // Coefficient probability model approximation step size int coeff_prob_appx_step; |