From 8fd3f9a2fb7fd29d811f2af11433b1b8bebabbb5 Mon Sep 17 00:00:00 2001
From: Marco
Date: Wed, 12 Nov 2014 14:51:49 -0800
Subject: Enable non-rd mode coding on key frame, for speed 6.

For key frames at speed 6: enable non-RD mode selection in the speed
settings and use the (non-RD) variance-based partitioning.

Adjust some logic/thresholds in the variance partition selection for
key frames only (no change to delta frames), mainly to bias toward
selecting smaller prediction blocks, and also cap the max transform
size at 16x16.

Key frame quality drops by ~0.6-0.7dB compared to RD coding, but key
frame encoding speeds up by at least 6x. Average PSNR/SSIM metrics
over RTC clips go down by ~1-2% for speed 6.

Change-Id: Ie4845e0127e876337b9c105aa37e93b286193405
---
 vp9/common/vp9_rtcd_defs.pl           |   5 +
 vp9/encoder/vp9_avg.c                 |  20 ++++
 vp9/encoder/vp9_encodeframe.c         | 209 +++++++++++++++++++++++-----------
 vp9/encoder/vp9_speed_features.c      |   2 +-
 vp9/encoder/x86/vp9_avg_intrin_sse2.c |  18 +++
 5 files changed, 184 insertions(+), 70 deletions(-)

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index ae1280864..281dcbd8b 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1135,9 +1135,14 @@ specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
 add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
 specialize qw/vp9_avg_8x8 sse2/;
 
+add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
+specialize qw/vp9_avg_4x4 sse2/;
+
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
   specialize qw/vp9_highbd_avg_8x8/;
+  add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vp9_highbd_avg_4x4/;
 }
 
 # ENCODEMB INVOKE
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index e9810c894..f8fa7d2e8 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -19,6 +19,15 @@ unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
   return (sum + 32) >> 6;
 }
 
+unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
@@ -29,5 +38,16 @@ unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
 
   return (sum + 32) >> 6;
 }
+
+unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index d5122d0bc..7788e502d 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -291,6 +291,11 @@ typedef struct {
 typedef struct {
   partition_variance part_variances;
   var split[4];
+} v4x4;
+
+typedef struct {
+  partition_variance part_variances;
+  v4x4 split[4];
 } v8x8;
 
 typedef struct {
@@ -348,6 +353,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
     case BLOCK_8X8: {
       v8x8 *vt = (v8x8 *) data;
       node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_4X4: {
+      v4x4 *vt = (v4x4 *) data;
+      node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++)
         node->split[i] = &vt->split[i];
       break;
@@ -398,64 +410,76 @@ static int set_vt_partitioning(VP9_COMP *cpi,
   variance_node vt;
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
   const int block_height = num_8x8_blocks_high_lookup[bsize];
-  // TODO(debargha): Choose this more intelligently.
-  const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 64 : 4;
+  // TODO(marpan): Adjust/tune these thresholds.
+  const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4;
   int64_t threshold = (int64_t)(threshold_multiplier *
       vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
+  int64_t threshold_bsize_ref = threshold << 6;
+  int64_t threshold_low = threshold;
+  BLOCK_SIZE bsize_ref = BLOCK_16X16;
+
   assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
 
-  // Split none is available only if we have more than half a block size
-  // in width and height inside the visible image.
-  if (mi_col + block_width / 2 < cm->mi_cols &&
-      mi_row + block_height / 2 < cm->mi_rows &&
-      vt.part_variances->none.variance < threshold) {
-    set_block_size(cpi, xd, mi_row, mi_col, bsize);
-    return 1;
+  if (cm->frame_type == KEY_FRAME) {
+    bsize_ref = BLOCK_8X8;
+    // Choose lower thresholds for key frame variance to favor split.
+    threshold_bsize_ref = threshold >> 1;
+    threshold_low = threshold >> 2;
   }
 
-  // Only allow split for blocks above 16x16.
-  if (bsize > BLOCK_16X16) {
-    // Vertical split is available on all but the bottom border.
+  // For bsize=bsize_ref (16x16/8x8 for 8x8/4x4 downsampling), select if
+  // variance is below threshold, otherwise split will be selected.
+  // No check for vert/horiz split as too few samples for variance.
+  if (bsize == bsize_ref) {
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold_bsize_ref) {
+      set_block_size(cpi, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    return 0;
+  } else if (bsize > bsize_ref) {
+    // For key frame, for bsize above 32X32, or very high variance, take split.
+    if (cm->frame_type == KEY_FRAME &&
+        (bsize > BLOCK_32X32 ||
+        vt.part_variances->none.variance > (threshold << 2))) {
+      return 0;
+    }
+    // If variance is low, take the bsize (no split).
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold_low) {
+      set_block_size(cpi, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    // Check vertical split.
     if (mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->vert[0].variance < threshold &&
-        vt.part_variances->vert[1].variance < threshold) {
+        vt.part_variances->vert[0].variance < threshold_low &&
+        vt.part_variances->vert[1].variance < threshold_low) {
       BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
       set_block_size(cpi, xd, mi_row, mi_col, subsize);
       set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
       return 1;
     }
-
-    // Horizontal split is available on all but the right border.
+    // Check horizontal split.
     if (mi_col + block_width / 2 < cm->mi_cols &&
-        vt.part_variances->horz[0].variance < threshold &&
-        vt.part_variances->horz[1].variance < threshold) {
+        vt.part_variances->horz[0].variance < threshold_low &&
+        vt.part_variances->horz[1].variance < threshold_low) {
       BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
       set_block_size(cpi, xd, mi_row, mi_col, subsize);
       set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
       return 1;
     }
-  }
-
-  // This will only allow 8x8 if the 16x16 variance is very large.
-  if (bsize == BLOCK_16X16) {
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < (threshold << 6)) {
-      set_block_size(cpi, xd, mi_row, mi_col, bsize);
-      return 1;
-    }
+    return 0;
   }
   return 0;
 }
 
-// This function chooses partitioning based on the variance
-// between source and reconstructed last, where variance is
-// computed for 8x8 downsampled inputs. Some things to check:
-// using the last source rather than reconstructed last, and
-// allowing for small downsampling (4x4 or 2x2) for selection
-// of smaller block sizes (i.e., < 16x16).
+// This function chooses partitioning based on the variance between source and
+// reconstructed last, where variance is computed for downsampled inputs.
+// Currently 8x8 downsampling is used for delta frames, 4x4 for key frames.
 static void choose_partitioning(VP9_COMP *cpi,
                                 const TileInfo *const tile,
                                 MACROBLOCK *x,
@@ -463,7 +487,7 @@ static void choose_partitioning(VP9_COMP *cpi,
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
 
-  int i, j, k;
+  int i, j, k, m;
   v64x64 vt;
   uint8_t *s;
   const uint8_t *d;
@@ -525,38 +549,63 @@ static void choose_partitioning(VP9_COMP *cpi,
       const int y16_idx = y32_idx + ((j >> 1) << 4);
       v16x16 *vst = &vt.split[i].split[j];
       for (k = 0; k < 4; k++) {
-        int x_idx = x16_idx + ((k & 1) << 3);
-        int y_idx = y16_idx + ((k >> 1) << 3);
-        unsigned int sse = 0;
-        int sum = 0;
-
-        if (x_idx < pixels_wide && y_idx < pixels_high) {
-          int s_avg, d_avg;
+        int x8_idx = x16_idx + ((k & 1) << 3);
+        int y8_idx = y16_idx + ((k >> 1) << 3);
+        if (cm->frame_type != KEY_FRAME) {
+          unsigned int sse = 0;
+          int sum = 0;
+          if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+            int s_avg, d_avg;
 #if CONFIG_VP9_HIGHBITDEPTH
-          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-            s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
-            d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
-          } else {
-            s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
-            d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
-          }
+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+              s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+              d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+            } else {
+              s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+              d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+            }
 #else
-          s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
-          d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+            s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+            d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
-          sum = s_avg - d_avg;
-          sse = sum * sum;
+            sum = s_avg - d_avg;
+            sse = sum * sum;
+          }
+          // If variance is based on 8x8 downsampling, we stop here and have
+          // one sample for 8x8 block (so use 1 for count in fill_variance),
+          // which of course means variance = 0 for 8x8 block.
+          fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
+        } else {
+          // For key frame, go down to 4x4.
+          v8x8 *vst2 = &vst->split[k];
+          for (m = 0; m < 4; m++) {
+            int x4_idx = x8_idx + ((m & 1) << 2);
+            int y4_idx = y8_idx + ((m >> 1) << 2);
+            unsigned int sse = 0;
+            int sum = 0;
+            if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+              int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+              // For key frame, reference is set to 128.
+              sum = s_avg - 128;
+              sse = sum * sum;
+            }
+            // If variance is based on 4x4 downsampling, we stop here and have
+            // one sample for 4x4 block (so use 1 for count in fill_variance),
+            // which of course means variance = 0 for 4x4 block.
+            fill_variance(sse, sum, 1, &vst2->split[m].part_variances.none);
+          }
         }
-        // For an 8x8 block we have just one value the average of all 64
-        // pixels, so use 1. This means of course that there is no variance
-        // in an 8x8 block.
-        fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
       }
     }
   }
   // Fill the rest of the variance tree by summing split partition values.
   for (i = 0; i < 4; i++) {
     for (j = 0; j < 4; j++) {
+      if (cm->frame_type == KEY_FRAME) {
+        for (m = 0; m < 4; m++) {
+          fill_variance_tree(&vt.split[i].split[j].split[m], BLOCK_8X8);
+        }
+      }
       fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
     }
     fill_variance_tree(&vt.split[i], BLOCK_32X32);
@@ -564,8 +613,7 @@ static void choose_partitioning(VP9_COMP *cpi,
   fill_variance_tree(&vt, BLOCK_64X64);
 
   // Now go through the entire structure, splitting every block size until
-  // we get to one that's got a variance lower than our threshold, or we
-  // hit 8x8.
+  // we get to one that's got a variance lower than our threshold.
   if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
       !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col)) {
     for (i = 0; i < 4; ++i) {
@@ -576,11 +624,13 @@ static void choose_partitioning(VP9_COMP *cpi,
       for (j = 0; j < 4; ++j) {
         const int x16_idx = ((j & 1) << 1);
         const int y16_idx = ((j >> 1) << 1);
-        // NOTE: Since this uses 8x8 downsampling for variance calculation
-        // we cannot really select block size 8x8 (or even 8x16/16x8),
-        // since we do not sufficient samples for variance.
-        // For now, 8x8 partition is only set if the variance of the 16x16
-        // block is very high. This is controlled in set_vt_partitioning.
+        // Note: If 8x8 downsampling is used for variance calculation we
+        // cannot really select block size 8x8 (or even 8x16/16x8), since we
+        // don't have sufficient samples for variance. So on delta frames,
+        // 8x8 partition is only set if variance of the 16x16 block is very
+        // high. For key frames, 4x4 downsampling is used, so we can better
+        // select 8x16/16x8 and 8x8. 4x4 partition can potentially be used
+        // here too, but for now 4x4 is not allowed.
         if (!set_vt_partitioning(cpi, xd, &vt.split[i].split[j],
                                  BLOCK_16X16,
                                  mi_row + y32_idx + y16_idx,
@@ -588,10 +638,26 @@ static void choose_partitioning(VP9_COMP *cpi,
           for (k = 0; k < 4; ++k) {
             const int x8_idx = (k & 1);
             const int y8_idx = (k >> 1);
-            set_block_size(cpi, xd,
-                           (mi_row + y32_idx + y16_idx + y8_idx),
-                           (mi_col + x32_idx + x16_idx + x8_idx),
-                           BLOCK_8X8);
+            // TODO(marpan): Allow for setting 4x4 partition on key frame.
+            /*
+            if (cm->frame_type == KEY_FRAME) {
+              if (!set_vt_partitioning(cpi, xd,
+                                       &vt.split[i].split[j].split[k],
+                                       BLOCK_8X8,
+                                       mi_row + y32_idx + y16_idx + y8_idx,
+                                       mi_col + x32_idx + x16_idx + x8_idx)) {
+                set_block_size(cpi, xd,
+                               (mi_row + y32_idx + y16_idx + y8_idx),
+                               (mi_col + x32_idx + x16_idx + x8_idx),
+                               BLOCK_4X4);
+              }
+            } else {
+            */
+            set_block_size(cpi, xd,
+                           (mi_row + y32_idx + y16_idx + y8_idx),
+                           (mi_col + x32_idx + x16_idx + x8_idx),
+                           BLOCK_8X8);
+            // }
           }
         }
       }
@@ -2511,7 +2577,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi,
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                        BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
     } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
-               cm->frame_type != KEY_FRAME ) {
+               cm->frame_type != KEY_FRAME) {
       choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                        BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
@@ -3532,6 +3598,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
       cm->uv_ac_delta_q == 0;
 
   cm->tx_mode = select_tx_mode(cpi, xd);
+  if (cm->frame_type == KEY_FRAME &&
+      cpi->sf.use_nonrd_pick_mode &&
+      cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+    cm->tx_mode = ALLOW_16X16;
+  }
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth)
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 5c70b4ee4..85d0dba59 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -321,7 +321,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->partition_search_type = VAR_BASED_PARTITION;
 
     // Turn on this to use non-RD key frame coding mode.
-    // sf->use_nonrd_pick_mode = 1;
+    sf->use_nonrd_pick_mode = 1;
     sf->mv.search_method = NSTEP;
     sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
     sf->mv.reduce_first_step_size = 1;
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index ca6cf1ac9..4c3495b05 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -38,3 +38,21 @@ unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
   avg = _mm_extract_epi16(s0, 0);
   return (avg + 32) >> 6;
 }
+
+unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0 = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 8) >> 4;
+}
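
A note on the key-frame sampling above: choose_partitioning on a key frame
has no reconstructed last frame to difference against, so each 4x4 source
average is measured against the fixed value 128 (the mid-point of the 8-bit
pixel range, i.e. a flat DC reference). The standalone sketch below restates
that arithmetic; the buffer, stride, and avg_4x4 name are illustrative
stand-ins for the patch's vp9_avg_4x4_c, not part of the patch itself.

#include <stdint.h>
#include <stdio.h>

/* Mirrors vp9_avg_4x4_c above: sum the 16 pixels, round, shift by log2(16). */
static unsigned int avg_4x4(const uint8_t *s, int p) {
  int i, j, sum = 0;
  for (i = 0; i < 4; ++i, s += p)
    for (j = 0; j < 4; ++j)
      sum += s[j];
  return (sum + 8) >> 4;
}

int main(void) {
  uint8_t src[4 * 4];  /* hypothetical 4x4 block, stride 4 */
  int i, s_avg, sum;
  unsigned int sse;
  for (i = 0; i < 16; ++i)
    src[i] = (uint8_t)(100 + i);
  s_avg = (int)avg_4x4(src, 4);
  /* Key-frame path: one sample per 4x4 block against the fixed 128. */
  sum = s_avg - 128;
  sse = (unsigned int)(sum * sum);
  printf("s_avg=%d sum=%d sse=%u\n", s_avg, sum, sse);
  return 0;
}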
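
The variance tree those samples feed is built from (sse, sum, count)
triples: each leaf is filled with count 1 (hence zero variance of its own,
as the comments above note), and fill_variance_tree combines children by
summing the raw accumulators. The sketch below restates that combination
with illustrative names, not the encoder's structs; the derivation is the
population-variance identity Var = E[X^2] - E[X]^2 (the encoder applies an
additional fixed-point scale, not shown here).

#include <stdint.h>

/* Illustrative stand-in for the (sse, sum, count) triple each node carries. */
typedef struct {
  int64_t sse;      /* accumulated sum of squared errors */
  int64_t sum;      /* accumulated sum of errors */
  int count;        /* number of downsampled samples */
  int64_t variance;
} var_node;

/* Var = sse/count - (sum/count)^2 = (sse - sum*sum/count) / count. */
static void derive_variance(var_node *v) {
  v->variance = v->count > 0
      ? (v->sse - v->sum * v->sum / v->count) / v->count
      : 0;
}

/* Children merge by adding accumulators: a leaf with count == 1 has zero
   self-variance but still contributes spread to every ancestor node. */
static void merge2(var_node *dst, const var_node *a, const var_node *b) {
  dst->sse = a->sse + b->sse;
  dst->sum = a->sum + b->sum;
  dst->count = a->count + b->count;
  derive_variance(dst);
}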
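
The key-frame branch of set_vt_partitioning reduces to a handful of
thresholds derived from one q-scaled base value: accept bsize_ref (8x8)
below threshold >> 1, accept a larger or rectangular block below
threshold >> 2, and force a split above threshold << 2 or for any block
above 32x32. The sketch below restates just that arithmetic; the q value is
made up, and in the encoder it comes from
vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth).

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const double q = 40.0;                /* hypothetical quantizer value */
  const int threshold_multiplier = 80;  /* KEY_FRAME case; delta frames use 4 */
  const int64_t threshold = (int64_t)(threshold_multiplier * q);
  const int64_t threshold_bsize_ref = threshold >> 1;  /* accept 8x8 leaf */
  const int64_t threshold_low = threshold >> 2;  /* accept none/vert/horz */
  const int64_t force_split = threshold << 2;    /* split above this */
  printf("base=%lld bsize_ref=%lld low=%lld force_split=%lld\n",
         (long long)threshold, (long long)threshold_bsize_ref,
         (long long)threshold_low, (long long)force_split);
  return 0;
}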
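
Finally, a quick way to sanity-check the new SSE2 kernel against the C
reference is a randomized comparison. This harness is a hypothetical test,
assuming vp9_avg_4x4_c and vp9_avg_4x4_sse2 from the files above are
compiled and linked in; the SSE2 version loads 8 bytes per row, so the
buffer supplies a stride of 8 even though only a 4x4 window is averaged.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Prototypes match the signature registered in vp9_rtcd_defs.pl above. */
unsigned int vp9_avg_4x4_c(const uint8_t *s, int p);
unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p);

int main(void) {
  enum { STRIDE = 8, ROWS = 4 };
  uint8_t buf[STRIDE * ROWS];
  int trial, i;
  srand(1234);
  for (trial = 0; trial < 100000; ++trial) {
    for (i = 0; i < STRIDE * ROWS; ++i)
      buf[i] = (uint8_t)(rand() & 0xff);
    if (vp9_avg_4x4_c(buf, STRIDE) != vp9_avg_4x4_sse2(buf, STRIDE)) {
      printf("mismatch at trial %d\n", trial);
      return 1;
    }
  }
  printf("C and SSE2 vp9_avg_4x4 agree over all trials\n");
  return 0;
}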