Record the sum of tx block eobs in the partition block

The sum of tx block eobs is needed by the machine-learning-based partition early termination. The eobs are first accumulated during tx search, and then the value associated with the best tx_size is copied to ctx for later use. Now that the sum of eobs is calculated correctly, the ml_partition_search_early_termination speed feature is re-enabled. The quality/speed tests were re-run to check the impact of the fix. 1. Borg test BDRATE result: 4k set: PSNR: +0.183%; SSIM: +0.100%; hdres set: PSNR: +0.168%; SSIM: +0.256%; midres set: PSNR: +0.186%; SSIM: +0.326%; 2. Average speed gain result: 4k clips: 21%; hd clips: 26%; midres clips: 15%. The result is in line with the original result. Change-Id: I4209a95c89be03b4cbfb6a95b16885f89feddbda
author: Yunqing Wang <yunqingwang@google.com> 2017-03-16 15:45:07 -0700
committer: Yunqing Wang <yunqingwang@google.com> 2017-03-20 17:12:15 +0000
commit: 9c2552a1c149cbc7ee407c514e0cf78e6f45bcec (patch)
tree: 78312b2547181485e319e2d37f05d81448b31b74 /vp9
parent: 83ba1880bf6596c922e67f99290a30b758b01379 (diff)
download: libvpx-9c2552a1c149cbc7ee407c514e0cf78e6f45bcec.tar
libvpx-9c2552a1c149cbc7ee407c514e0cf78e6f45bcec.tar.gz
libvpx-9c2552a1c149cbc7ee407c514e0cf78e6f45bcec.tar.bz2
libvpx-9c2552a1c149cbc7ee407c514e0cf78e6f45bcec.zip
5 files changed, 25 insertions, 24 deletions
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index c86c818aa..00c5a9378 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -128,6 +128,9 @@ struct macroblock {
   // Set during mode selection. Read during block encoding.
   uint8_t zcoeff_blk[TX_SIZES][256];
 
+  // Accumulate the tx block eobs in a partition block.
+  int32_t sum_y_eobs[TX_SIZES];
+
   int skip;
 
   int encode_breakout;
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index af1a93a00..9e4cbb360 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -73,7 +73,7 @@ typedef struct {
   INTERP_FILTER pred_interp_filter;
 
   // Used for the machine learning-based early termination
-  int sum_eobs;
+  int32_t sum_y_eobs;
 } PICK_MODE_CONTEXT;
 
 typedef struct PC_TREE {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index df72667d4..7665064b9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2711,18 +2711,6 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
 }
 #endif
 
-// Accumulate all tx blocks' eobs results got from the partition evaluation.
-static void accumulate_eobs(int plane, int block, int row, int col,
-                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                            void *arg) {
-  PICK_MODE_CONTEXT *ctx = (PICK_MODE_CONTEXT *)arg;
-  (void)row;
-  (void)col;
-  (void)plane_bsize;
-  (void)tx_size;
-  ctx->sum_eobs += ctx->eobs_pbuf[plane][1][block];
-}
-
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
@@ -2899,6 +2887,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       }
 
       if (this_rdc.rdcost < best_rdc.rdcost) {
+        MODE_INFO *mi = xd->mi[0];
+
         best_rdc = this_rdc;
         if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
 
@@ -2917,7 +2907,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
           // Currently, the machine-learning based partition search early
           // termination is only used while bsize is 16x16, 32x32 or 64x64,
           // VPXMIN(cm->width, cm->height) >= 480, and speed = 0.
-          if (ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) {
+          if (!x->e_mbd.lossless &&
+              !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) &&
+              ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) {
             const double *clf;
             const double *mean;
             const double *sd;
@@ -2936,10 +2928,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
 
             assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
 
-            ctx->sum_eobs = 0;
-            vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
-                                                   accumulate_eobs, ctx);
-
             if (above_in_image) {
               context_size = xd->above_mi->sb_type;
               if (context_size < bsize)
@@ -2980,7 +2968,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
                     clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) +
                     clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) *
                               sd[3]) +
-                    clf[4] * (((double)ctx->sum_eobs - mean[4]) / sd[4]) +
+                    clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) +
                     clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) +
                     clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7];
             if (score < 0) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0c4604437..9123b87f5 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -742,9 +742,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
 
   // TODO(jingning): temporarily enabled only for luma component
   rd = VPXMIN(rd1, rd2);
-  if (plane == 0)
+  if (plane == 0) {
     x->zcoeff_blk[tx_size][block] =
         !x->plane[plane].eobs[block] || (rd1 > rd2 && !xd->lossless);
+    x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block];
+  }
 
   args->this_rate += rate;
   args->this_dist += dist;
@@ -3190,6 +3192,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
+    vp9_zero(x->sum_y_eobs);
+
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
     if (midx == mode_skip_start && best_mode_index >= 0) {
@@ -3469,6 +3473,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
         memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mi->tx_size],
                sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+        ctx->sum_y_eobs = x->sum_y_eobs[mi->tx_size];
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
@@ -3699,6 +3704,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data,
   mi->mv[0].as_int = 0;
   x->skip = 1;
 
+  ctx->sum_y_eobs = 0;
+
   if (cm->interp_filter != BILINEAR) {
     best_filter = EIGHTTAP;
     if (cm->interp_filter == SWITCHABLE &&
@@ -3853,6 +3860,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
     ref_frame = vp9_ref_order[ref_index].ref_frame[0];
     second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
 
+    vp9_zero(x->sum_y_eobs);
+
 #if CONFIG_BETTER_HW_COMPATIBILITY
     // forbid 8X4 and 4X8 partitions if any reference frame is scaled.
     if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) {
@@ -4069,6 +4078,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
               for (i = 0; i < 4; i++) {
                 tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
+                x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i];
               }
               pred_exists = 1;
               if (switchable_filter_index == 0 && sf->use_rd_breakout &&
@@ -4233,6 +4243,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
         if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
         memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
                sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+        ctx->sum_y_eobs = x->sum_y_eobs[TX_4X4];
 
         for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i];
 
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index ce5353539..dad1261af 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -73,10 +73,9 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
 
   // Currently, the machine-learning based partition search early termination
   // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0.
-  // TODO(yunqingwang): Re-enable when test failures are fixed.
-  // if (VPXMIN(cm->width, cm->height) >= 480) {
-  //   sf->ml_partition_search_early_termination = 1;
-  // }
+  if (VPXMIN(cm->width, cm->height) >= 480) {
+    sf->ml_partition_search_early_termination = 1;
+  }
 
   if (speed >= 1) {
     sf->ml_partition_search_early_termination = 0;
author	Yunqing Wang <yunqingwang@google.com>	2017-03-16 15:45:07 -0700
committer	Yunqing Wang <yunqingwang@google.com>	2017-03-20 17:12:15 +0000
commit	9c2552a1c149cbc7ee407c514e0cf78e6f45bcec (patch)
tree	78312b2547181485e319e2d37f05d81448b31b74 /vp9
parent	83ba1880bf6596c922e67f99290a30b758b01379 (diff)
download	libvpx-9c2552a1c149cbc7ee407c514e0cf78e6f45bcec.tar libvpx-9c2552a1c149cbc7ee407c514e0cf78e6f45bcec.tar.gz libvpx-9c2552a1c149cbc7ee407c514e0cf78e6f45bcec.tar.bz2 libvpx-9c2552a1c149cbc7ee407c514e0cf78e6f45bcec.zip