7 files changed, 216 insertions, 63 deletions
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index d7817114e..a6922715e 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -13,7 +13,6 @@
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_sadmxn.h"
-#include "vp9/common/vp9_subpelvar.h"
 
 static void lower_mv_precision(int_mv *mv, int usehp) {
   if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
@@ -24,12 +23,6 @@ static void lower_mv_precision(int_mv *mv, int usehp) {
   }
 }
 
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc, vp9_prob *p, int context) {
-  p[0] = pc->fc.inter_mode_probs[context][0];
-  p[1] = pc->fc.inter_mode_probs[context][1];
-  p[2] = pc->fc.inter_mode_probs[context][2];
-  return p;
-}
 
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
                            int_mv *mvlist,
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index 17fef125c..d4ae2102d 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -70,10 +70,6 @@ static int check_mv_bounds(int_mv *mv,
          mv->as_mv.row > mb_to_bottom_edge;
 }
 
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[VP9_INTER_MODES - 1],
-                           int context);
-
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
                                    MACROBLOCKD *xd,
                                    int_mv *dst_nearest,
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 375fe2a4d..b3d41bed7 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -560,7 +560,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
   if (mbmi->ref_frame[0] != INTRA_FRAME) {
     int_mv nearest, nearby, best_mv;
     int_mv nearest_second, nearby_second, best_mv_second;
-    vp9_prob mv_ref_p[VP9_INTER_MODES - 1];
+    vp9_prob *mv_ref_p;
 
     read_ref_frame(pbi, r, mbmi->segment_id, mbmi->ref_frame);
 
@@ -574,7 +574,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
                        mbmi->ref_frame[0], mbmi->ref_mvs[mbmi->ref_frame[0]],
                        cm->ref_frame_sign_bias);
 
-      vp9_mv_ref_probs(cm, mv_ref_p, mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+      mv_ref_p = cm->fc.inter_mode_probs[
+        mbmi->mb_mode_context[mbmi->ref_frame[0]]];
 
       // If the segment level skip mode enabled
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 49b181d69..193353272 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -276,9 +276,7 @@ static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd,
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
   assert(mbmi->ref_frame[0] != INTRA_FRAME);
-
-  if ((pbi->common.frame_type != KEY_FRAME) && (!pbi->common.intra_only))
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
 
   // prediction
   vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
@@ -327,8 +325,7 @@ static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col,
   assert(mbmi->sb_type == bsize);
   assert(mbmi->ref_frame[0] != INTRA_FRAME);
 
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
 
   // generate prediction
   vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
@@ -392,26 +389,24 @@ static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) {
   MACROBLOCKD *const xd = &pbi->mb;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-  if (mbmi->ref_frame[0] > INTRA_FRAME) {
+  // Select the appropriate reference frame for this MB
+  const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
+  const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
+  xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+  xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+  setup_pre_planes(xd, cfg, NULL, mi_row, mi_col, xd->scale_factor,
+                   xd->scale_factor_uv);
+  xd->corrupted |= cfg->corrupted;
+
+  if (mbmi->ref_frame[1] > INTRA_FRAME) {
     // Select the appropriate reference frame for this MB
-    const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
-    const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
-    xd->scale_factor[0]    = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
-    xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
-    setup_pre_planes(xd, cfg, NULL, mi_row, mi_col,
-                     xd->scale_factor, xd->scale_factor_uv);
-    xd->corrupted |= cfg->corrupted;
-
-    if (mbmi->ref_frame[1] > INTRA_FRAME) {
-      // Select the appropriate reference frame for this MB
-      const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
-      const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
-      xd->scale_factor[1]    = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
-      xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
-      setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col,
-                       xd->scale_factor, xd->scale_factor_uv);
-      xd->corrupted |= second_cfg->corrupted;
-    }
+    const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
+    const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
+    xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+    xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+    setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col, xd->scale_factor,
+                     xd->scale_factor_uv);
+    xd->corrupted |= second_cfg->corrupted;
   }
 }
 
@@ -424,16 +419,17 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
       return;
   set_offsets(pbi, bsize, mi_row, mi_col);
   vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
-  set_refs(pbi, mi_row, mi_col);
 
-  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
     decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ?
                                      BLOCK_SIZE_SB8X8 : bsize);
-  else if (bsize < BLOCK_SIZE_SB8X8)
-    decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
-  else
-    decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
-
+  } else {
+    set_refs(pbi, mi_row, mi_col);
+    if (bsize < BLOCK_SIZE_SB8X8)
+      decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
+    else
+      decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
+  }
   xd->corrupted |= vp9_reader_has_error(r);
 }
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index e18394b1e..09ab2db67 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -708,11 +708,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
     write_intra_mode(bc, mi->uv_mode,
                      pc->fc.uv_mode_prob[mode]);
   } else {
-    vp9_prob mv_ref_p[VP9_INTER_MODES - 1];
+    vp9_prob *mv_ref_p;
 
     encode_ref_frame(cpi, bc);
 
-    vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]);
+    mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mb_mode_context[rf]];
 
 #ifdef ENTROPY_STATS
     active_section = 3;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 53d8be775..ccbb624b0 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -35,6 +35,153 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
                      uint16_t *eob_ptr,
                      const int *scan, int mul) {
   int i, rc, eob;
+  int zbins[2], nzbins[2], zbin;
+  int x, y, z, sz;
+  int zero_run = 0;
+  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+  int zero_flag = n_coeffs;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+
+  eob = -1;
+
+  // Base ZBIN
+  zbins[0] = zbin_ptr[0] + zbin_oq_value;
+  zbins[1] = zbin_ptr[1] + zbin_oq_value;
+  nzbins[0] = zbins[0] * -1;
+  nzbins[1] = zbins[1] * -1;
+
+  if (!skip_block) {
+    // Pre-scan pass: from the end of the scan, drop trailing in-ZBIN coeffs
+    for (i = n_coeffs - 1; i >= 0; i--) {
+      rc = scan[i];
+      z = coeff_ptr[rc] * mul;
+
+      if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
+        zero_flag--;
+      } else {
+        break;
+      }
+    }
+
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < zero_flag; i++) {
+      rc = scan[i];
+      z  = coeff_ptr[rc] * mul;
+
+      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+      zero_run += (zero_run < 15);
+
+      sz = (z >> 31);                               // sign of z
+      x  = (z ^ sz) - sz;
+
+      if (x >= zbin) {
+        x += (round_ptr[rc != 0]);
+        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+            >> quant_shift_ptr[rc != 0];            // quantize (x)
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc]  = x;                        // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
+
+        if (y) {
+          eob = i;                                  // last nonzero coeffs
+          zero_run = 0;                             // set zero_run
+        }
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+// Two-pass quantizer for large transforms (vp9_quantize: n_coeffs >= 1024).
+static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
+                            int16_t *coeff_ptr, int n_coeffs, int skip_block,
+                            int16_t *zbin_ptr, int16_t *round_ptr,
+                            int16_t *quant_ptr, uint8_t *quant_shift_ptr,
+                            int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                            int16_t *dequant_ptr, int zbin_oq_value,
+                            uint16_t *eob_ptr, const int *scan, int mul,
+                            int *idx_arr) {
+  int i, rc, eob;
+  int zbins[2], pzbins[2], nzbins[2], zbin;
+  int x, y, z, sz;
+  int zero_run = 0;
+  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+  int idx = 0;      // number of scan positions stored in idx_arr
+  int pre_idx = 0;  // scan position visited by the previous iteration
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));   // outputs default to 0
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+
+  eob = -1;  // so *eob_ptr ends up 0 when nothing quantizes to nonzero
+
+  // Base ZBIN: [0] is used when rc == 0, [1] for every other rc
+  zbins[0] = zbin_ptr[0] + zbin_oq_value;
+  zbins[1] = zbin_ptr[1] + zbin_oq_value;
+  // Base ZBIN divided by mul so the pre-scan can test unscaled coefficients
+  pzbins[0] = zbins[0]/mul;
+  pzbins[1] = zbins[1]/mul;
+  nzbins[0] = pzbins[0] * -1;
+  nzbins[1] = pzbins[1] * -1;
+
+  if (!skip_block) {
+    // Pre-scan pass: collect the scan positions worth quantizing
+    for (i = 0; i < n_coeffs; i++) {
+      rc = scan[i];
+      z = coeff_ptr[rc];
+
+      // Keep the scan position of any coefficient on or outside the scaled
+      // base ZBIN bounds; everything strictly inside is left as zero.
+      if (z >= pzbins[rc != 0] || z <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: visit only the scan positions saved by the pre-scan
+    // pass. Note: idx can be zero, in which case this loop does nothing.
+    for (i = 0; i < idx; i++) {
+      rc = scan[idx_arr[i]];
+
+      // Advance zero_run by the scan distance skipped, cap at 15, boost ZBIN
+      zero_run += idx_arr[i] - pre_idx;
+      if(zero_run > 15) zero_run = 15;
+      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+
+      pre_idx = idx_arr[i];
+      z = coeff_ptr[rc] * mul;
+      sz = (z >> 31);                               // sign of z
+      x  = (z ^ sz) - sz;                           // x = abs(z)
+
+      if (x >= zbin) {
+        x += (round_ptr[rc != 0]);
+        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+            >> quant_shift_ptr[rc != 0];            // quantize (x)
+
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc]  = x;                        // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
+
+        if (y) {
+          eob = idx_arr[i];                         // scan pos of last nonzero
+          zero_run = -1;               // so an adjacent coeff sees run 0
+        }
+      }
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the scan position of the last nonzero
+}
+#if 0
+// Original quantize function
+static void quantize(int16_t *zbin_boost_orig_ptr,
+                     int16_t *coeff_ptr, int n_coeffs, int skip_block,
+                     int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
+                     uint8_t *quant_shift_ptr,
+                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                     int16_t *dequant_ptr, int zbin_oq_value,
+                     uint16_t *eob_ptr,
+                     const int *scan, int mul) {
+  int i, rc, eob;
   int zbin;
   int x, y, z, sz;
   int zero_run = 0;
@@ -74,6 +221,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
 
   *eob_ptr = eob + 1;
 }
+#endif
 
 void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
                   TX_TYPE tx_type) {
@@ -97,19 +245,40 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
       break;
   }
 
-  quantize(mb->plane[plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
-           n_coeffs, mb->skip_block,
-           mb->plane[plane].zbin,
-           mb->plane[plane].round,
-           mb->plane[plane].quant,
-           mb->plane[plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
-           BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-           xd->plane[plane].dequant,
-           mb->plane[plane].zbin_extra,
-           &xd->plane[plane].eobs[block],
-           scan, mul);
+  // Pick the quantizer by coefficient count: sparse path for n_coeffs >= 1024.
+  if (n_coeffs >= 1024) {
+    // Save index of picked coefficient in pre-scan pass.
+    int idx_arr[1024];
+
+    quantize_sparse(mb->plane[plane].zrun_zbin_boost,
+                    BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+                    n_coeffs, mb->skip_block,
+                    mb->plane[plane].zbin,
+                    mb->plane[plane].round,
+                    mb->plane[plane].quant,
+                    mb->plane[plane].quant_shift,
+                    BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+                    BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                    xd->plane[plane].dequant,
+                    mb->plane[plane].zbin_extra,
+                    &xd->plane[plane].eobs[block],
+                    scan, mul, idx_arr);
+  }
+  else {
+    quantize(mb->plane[plane].zrun_zbin_boost,
+             BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+             n_coeffs, mb->skip_block,
+             mb->plane[plane].zbin,
+             mb->plane[plane].round,
+             mb->plane[plane].quant,
+             mb->plane[plane].quant_shift,
+             BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+             BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+             xd->plane[plane].dequant,
+             mb->plane[plane].zbin_extra,
+             &xd->plane[plane].eobs[block],
+             scan, mul);
+  }
 }
 
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 4e485e9f1..9cb7ab0e1 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -992,11 +992,9 @@ int vp9_cost_mv_ref(VP9_COMP *cpi,
   // Dont account for mode here if segment skip is enabled.
   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     VP9_COMMON *pc = &cpi->common;
-
-    vp9_prob p[VP9_INTER_MODES - 1];
     assert(NEARESTMV <= m  &&  m <= NEWMV);
-    vp9_mv_ref_probs(pc, p, mode_context);
-    return cost_token(vp9_sb_mv_ref_tree, p,
+    return cost_token(vp9_sb_mv_ref_tree,
+                      pc->fc.inter_mode_probs[mode_context],
                       vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
   } else
     return 0;