Add two-pass quantization

Optimized the quantization function by making it a two-pass process. The first pass does a quick checking of the transform coefficients against the base ZBIN, and only keep the good enough set of coefficients for quantization. A skipping check is added. If all coefficients are within the base ZBIN, no quantization is needed. The second pass is the actual quantization pass, which only processes the coefficient subset determined in first pass. This reduces the computation. Furthermore, an alternitive method is used for large transform size, which often has sparse nonzero quantized coefficients. Overall, the encoder speedup is about 4%. The quantization function itself gets 20% faster. Change-Id: I3a9dd0da6db030260b6d9c314a9fa48ecae89f22
author: Yunqing Wang <yunqingwang@google.com> 2013-06-18 17:09:50 -0700
committer: Yunqing Wang <yunqingwang@google.com> 2013-06-19 10:35:02 -0700
commit: b5bf7b13a83fc6fccafec1611410910eb78155a9 (patch)
tree: 60c4e18486cc559ff67457e314c6e6a534e96dd1 /vp9
parent: 2319b7aaf17b7efe143862bf0569fe8c9efd9d1f (diff)
download: libvpx-b5bf7b13a83fc6fccafec1611410910eb78155a9.tar
libvpx-b5bf7b13a83fc6fccafec1611410910eb78155a9.tar.gz
libvpx-b5bf7b13a83fc6fccafec1611410910eb78155a9.tar.bz2
libvpx-b5bf7b13a83fc6fccafec1611410910eb78155a9.zip
1 files changed, 182 insertions, 13 deletions
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 53d8be775..ccbb624b0 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -35,6 +35,153 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
                      uint16_t *eob_ptr,
                      const int *scan, int mul) {
   int i, rc, eob;
+  int zbins[2], nzbins[2], zbin;
+  int x, y, z, sz;
+  int zero_run = 0;
+  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+  int zero_flag = n_coeffs;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+
+  eob = -1;
+
+  // Base ZBIN
+  zbins[0] = zbin_ptr[0] + zbin_oq_value;
+  zbins[1] = zbin_ptr[1] + zbin_oq_value;
+  nzbins[0] = zbins[0] * -1;
+  nzbins[1] = zbins[1] * -1;
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = n_coeffs - 1; i >= 0; i--) {
+      rc = scan[i];
+      z = coeff_ptr[rc] * mul;
+
+      if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
+        zero_flag--;
+      } else {
+        break;
+      }
+    }
+
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < zero_flag; i++) {
+      rc = scan[i];
+      z  = coeff_ptr[rc] * mul;
+
+      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+      zero_run += (zero_run < 15);
+
+      sz = (z >> 31);                               // sign of z
+      x  = (z ^ sz) - sz;
+
+      if (x >= zbin) {
+        x += (round_ptr[rc != 0]);
+        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+            >> quant_shift_ptr[rc != 0];            // quantize (x)
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc]  = x;                        // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
+
+        if (y) {
+          eob = i;                                  // last nonzero coeffs
+          zero_run = 0;                             // set zero_run
+        }
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+// This function works well for large transform size.
+static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
+                            int16_t *coeff_ptr, int n_coeffs, int skip_block,
+                            int16_t *zbin_ptr, int16_t *round_ptr,
+                            int16_t *quant_ptr, uint8_t *quant_shift_ptr,
+                            int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                            int16_t *dequant_ptr, int zbin_oq_value,
+                            uint16_t *eob_ptr, const int *scan, int mul,
+                            int *idx_arr) {
+  int i, rc, eob;
+  int zbins[2], pzbins[2], nzbins[2], zbin;
+  int x, y, z, sz;
+  int zero_run = 0;
+  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+  int idx = 0;
+  int pre_idx = 0;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+
+  eob = -1;
+
+  // Base ZBIN
+  zbins[0] = zbin_ptr[0] + zbin_oq_value;
+  zbins[1] = zbin_ptr[1] + zbin_oq_value;
+  // Positive and negative ZBIN
+  pzbins[0] = zbins[0]/mul;
+  pzbins[1] = zbins[1]/mul;
+  nzbins[0] = pzbins[0] * -1;
+  nzbins[1] = pzbins[1] * -1;
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      rc = scan[i];
+      z = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (z >= pzbins[rc != 0] || z <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      rc = scan[idx_arr[i]];
+
+      // Calculate ZBIN
+      zero_run += idx_arr[i] - pre_idx;
+      if(zero_run > 15) zero_run = 15;
+      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+
+      pre_idx = idx_arr[i];
+      z = coeff_ptr[rc] * mul;
+      sz = (z >> 31);                               // sign of z
+      x  = (z ^ sz) - sz;                           // x = abs(z)
+
+      if (x >= zbin) {
+        x += (round_ptr[rc != 0]);
+        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+            >> quant_shift_ptr[rc != 0];            // quantize (x)
+
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc]  = x;                        // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
+
+        if (y) {
+          eob = idx_arr[i];                         // last nonzero coeffs
+          zero_run = -1;                            // set zero_run
+        }
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#if 0
+// Original quantize function
+static void quantize(int16_t *zbin_boost_orig_ptr,
+                     int16_t *coeff_ptr, int n_coeffs, int skip_block,
+                     int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
+                     uint8_t *quant_shift_ptr,
+                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                     int16_t *dequant_ptr, int zbin_oq_value,
+                     uint16_t *eob_ptr,
+                     const int *scan, int mul) {
+  int i, rc, eob;
   int zbin;
   int x, y, z, sz;
   int zero_run = 0;
@@ -74,6 +221,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
 
   *eob_ptr = eob + 1;
 }
+#endif
 
 void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
                   TX_TYPE tx_type) {
@@ -97,19 +245,40 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
       break;
   }
 
-  quantize(mb->plane[plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
-           n_coeffs, mb->skip_block,
-           mb->plane[plane].zbin,
-           mb->plane[plane].round,
-           mb->plane[plane].quant,
-           mb->plane[plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
-           BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-           xd->plane[plane].dequant,
-           mb->plane[plane].zbin_extra,
-           &xd->plane[plane].eobs[block],
-           scan, mul);
+  // Call different quantization for different transform size.
+  if (n_coeffs >= 1024) {
+    // Save index of picked coefficient in pre-scan pass.
+    int idx_arr[1024];
+
+    quantize_sparse(mb->plane[plane].zrun_zbin_boost,
+                    BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+                    n_coeffs, mb->skip_block,
+                    mb->plane[plane].zbin,
+                    mb->plane[plane].round,
+                    mb->plane[plane].quant,
+                    mb->plane[plane].quant_shift,
+                    BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+                    BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                    xd->plane[plane].dequant,
+                    mb->plane[plane].zbin_extra,
+                    &xd->plane[plane].eobs[block],
+                    scan, mul, idx_arr);
+  }
+  else {
+    quantize(mb->plane[plane].zrun_zbin_boost,
+             BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+             n_coeffs, mb->skip_block,
+             mb->plane[plane].zbin,
+             mb->plane[plane].round,
+             mb->plane[plane].quant,
+             mb->plane[plane].quant_shift,
+             BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+             BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+             xd->plane[plane].dequant,
+             mb->plane[plane].zbin_extra,
+             &xd->plane[plane].eobs[block],
+             scan, mul);
+  }
 }
 
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
author	Yunqing Wang <yunqingwang@google.com>	2013-06-18 17:09:50 -0700
committer	Yunqing Wang <yunqingwang@google.com>	2013-06-19 10:35:02 -0700
commit	b5bf7b13a83fc6fccafec1611410910eb78155a9 (patch)
tree	60c4e18486cc559ff67457e314c6e6a534e96dd1 /vp9
parent	2319b7aaf17b7efe143862bf0569fe8c9efd9d1f (diff)
download	libvpx-b5bf7b13a83fc6fccafec1611410910eb78155a9.tar libvpx-b5bf7b13a83fc6fccafec1611410910eb78155a9.tar.gz libvpx-b5bf7b13a83fc6fccafec1611410910eb78155a9.tar.bz2 libvpx-b5bf7b13a83fc6fccafec1611410910eb78155a9.zip