12 files changed, 99 insertions, 52 deletions
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index 7bce3fa37..a937b7823 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -125,7 +125,7 @@ static const int idx_n_column_to_subblock[4][2] = {
 // clamp_mv_ref
 #define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
 
-static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
+static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
   clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
                xd->mb_to_right_edge + MV_BORDER,
                xd->mb_to_top_edge - MV_BORDER,
diff --git a/vp9/decoder/vp9_reader.h b/vp9/decoder/vp9_reader.h
index 32e200e2b..2d9eccfbf 100644
--- a/vp9/decoder/vp9_reader.h
+++ b/vp9/decoder/vp9_reader.h
@@ -52,7 +52,7 @@ int vp9_reader_has_error(vp9_reader *r);
 
 const uint8_t *vp9_reader_find_end(vp9_reader *r);
 
-static int vp9_read(vp9_reader *r, int prob) {
+static INLINE int vp9_read(vp9_reader *r, int prob) {
   unsigned int bit = 0;
   BD_VALUE value;
   BD_VALUE bigsplit;
@@ -89,11 +89,11 @@ static int vp9_read(vp9_reader *r, int prob) {
   return bit;
 }
 
-static int vp9_read_bit(vp9_reader *r) {
+static INLINE int vp9_read_bit(vp9_reader *r) {
   return vp9_read(r, 128);  // vp9_prob_half
 }
 
-static int vp9_read_literal(vp9_reader *r, int bits) {
+static INLINE int vp9_read_literal(vp9_reader *r, int bits) {
   int literal = 0, bit;
 
   for (bit = bits - 1; bit >= 0; bit--)
@@ -102,8 +102,8 @@ static int vp9_read_literal(vp9_reader *r, int bits) {
   return literal;
 }
 
-static int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree,
-                         const vp9_prob *probs) {
+static INLINE int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree,
+                                const vp9_prob *probs) {
   vp9_tree_index i = 0;
 
   while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0)
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index d638a2146..b0ff0fa81 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1216,7 +1216,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
   saved_wb = wb;
   vp9_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
 
-  uncompressed_hdr_size = vp9_rb_bytes_written(&wb);
+  uncompressed_hdr_size = vp9_wb_bytes_written(&wb);
   data += uncompressed_hdr_size;
 
   vp9_clear_system_state();
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2c7739115..bd3b0fdc8 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -41,6 +41,7 @@ struct macroblock_plane {
   int16_t *zbin;
   int16_t *round;
 
+  int64_t quant_thred[2];
   // Zbin Over Quant value
   int16_t zbin_extra;
 };
@@ -117,6 +118,8 @@ struct macroblock {
   // skip forward transform and quantization
   int skip_txfm[MAX_MB_PLANE];
 
+  int64_t bsse[MAX_MB_PLANE];
+
   // Used to store sub partition's choices.
   MV pred_mv[MAX_REF_FRAMES];
 
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 9e57d6abe..6115f5a0f 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -673,7 +673,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int i, j;
     const int width  = num_4x4_blocks_wide_lookup[bsize];
     const int height = num_4x4_blocks_high_lookup[bsize];
-    const BLOCK_SIZE bsize_tx = txsize_to_bsize[mbmi->tx_size];
 
     int rate2 = 0;
     int64_t dist2 = 0;
@@ -683,28 +682,36 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize],
                               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    const BLOCK_SIZE bsize_tx = txsize_to_bsize[tmp_tx_size];
     const int step = 1 << tmp_tx_size;
 
-    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
-      if (cpi->sf.reuse_inter_pred_sby) {
-        pd->dst.buf = tmp[0].data;
-        pd->dst.stride = bw;
-      }
+    if (cpi->sf.reuse_inter_pred_sby) {
+      pd->dst.buf = tmp[0].data;
+      pd->dst.stride = bw;
+    }
 
+    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
+      uint8_t *const src_buf_base = p->src.buf;
+      uint8_t *const dst_buf_base = pd->dst.buf;
       for (j = 0; j < height; j += step) {
         for (i = 0; i < width; i += step) {
+          p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
+          pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
+          // Use source buffer as an approximation for the fully reconstructed
+          // buffer
           vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize),
                                   tmp_tx_size, this_mode,
-                                  &p->src.buf[4 * (j * dst_stride + i)],
-                                  src_stride,
-                                  &pd->dst.buf[4 * (j * dst_stride + i)],
-                                  dst_stride, i, j, 0);
+                                  p->src.buf, src_stride,
+                                  pd->dst.buf, dst_stride,
+                                  i, j, 0);
           model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
           rate2 += rate;
           dist2 += dist;
           ++block_idx;
         }
       }
+      p->src.buf = src_buf_base;
+      pd->dst.buf = dst_buf_base;
 
       rate = rate2;
       dist = dist2;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index e153b2077..eababdbca 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -23,15 +23,14 @@ void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
                      const int16_t *round_ptr, const int16_t quant,
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                      const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
 
   if (!skip_block) {
-    const int rc = 0;
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-    int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
     tmp = (tmp * quant) >> 16;
     qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
@@ -45,15 +44,15 @@ void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
                            const int16_t *round_ptr, const int16_t quant,
                            int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
 
   if (!skip_block) {
-    const int rc = 0;
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-    int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
     tmp = (tmp * quant) >> 15;
     qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
@@ -354,6 +353,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   x->plane[0].quant_shift = quants->y_quant_shift[qindex];
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
+  x->plane[0].quant_thred[0] = cm->y_dequant[qindex][0] *
+                                  cm->y_dequant[qindex][0];
+  x->plane[0].quant_thred[1] = cm->y_dequant[qindex][1] *
+                                  cm->y_dequant[qindex][1];
   x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7);
   xd->plane[0].dequant = cm->y_dequant[qindex];
 
@@ -365,6 +368,11 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
+    // Skip thresholds: squared dequant steps of this uv plane, not luma's.
+    x->plane[i].quant_thred[0] = cm->uv_dequant[qindex][0] *
+                                    cm->uv_dequant[qindex][0];
+    x->plane[i].quant_thred[1] = cm->uv_dequant[qindex][1] *
+                                    cm->uv_dequant[qindex][1];
     x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7);
     xd->plane[i].dequant = cm->uv_dequant[qindex];
   }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5caafd370..cc55dd78f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -171,15 +171,27 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
   int64_t dist_sum = 0;
   const int ref = xd->mi[0]->mbmi.ref_frame[0];
   unsigned int sse;
+  const int shift = 8;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
 
-    (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
-                              pd->dst.buf, pd->dst.stride, &sse);
+    const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                                                pd->dst.buf, pd->dst.stride,
+                                                &sse);
 
+    if (!x->select_tx_size) {
+      if (sse < p->quant_thred[0] >> shift)
+        x->skip_txfm[i] = 1;
+      else if (var < p->quant_thred[1] >> shift)
+        x->skip_txfm[i] = 2;
+      else
+        x->skip_txfm[i] = 0;
+    }
+
+    x->bsse[i] = sse;
     if (i == 0)
       x->pred_sse[ref] = sse;
 
@@ -357,12 +369,32 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
   if (args->skip)
     return;
 
-  if (!is_inter_block(mbmi))
+  if (!is_inter_block(mbmi)) {
     vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
-  else
-    vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    dist_block(plane, block, tx_size, args);
+  } else {
+    if (x->skip_txfm[plane] == 0) {
+      // full forward transform and quantization
+      vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+      dist_block(plane, block, tx_size, args);
+    } else if (x->skip_txfm[plane] == 2) {
+      // DC-only transform/quant; dist is sse minus the energy the DC recovers
+      int16_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
+      int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+      vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+      args->sse  = x->bsse[plane] << 4;
+      args->dist = args->sse;
+      if (x->plane[plane].eobs[block])  // only a coded DC reduces distortion
+        args->dist = args->sse - (((int64_t)coeff[0] * coeff[0] -
+            (int64_t)(coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0])) >> 2);
+    } else {
+      // skip forward transform
+      x->plane[plane].eobs[block] = 0;
+      args->sse  = x->bsse[plane] << 4;
+      args->dist = args->sse;
+    }
+  }
 
-  dist_block(plane, block, tx_size, args);
   rate_block(plane, block, plane_bsize, tx_size, args);
   rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
   rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
@@ -2102,6 +2134,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int orig_dst_stride[MAX_MB_PLANE];
   int rs = 0;
   INTERP_FILTER best_filter = SWITCHABLE;
+  int skip_txfm[MAX_MB_PLANE] = {0};
+  int64_t bsse[MAX_MB_PLANE] = {0};
 
   int bsl = mi_width_log2_lookup[bsize];
   int pred_filter_search = cpi->sf.cb_pred_filter_search ?
@@ -2264,6 +2298,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           best_filter = mbmi->interp_filter;
           if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
             best_needs_copy = !best_needs_copy;
+          vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+          vpx_memcpy(bsse, x->bsse, sizeof(bsse));
         }
 
         if ((cm->interp_filter == SWITCHABLE && newbest) ||
@@ -2316,6 +2352,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                               disable_skip);
   }
 
+  vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
+  vpx_memcpy(x->bsse, bsse, sizeof(bsse));
+
   if (!x->skip) {
     int skippable_y, skippable_uv;
     int64_t sseuv = INT64_MAX;
diff --git a/vp9/encoder/vp9_write_bit_buffer.c b/vp9/encoder/vp9_write_bit_buffer.c
index 962d0ca56..6d55e84e8 100644
--- a/vp9/encoder/vp9_write_bit_buffer.c
+++ b/vp9/encoder/vp9_write_bit_buffer.c
@@ -8,9 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
 #include "vp9/encoder/vp9_write_bit_buffer.h"
 
-size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
+size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb) {
   return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
 }
 
diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h
index 073608d7f..59f9bbe30 100644
--- a/vp9/encoder/vp9_write_bit_buffer.h
+++ b/vp9/encoder/vp9_write_bit_buffer.h
@@ -11,8 +11,6 @@
 #ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
 #define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
 
-#include <limits.h>
-
 #include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
@@ -24,7 +22,7 @@ struct vp9_write_bit_buffer {
   size_t bit_offset;
 };
 
-size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb);
+size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb);
 
 void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit);
 
diff --git a/vp9/encoder/vp9_writer.c b/vp9/encoder/vp9_writer.c
index 8398fc07a..ff461f218 100644
--- a/vp9/encoder/vp9_writer.c
+++ b/vp9/encoder/vp9_writer.c
@@ -15,7 +15,6 @@
 void vp9_start_encode(vp9_writer *br, uint8_t *source) {
   br->lowvalue = 0;
   br->range    = 255;
-  br->value    = 0;
   br->count    = -24;
   br->buffer   = source;
   br->pos      = 0;
diff --git a/vp9/encoder/vp9_writer.h b/vp9/encoder/vp9_writer.h
index 7f4fa1ef2..9d161f95c 100644
--- a/vp9/encoder/vp9_writer.h
+++ b/vp9/encoder/vp9_writer.h
@@ -22,20 +22,15 @@ extern "C" {
 typedef struct {
   unsigned int lowvalue;
   unsigned int range;
-  unsigned int value;
   int count;
   unsigned int pos;
   uint8_t *buffer;
-
-  // Variables used to track bit costs without outputing to the bitstream
-  unsigned int  measure_cost;
-  uint64_t bit_counter;
 } vp9_writer;
 
 void vp9_start_encode(vp9_writer *bc, uint8_t *buffer);
 void vp9_stop_encode(vp9_writer *bc);
 
-static void vp9_write(vp9_writer *br, int bit, int probability) {
+static INLINE void vp9_write(vp9_writer *br, int bit, int probability) {
   unsigned int split;
   int count = br->count;
   unsigned int range = br->range;
@@ -83,11 +78,11 @@ static void vp9_write(vp9_writer *br, int bit, int probability) {
   br->range = range;
 }
 
-static void vp9_write_bit(vp9_writer *w, int bit) {
+static INLINE void vp9_write_bit(vp9_writer *w, int bit) {
   vp9_write(w, bit, 128);  // vp9_prob_half
 }
 
-static void vp9_write_literal(vp9_writer *w, int data, int bits) {
+static INLINE void vp9_write_literal(vp9_writer *w, int data, int bits) {
   int bit;
 
   for (bit = bits - 1; bit >= 0; bit--)
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 35ee1aee6..bf8eec717 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -808,7 +808,7 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
 }
 
 // vp9 uses 10,000,000 ticks/second as time stamp
-#define TICKS_PER_SEC 10000000
+#define TICKS_PER_SEC 10000000LL
 
 static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
                                        int64_t n) {
@@ -1325,9 +1325,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
       9999,               // kf_max_dist
 
       VPX_SS_DEFAULT_LAYERS,  // ss_number_layers
-#if CONFIG_SPATIAL_SVC
       {0},
-#endif
       {0},                    // ss_target_bitrate
       1,                      // ts_number_layers
       {0},                    // ts_target_bitrate