9 files changed, 149 insertions, 106 deletions
diff --git a/build/make/Makefile b/build/make/Makefile
index 4ac5bcf1f..de71c6133 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -103,6 +103,18 @@ test::
 .PHONY: testdata
 testdata::
 
+# Add compiler flags for intrinsic files
+$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
+$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 711d0bd45..9633ed756 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -56,7 +56,13 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
 
   void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
                  ::libvpx_test::MD5 *md5) {
-    dec->DecodeFrame((uint8_t *) pkt->data.frame.buf, pkt->data.frame.sz);
+    const vpx_codec_err_t res =
+        dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+                         pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(VPX_CODEC_OK, res);
+    }
     const vpx_image_t *img = dec->GetDxData().Next();
     md5->Add(img);
   }
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 7d1904aaf..cd091f39a 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -91,18 +91,8 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 
-# TODO(johann) make this generic
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
-endif
-
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/denoising_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/denoising_sse2.c.d: CFLAGS += -msse2
-endif
 endif
 
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 2f713d3ad..2457f79e1 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -1246,8 +1246,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
 
   setup_loopfilter(pc, xd, &header_bc);
 
-  vp9_read_literal(&header_bc, 2);  // unused
-
   setup_quantization(pbi, &header_bc);
 
   // Determine if the golden frame or ARF buffer should be updated and how.
@@ -1343,11 +1341,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   vp9_setup_block_dptrs(xd);
 
   // clear out the coeff buffer
-  vpx_memset(xd->plane[0].qcoeff, 0, sizeof(xd->plane[0].qcoeff));
-  vpx_memset(xd->plane[1].qcoeff, 0, sizeof(xd->plane[1].qcoeff));
-  vpx_memset(xd->plane[2].qcoeff, 0, sizeof(xd->plane[2].qcoeff));
-
-  vp9_read_bit(&header_bc);  // unused
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_zero(xd->plane[i].qcoeff);
 
   vp9_decode_mode_mvs_init(pbi, &header_bc);
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 7152ac955..0d5de648e 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1930,9 +1930,6 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
     }
   }
 
-  // TODO(jkoleszar): remove these unused bits
-  vp9_write_literal(&header_bc, 0, 2);
-
   // Frame Q baseline quantizer index
   vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
 
@@ -2178,9 +2175,6 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
   active_section = 2;
 #endif
 
-  // TODO(jkoleszar): remove this unused bit
-  vp9_write_bit(&header_bc, 1);
-
   vp9_update_skip_probs(cpi);
   for (i = 0; i < MBSKIP_CONTEXTS; ++i) {
     vp9_write_prob(&header_bc, pc->mbskip_pred_probs[i]);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f2cee7fff..dcbdef3ba 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -3089,6 +3089,93 @@ static enum BlockSize y_bsizet_to_block_size(BLOCK_SIZE_TYPE bs) {
   }
 }
 
+static enum BlockSize get_block_size(int bw, int bh) {
+#if CONFIG_SB8X8
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+#else
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+#endif
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+
+  assert(0);
+  return -1;
+}
+
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int *out_dist_sum) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse, var;
+  int i, rate_sum = 0, dist_sum = 0;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+
+    const int bwl = b_width_log2(bsize) - pd->subsampling_x;
+    const int bhl = b_height_log2(bsize) - pd->subsampling_y;
+    const enum BlockSize bs = get_block_size(4 << bwl, 4 << bhl);
+    int rate, dist;
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                             pd->dst.buf, pd->dst.stride, &sse);
+    model_rd_from_var_lapndz(var, 16 << (bwl + bhl),
+                             pd->dequant[1] >> 3, &rate, &dist);
+
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
                                  int mdcounts[4], int64_t txfm_cache[],
@@ -3283,76 +3370,40 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
   if (1) {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    for (switchable_filter_index = 0;
-         switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-         ++switchable_filter_index) {
+    int i, newbest;
+    int tmp_rate_sum = 0, tmp_dist_sum = 0;
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
       int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
+      const int is_intpel_interp = intpel_mv &&
+                                   vp9_is_interpolating_filter[filter];
+      mbmi->interp_filter = filter;
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+      if (cm->mcomp_filter_type == SWITCHABLE) {
         const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
         const int m = vp9_switchable_interp_map[mbmi->interp_filter];
         rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
       }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+
+      if (interpolating_intpel_seen && is_intpel_interp) {
+        rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
       } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
+        int rate_sum = 0, dist_sum = 0;
         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-        var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
-                                         x->plane[0].src.stride,
-                                         xd->plane[0].dst.buf,
-                                         xd->plane[0].dst.stride,
-                                         &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, MI_SIZE * bw * MI_SIZE * bh,
-                                 xd->plane[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[1].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[1].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[2].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[2].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
+        model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+        rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
+        if (!interpolating_intpel_seen && is_intpel_interp) {
+          tmp_rate_sum = rate_sum;
+          tmp_dist_sum = dist_sum;
         }
       }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      newbest = i == 0 || rd < best_rd;
       if (newbest) {
         best_rd = rd;
         *best_filter = mbmi->interp_filter;
       }
+
       if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
           (cm->mcomp_filter_type != SWITCHABLE &&
            cm->mcomp_filter_type == mbmi->interp_filter)) {
@@ -3367,21 +3418,18 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                      sizeof(unsigned char) * MI_UV_SIZE * bw);
         for (i = 0; i < MI_UV_SIZE * bh; ++i)
           vpx_memcpy(tmp_vbuf + i * MI_UV_SIZE * bw,
-                     xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
+                     xd->plane[2].dst.buf + i * xd->plane[2].dst.stride,
                      sizeof(unsigned char) * MI_UV_SIZE * bw);
         pred_exists = 1;
       }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
+      interpolating_intpel_seen |= is_intpel_interp;
     }
   }
 
   // Set the appripriate filter
-  if (cm->mcomp_filter_type != SWITCHABLE)
-    mbmi->interp_filter = cm->mcomp_filter_type;
-  else
-    mbmi->interp_filter = *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+  mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
+      cm->mcomp_filter_type : *best_filter;
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
   if (pred_exists) {
     // FIXME(rbultje): mb code still predicts into xd->predictor
@@ -3394,7 +3442,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                  tmp_ubuf + i * bw * MI_UV_SIZE,
                  sizeof(unsigned char) * bw * MI_UV_SIZE);
     for (i = 0; i < bh * MI_UV_SIZE; ++i)
-      vpx_memcpy(xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
+      vpx_memcpy(xd->plane[2].dst.buf + i * xd->plane[2].dst.stride,
                  tmp_vbuf + i * bw * MI_UV_SIZE,
                  sizeof(unsigned char) * bw * MI_UV_SIZE);
   } else {
@@ -4756,6 +4804,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+
+    // TODO(jingning): scaling not supported in SPLITMV mode.
+    if (mbmi->ref_frame > 0 &&
+          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode == SPLITMV)
+      continue;
+
+    if (mbmi->second_ref_frame > 0 &&
+          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode == SPLITMV)
+      continue;
+
+
     set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                       scale_factor);
     comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 1ddd4f057..cbe3aa367 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -113,14 +113,6 @@ endif
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_idct_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_idct_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c))
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 39f836fd4..42ab02d31 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -95,10 +95,5 @@ VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/encoder/x86/vp9_dct_sse2.c.d: CFLAGS += -msse2
-vp9/encoder/x86/vp9_dct_sse2.c.o: CFLAGS += -msse2
-endif
-
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index babdebb86..72cdfebf4 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -38,10 +38,6 @@ VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c
 
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/decoder/x86/vp9_dequantize_sse2.c.o: CFLAGS += -msse2
-vp9/decoder/x86/vp9_dequantize_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))