45 files changed, 987 insertions, 456 deletions
diff --git a/configure b/configure
index 428fba343..5fbd2d002 100755
--- a/configure
+++ b/configure
@@ -35,6 +35,9 @@ Advanced options:
   ${toggle_debug_libs}            in/exclude debug version of libraries
   ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
   ${toggle_vp9_highbitdepth}      use VP9 high bit depth (10/12) profiles
+  ${toggle_better_hw_compatibility}
+                                  enable encoder to produce streams with better
+                                  hardware decoder compatibility
   ${toggle_vp8}                   VP8 codec support
   ${toggle_vp9}                   VP9 codec support
   ${toggle_vp10}                  VP10 codec support
@@ -320,6 +323,7 @@ CONFIG_LIST="
     vp9_temporal_denoising
     coefficient_range_checking
     vp9_highbitdepth
+    better_hw_compatibility
     experimental
     size_limit
     ${EXPERIMENT_LIST}
@@ -378,6 +382,7 @@ CMDLINE_SELECT="
     temporal_denoising
     vp9_temporal_denoising
     coefficient_range_checking
+    better_hw_compatibility
     vp9_highbitdepth
     experimental
 "
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 9dec3cb49..a144cfce7 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -702,18 +702,6 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
 
 #if HAVE_SSE
 #if CONFIG_USE_X86INC
-const SadMxNParam sse_tests[] = {
-  make_tuple(4, 8, &vpx_sad4x8_sse, -1),
-  make_tuple(4, 4, &vpx_sad4x4_sse, -1),
-};
-INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::ValuesIn(sse_tests));
-
-const SadMxNAvgParam avg_sse_tests[] = {
-  make_tuple(4, 8, &vpx_sad4x8_avg_sse, -1),
-  make_tuple(4, 4, &vpx_sad4x4_avg_sse, -1),
-};
-INSTANTIATE_TEST_CASE_P(SSE, SADavgTest, ::testing::ValuesIn(avg_sse_tests));
-
 const SadMxNx4Param x4d_sse_tests[] = {
   make_tuple(4, 8, &vpx_sad4x8x4d_sse, -1),
   make_tuple(4, 4, &vpx_sad4x4x4d_sse, -1),
@@ -736,6 +724,8 @@ const SadMxNParam sse2_tests[] = {
   make_tuple(8, 16, &vpx_sad8x16_sse2, -1),
   make_tuple(8, 8, &vpx_sad8x8_sse2, -1),
   make_tuple(8, 4, &vpx_sad8x4_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4_sse2, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
   make_tuple(64, 64, &vpx_highbd_sad64x64_sse2, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_sse2, 8),
@@ -786,6 +776,8 @@ const SadMxNAvgParam avg_sse2_tests[] = {
   make_tuple(8, 16, &vpx_sad8x16_avg_sse2, -1),
   make_tuple(8, 8, &vpx_sad8x8_avg_sse2, -1),
   make_tuple(8, 4, &vpx_sad8x4_avg_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8_avg_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4_avg_sse2, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_sse2, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_sse2, 8),
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index ad3327e2d..e6198afbd 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -132,7 +132,6 @@ using std::tr1::make_tuple;
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 #if CONFIG_USE_X86INC
-#if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -141,13 +140,13 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                                        &vpx_highbd_tm_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 8),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 8),
@@ -155,34 +154,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                                        &vpx_highbd_v_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
-                        ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
-                                       &vpx_highbd_dc_predictor_4x4_c, 4, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                                       &vpx_highbd_dc_predictor_8x8_c, 8, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                                       &vpx_highbd_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
-                                       &vpx_highbd_v_predictor_4x4_c, 4, 8),
-                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                                       &vpx_highbd_v_predictor_8x8_c, 8, 8),
-                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                                       &vpx_highbd_v_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                                       &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
-                                       &vpx_highbd_tm_predictor_4x4_c, 4, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                                       &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
-#endif  // !ARCH_X86_64
 
-#if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -194,14 +170,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 10),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        10),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 10),
@@ -211,35 +187,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
-                                       &vpx_highbd_tm_predictor_4x4_c, 4, 10),
-                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                                       &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
-                        ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
-                                       &vpx_highbd_dc_predictor_4x4_c, 4, 10),
-                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                                       &vpx_highbd_dc_predictor_8x8_c, 8, 10),
-                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                                       &vpx_highbd_dc_predictor_16x16_c, 16,
-                                       10),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
-                                       &vpx_highbd_v_predictor_4x4_c, 4, 10),
-                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                                       &vpx_highbd_v_predictor_8x8_c, 8, 10),
-                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                                       &vpx_highbd_v_predictor_16x16_c, 16, 10),
-                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                                       &vpx_highbd_v_predictor_32x32_c, 32, 10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
-#endif  // !ARCH_X86_64
 
-#if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -251,14 +203,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 12),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        12),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 12),
@@ -268,33 +220,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
-                        ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
-                                       &vpx_highbd_dc_predictor_4x4_c, 4, 12),
-                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                                       &vpx_highbd_dc_predictor_8x8_c, 8, 12),
-                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                                       &vpx_highbd_dc_predictor_16x16_c, 16,
-                                       12),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
-                                       &vpx_highbd_v_predictor_4x4_c, 4, 12),
-                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                                       &vpx_highbd_v_predictor_8x8_c, 8, 12),
-                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                                       &vpx_highbd_v_predictor_16x16_c, 16, 12),
-                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                                       &vpx_highbd_v_predictor_32x32_c, 32, 12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
-                                       &vpx_highbd_tm_predictor_4x4_c, 4, 12),
-                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                                       &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
-#endif  // !ARCH_X86_64
+
 #endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 2f10378a6..fce176796 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -82,6 +82,7 @@ typedef struct {
   // Only for INTER blocks
   INTERP_FILTER interp_filter;
   MV_REFERENCE_FRAME ref_frame[2];
+  TX_TYPE tx_type;
 
   // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
   int_mv mv[2];
@@ -207,7 +208,7 @@ static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
   return subsize_lookup[partition][bsize];
 }
 
-static const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
+static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
   DCT_DCT,    // DC
   ADST_DCT,   // V
   DCT_ADST,   // H
@@ -225,11 +226,12 @@ static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd,
   const MODE_INFO *const mi = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
+  (void) block_idx;
   if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] ||
-      is_inter_block(mbmi) || mbmi->tx_size >= TX_32X32)
+      mbmi->tx_size >= TX_32X32)
     return DCT_DCT;
 
-  return intra_mode_to_tx_type_lookup[get_y_mode(mi, block_idx)];
+  return mbmi->tx_type;
 }
 
 void vp10_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
diff --git a/vp10/common/entropy.h b/vp10/common/entropy.h
index 2f93cb31c..9a471c818 100644
--- a/vp10/common/entropy.h
+++ b/vp10/common/entropy.h
@@ -21,7 +21,8 @@
 extern "C" {
 #endif
 
-#define DIFF_UPDATE_PROB 252
+#define DIFF_UPDATE_PROB        252
+#define GROUP_DIFF_UPDATE_PROB  252  // probability for a group's update flag
 
 // Coefficient token alphabet
 #define ZERO_TOKEN      0   // 0     Extra Bits 0+0
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index 2bb292a6b..78f3650f8 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -326,6 +326,26 @@ static const struct segmentation_probs default_seg_probs = {
 };
 #endif
 
+const vpx_tree_index vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)] = {
+  -DCT_DCT, 2,           // pair 0: DCT_DCT, or continue at entry 2
+  -ADST_ADST, 4,         // pair 1: ADST_ADST, or continue at entry 4
+  -ADST_DCT, -DCT_ADST   // pair 2: ADST_DCT or DCT_ADST
+};  // NOTE(review): assumes negated entry = leaf, positive = index; confirm
+
+static const vpx_prob default_intra_ext_tx_prob[EXT_TX_SIZES]
+                                               [TX_TYPES][TX_TYPES - 1] = {
+  {{240, 85, 128}, {4, 1, 248}, {4, 1, 8}, {4, 248, 128}},     // tx_size 0
+  {{244, 85, 128}, {8, 2, 248}, {8, 2, 8}, {8, 248, 128}},     // tx_size 1
+  {{248, 85, 128}, {16, 4, 248}, {16, 4, 8}, {16, 248, 128}},  // tx_size 2
+};  // [tx_size][tx type implied by intra mode][ext-tx tree probability]
+
+static const vpx_prob default_inter_ext_tx_prob[EXT_TX_SIZES]
+                                               [TX_TYPES - 1] = {
+  {160, 85, 128},  // tx_size 0
+  {176, 85, 128},  // tx_size 1
+  {192, 85, 128},  // tx_size 2
+};  // [tx_size][ext-tx tree probability]; inter blocks use no mode context
+
 static void init_mode_probs(FRAME_CONTEXT *fc) {
   vp10_copy(fc->uv_mode_prob, default_uv_probs);
   vp10_copy(fc->y_mode_prob, default_if_y_probs);
@@ -342,6 +362,8 @@ static void init_mode_probs(FRAME_CONTEXT *fc) {
   vp10_copy(fc->seg.tree_probs, default_seg_probs.tree_probs);
   vp10_copy(fc->seg.pred_probs, default_seg_probs.pred_probs);
 #endif
+  vp10_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob);
+  vp10_copy(fc->inter_ext_tx_prob, default_inter_ext_tx_prob);
 }
 
 const vpx_tree_index vp10_switchable_interp_tree
@@ -431,6 +453,21 @@ void vp10_adapt_intra_frame_probs(VP10_COMMON *cm) {
     fc->skip_probs[i] = mode_mv_merge_probs(
         pre_fc->skip_probs[i], counts->skip[i]);
 
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+    int j;
+    for (j = 0; j < TX_TYPES; ++j)
+      vpx_tree_merge_probs(vp10_ext_tx_tree,
+                           pre_fc->intra_ext_tx_prob[i][j],
+                           counts->intra_ext_tx[i][j],
+                           fc->intra_ext_tx_prob[i][j]);
+  }
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+    vpx_tree_merge_probs(vp10_ext_tx_tree,
+                         pre_fc->inter_ext_tx_prob[i],
+                         counts->inter_ext_tx[i],
+                         fc->inter_ext_tx_prob[i]);
+  }
+
 #if CONFIG_MISC_FIXES
   if (cm->seg.temporal_update) {
     for (i = 0; i < PREDICTION_PROBS; i++)
diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index 42fd9207f..611d3ad13 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h
@@ -66,6 +66,8 @@ typedef struct frame_contexts {
 #if CONFIG_MISC_FIXES
   struct segmentation_probs seg;
 #endif
+  vpx_prob intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1];
+  vpx_prob inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1];
   int initialized;
 } FRAME_CONTEXT;
 
@@ -90,6 +92,8 @@ typedef struct FRAME_COUNTS {
 #if CONFIG_MISC_FIXES
   struct seg_counts seg;
 #endif
+  unsigned int intra_ext_tx[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+  unsigned int inter_ext_tx[EXT_TX_SIZES][TX_TYPES];
 } FRAME_COUNTS;
 
 extern const vpx_prob vp10_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
@@ -119,6 +123,9 @@ void vp10_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
 void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
                                     unsigned int (*ct_8x8p)[2]);
 
+extern const vpx_tree_index
+    vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)];
+
 static INLINE int vp10_ceil_log2(int n) {
   int i = 1, p = 2;
   while (p < n) {
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index a226a2d69..18c7d1629 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -97,6 +97,8 @@ typedef enum {
   TX_TYPES = 4
 } TX_TYPE;
 
+#define EXT_TX_SIZES       3  // number of sizes that use extended transforms
+
 typedef enum {
   VP9_LAST_FLAG = 1 << 0,
   VP9_GOLD_FLAG = 1 << 1,
diff --git a/vp10/common/thread_common.c b/vp10/common/thread_common.c
index e83cb8e67..0c7a1c22a 100644
--- a/vp10/common/thread_common.c
+++ b/vp10/common/thread_common.c
@@ -435,6 +435,17 @@ void vp10_accumulate_frame_counts(VP10_COMMON *cm, FRAME_COUNTS *counts,
       comps->fp[i] += comps_t->fp[i];
   }
 
+  for (i = 0; i < EXT_TX_SIZES; i++) {
+    int j;
+    for (j = 0; j < TX_TYPES; ++j)
+      for (k = 0; k < TX_TYPES; k++)
+        cm->counts.intra_ext_tx[i][j][k] += counts->intra_ext_tx[i][j][k];
+  }
+  for (i = 0; i < EXT_TX_SIZES; i++) {
+    for (k = 0; k < TX_TYPES; k++)
+      cm->counts.inter_ext_tx[i][k] += counts->inter_ext_tx[i][k];
+  }
+
 #if CONFIG_MISC_FIXES
   for (i = 0; i < PREDICTION_PROBS; i++)
     for (j = 0; j < 2; j++)
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index c0fbc4949..1c3f18239 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -268,7 +268,7 @@ static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
     if (eob == 1) {
       dqcoeff[0] = 0;
     } else {
-      if (tx_size <= TX_16X16 && eob <= 10)
+      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
         memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
       else if (tx_size == TX_32X32 && eob <= 34)
         memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
@@ -2109,8 +2109,8 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
 
   setup_segmentation_dequant(cm);
 #if CONFIG_MISC_FIXES
-  cm->tx_mode = (xd->lossless[0]) ? ONLY_4X4
-                                  : read_tx_mode(rb);
+  cm->tx_mode = (!cm->seg.enabled && xd->lossless[0]) ? ONLY_4X4
+                                                      : read_tx_mode(rb);
   cm->reference_mode = read_frame_reference_mode(cm, rb);
 #endif
 
@@ -2124,6 +2124,23 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
   return sz;
 }
 
+// Parses the ext-tx probability deltas: one flag-gated group for intra
+// blocks, followed by one flag-gated group for inter blocks.
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+  int size, ctx, node;
+  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+    for (size = TX_4X4; size < EXT_TX_SIZES; ++size)
+      for (ctx = 0; ctx < TX_TYPES; ++ctx)
+        for (node = 0; node < TX_TYPES - 1; ++node)
+          vp10_diff_update_prob(r, &fc->intra_ext_tx_prob[size][ctx][node]);
+  }
+  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+    for (size = TX_4X4; size < EXT_TX_SIZES; ++size)
+      for (node = 0; node < TX_TYPES - 1; ++node)
+        vp10_diff_update_prob(r, &fc->inter_ext_tx_prob[size][node]);
+  }
+}
+
 static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
                                   size_t partition_size) {
   VP10_COMMON *const cm = &pbi->common;
@@ -2205,6 +2222,7 @@ static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
 #endif
 
     read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
+    read_ext_tx_probs(fc, &r);
   }
 
   return vpx_reader_has_error(&r);
@@ -2245,6 +2263,10 @@ static void debug_check_frame_counts(const VP10_COMMON *const cm) {
   assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
   assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
   assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
+  assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
+                 sizeof(cm->counts.intra_ext_tx)));
+  assert(!memcmp(cm->counts.inter_ext_tx, zero_counts.inter_ext_tx,
+                 sizeof(cm->counts.inter_ext_tx)));
 }
 #endif  // NDEBUG
 
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index b516333e0..a28ae5592 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -100,6 +100,8 @@ static TX_SIZE read_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
   TX_MODE tx_mode = cm->tx_mode;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  if (xd->lossless[xd->mi[0]->mbmi.segment_id])
+    return TX_4X4;
   if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
     return read_selected_tx_size(cm, xd, max_tx_size, r);
   else
@@ -294,6 +296,20 @@ static void read_intra_frame_mode_info(VP10_COMMON *const cm,
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+
+  if (mbmi->tx_size < TX_32X32 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    FRAME_COUNTS *counts = xd->counts;
+    TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+    mbmi->tx_type = vpx_read_tree(
+        r, vp10_ext_tx_tree,
+        cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
+    if (counts)
+      ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
+  } else {
+    mbmi->tx_type = DCT_DCT;
+  }
 }
 
 static int read_mv_component(vpx_reader *r,
@@ -650,6 +666,28 @@ static void read_inter_frame_mode_info(VP10Decoder *const pbi,
     read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r);
   else
     read_intra_block_mode_info(cm, xd, mi, r);
+
+  if (mbmi->tx_size < TX_32X32 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    FRAME_COUNTS *counts = xd->counts;
+    if (inter_block) {
+      mbmi->tx_type = vpx_read_tree(
+          r, vp10_ext_tx_tree,
+          cm->fc->inter_ext_tx_prob[mbmi->tx_size]);
+      if (counts)
+        ++counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
+    } else {
+      const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+      mbmi->tx_type = vpx_read_tree(
+          r, vp10_ext_tx_tree,
+          cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
+      if (counts)
+        ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
+    }
+  } else {
+    mbmi->tx_type = DCT_DCT;
+  }
 }
 
 void vp10_read_mode_info(VP10Decoder *const pbi, MACROBLOCKD *xd,
diff --git a/vp10/encoder/aq_complexity.c b/vp10/encoder/aq_complexity.c
index 0de044cf9..2506a4e55 100644
--- a/vp10/encoder/aq_complexity.c
+++ b/vp10/encoder/aq_complexity.c
@@ -51,7 +51,7 @@ void vp10_setup_in_frame_q_adj(VP10_COMP *cpi) {
   // Make SURE use of floating point in this function is safe.
   vpx_clear_system_state();
 
-  if (cm->frame_type == KEY_FRAME ||
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     int segment;
diff --git a/vp10/encoder/aq_variance.c b/vp10/encoder/aq_variance.c
index e8e88c3b6..bed5162fb 100644
--- a/vp10/encoder/aq_variance.c
+++ b/vp10/encoder/aq_variance.c
@@ -47,7 +47,7 @@ void vp10_vaq_frame_setup(VP10_COMP *cpi) {
   struct segmentation *seg = &cm->seg;
   int i;
 
-  if (cm->frame_type == KEY_FRAME ||
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     vp10_enable_segmentation(seg);
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 361ac9962..ede8bb370 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -58,6 +58,12 @@ static INLINE void write_uniform(vpx_writer *w, int n, int v) {
   }
 }
 
+static struct vp10_token ext_tx_encodings[TX_TYPES];
+
+void vp10_encode_token_init(void) {  // fills ext_tx_encodings[] from the tree
+  vp10_tokens_from_tree(ext_tx_encodings, vp10_ext_tx_tree);
+}
+
 static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
                              const vpx_prob *probs) {
   vp10_write_token(w, vp10_intra_mode_tree, probs, &intra_mode_encodings[mode]);
@@ -90,6 +96,24 @@ static void prob_diff_update(const vpx_tree_index *tree,
     vp10_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
 }
 
+// Adds up vp10_cond_prob_diff_update_savings() over the first n - 1 entries
+// of |probs|, using branch counts derived from |counts|. |counts| holds one
+// entry per token (n in total); the fixed scratch array limits n to 32.
+static int prob_diff_update_savings(const vpx_tree_index *tree,
+                                    vpx_prob probs[/*n - 1*/],
+                                    const unsigned int counts[/*n*/],
+                                    int n) {
+  unsigned int branch[32][2];
+  int total = 0;
+  int k;
+
+  assert(n <= 32);
+  vp10_tree_probs_from_distribution(tree, branch, counts);
+  for (k = 0; k < n - 1; ++k)
+    total += vp10_cond_prob_diff_update_savings(&probs[k], branch[k]);
+  return total;
+}
+
 static void write_selected_tx_size(const VP10_COMMON *cm,
                                    const MACROBLOCKD *xd, vpx_writer *w) {
   TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
@@ -133,6 +157,49 @@ static void update_switchable_interp_probs(VP10_COMMON *cm, vpx_writer *w,
                      counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
+// Writes the ext-tx probability updates: the intra group first, then the
+// inter group. Each group starts with one flag coded at
+// GROUP_DIFF_UPDATE_PROB; per-node updates follow only when the group's
+// summed savings exceed the extra cost of coding that flag as 1.
+static void update_ext_tx_probs(VP10_COMMON *cm, vpx_writer *w) {
+  const int flag_overhead = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
+                            vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
+  int size, ctx;
+  int gain = 0;
+  int send;
+
+  // Intra group: one probability set per (tx size, nominal tx type).
+  for (size = TX_4X4; size < EXT_TX_SIZES; ++size)
+    for (ctx = 0; ctx < TX_TYPES; ++ctx)
+      gain += prob_diff_update_savings(vp10_ext_tx_tree,
+                                       cm->fc->intra_ext_tx_prob[size][ctx],
+                                       cm->counts.intra_ext_tx[size][ctx],
+                                       TX_TYPES);
+  send = gain > flag_overhead;
+  vpx_write(w, send, GROUP_DIFF_UPDATE_PROB);
+  if (send) {
+    for (size = TX_4X4; size < EXT_TX_SIZES; ++size)
+      for (ctx = 0; ctx < TX_TYPES; ++ctx)
+        prob_diff_update(vp10_ext_tx_tree,
+                         cm->fc->intra_ext_tx_prob[size][ctx],
+                         cm->counts.intra_ext_tx[size][ctx], TX_TYPES, w);
+  }
+
+  // Inter group: one probability set per tx size.
+  gain = 0;
+  for (size = TX_4X4; size < EXT_TX_SIZES; ++size)
+    gain += prob_diff_update_savings(vp10_ext_tx_tree,
+                                     cm->fc->inter_ext_tx_prob[size],
+                                     cm->counts.inter_ext_tx[size], TX_TYPES);
+  send = gain > flag_overhead;
+  vpx_write(w, send, GROUP_DIFF_UPDATE_PROB);
+  if (send) {
+    for (size = TX_4X4; size < EXT_TX_SIZES; ++size)
+      prob_diff_update(vp10_ext_tx_tree, cm->fc->inter_ext_tx_prob[size],
+                       cm->counts.inter_ext_tx[size], TX_TYPES, w);
+  }
+}
+
 static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
@@ -303,7 +370,7 @@ static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
     vpx_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
-      !(is_inter && skip)) {
+      !(is_inter && skip) && !xd->lossless[segment_id]) {
     write_selected_tx_size(cm, xd, w);
   }
 
@@ -370,6 +437,25 @@ static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
       }
     }
   }
+  if (mbmi->tx_size < TX_32X32 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    if (is_inter) {
+      vp10_write_token(
+          w, vp10_ext_tx_tree,
+          cm->fc->inter_ext_tx_prob[mbmi->tx_size],
+          &ext_tx_encodings[mbmi->tx_type]);
+    } else {
+      vp10_write_token(
+          w, vp10_ext_tx_tree,
+          cm->fc->intra_ext_tx_prob[mbmi->tx_size]
+                                   [intra_mode_to_tx_type_context[mbmi->mode]],
+          &ext_tx_encodings[mbmi->tx_type]);
+    }
+  } else {
+    if (!mbmi->skip)
+      assert(mbmi->tx_type == DCT_DCT);
+  }
 }
 
 static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
@@ -391,7 +477,8 @@ static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
 
   write_skip(cm, xd, mbmi->segment_id, mi, w);
 
-  if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
+  if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+      !xd->lossless[mbmi->segment_id])
     write_selected_tx_size(cm, xd, w);
 
   if (bsize >= BLOCK_8X8) {
@@ -412,6 +499,16 @@ static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
   }
 
   write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]);
+
+  if (mbmi->tx_size < TX_32X32 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    vp10_write_token(
+        w, vp10_ext_tx_tree,
+        cm->fc->intra_ext_tx_prob[mbmi->tx_size]
+                                 [intra_mode_to_tx_type_context[mbmi->mode]],
+        &ext_tx_encodings[mbmi->tx_type]);
+  }
 }
 
 static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
@@ -1260,7 +1357,7 @@ static void write_uncompressed_header(VP10_COMP *cpi,
   encode_quantization(cm, wb);
   encode_segmentation(cm, xd, wb);
 #if CONFIG_MISC_FIXES
-  if (xd->lossless[0])
+  if (!cm->seg.enabled && xd->lossless[0])
     cm->tx_mode = TX_4X4;
   else
     write_txfm_mode(cm->tx_mode, wb);
@@ -1380,6 +1477,7 @@ static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
 
     vp10_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
                         &counts->mv);
+    update_ext_tx_probs(cm, &header_bc);
   }
 
   vpx_stop_encode(&header_bc);
diff --git a/vp10/encoder/bitstream.h b/vp10/encoder/bitstream.h
index aa0ed2fdf..b1da89f1d 100644
--- a/vp10/encoder/bitstream.h
+++ b/vp10/encoder/bitstream.h
@@ -18,6 +18,7 @@ extern "C" {
 
 #include "vp10/encoder/encoder.h"
 
+void vp10_encode_token_init(void);
 void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size);
 
 static INLINE int vp10_preserve_existing_gf(VP10_COMP *cpi) {
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 9381b653d..26ce5a1eb 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -3024,5 +3024,16 @@ static void encode_superblock(VP10_COMP *cpi, ThreadData *td,
     }
     ++td->counts->tx.tx_totals[mbmi->tx_size];
     ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
+    if (mbmi->tx_size < TX_32X32 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      if (is_inter_block(mbmi)) {
+        ++td->counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
+      } else {
+        ++td->counts->intra_ext_tx[mbmi->tx_size]
+                                  [intra_mode_to_tx_type_context[mbmi->mode]]
+                                  [mbmi->tx_type];
+      }
+    }
   }
 }
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 175c6d855..9e3bec40e 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -328,6 +328,7 @@ void vp10_initialize_enc(void) {
     vp10_rc_init_minq_luts();
     vp10_entropy_mv_init();
     vp10_temporal_filter_init();
+    vp10_encode_token_init();
     init_done = 1;
   }
 }
@@ -2654,7 +2655,7 @@ static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) {
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
   struct loopfilter *lf = &cm->lf;
   if (is_lossless_requested(&cpi->oxcf)) {
-      lf->filter_level = 0;
+    lf->filter_level = 0;
   } else {
     struct vpx_usec_timer timer;
 
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 2a44e4744..bd6a00932 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -467,6 +467,8 @@ typedef struct VP10_COMP {
   int multi_arf_enabled;
   int multi_arf_last_grp_enabled;
 
+  int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+  int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
 #if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_DENOISER denoiser;
 #endif
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index 5623a7202..f4fdb2417 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -83,6 +83,18 @@ static void fill_mode_costs(VP10_COMP *cpi) {
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
     vp10_cost_tokens(cpi->switchable_interp_costs[i],
                     fc->switchable_interp_prob[i], vp10_switchable_interp_tree);
+
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+    for (j = 0; j < TX_TYPES; ++j)
+      vp10_cost_tokens(cpi->intra_tx_type_costs[i][j],
+                       fc->intra_ext_tx_prob[i][j],
+                       vp10_ext_tx_tree);
+  }
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+    vp10_cost_tokens(cpi->inter_tx_type_costs[i],
+                     fc->inter_ext_tx_prob[i],
+                     vp10_ext_tx_tree);
+  }
 }
 
 static void fill_token_costs(vp10_coeff_cost *c,
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index bbddc1d29..90a716d2c 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -54,6 +54,8 @@
 #define MIN_EARLY_TERM_INDEX    3
 #define NEW_MV_DISCOUNT_FACTOR  8
 
+const double ext_tx_th = 0.99;
+
 typedef struct {
   PREDICTION_MODE mode;
   MV_REFERENCE_FRAME ref_frame[2];
@@ -598,11 +600,56 @@ static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  int r, s;
+  int64_t d, psse, this_rd, best_rd = INT64_MAX;
+  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
+  int  s0 = vp10_cost_bit(skip_prob, 0);
+  int  s1 = vp10_cost_bit(skip_prob, 1);
+  const int is_inter = is_inter_block(mbmi);
+
   mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
+  if (mbmi->tx_size < TX_32X32 &&
+      !xd->lossless[mbmi->segment_id]) {
+    for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      mbmi->tx_type = tx_type;
+      txfm_rd_in_plane(x, &r, &d, &s,
+                       &psse, ref_best_rd, 0, bs, mbmi->tx_size,
+                       cpi->sf.use_fast_coef_costing);
+      if (r == INT_MAX)
+        continue;
+      if (is_inter)
+        r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+      else
+        r += cpi->intra_tx_type_costs[mbmi->tx_size]
+                                     [intra_mode_to_tx_type_context[mbmi->mode]]
+                                     [mbmi->tx_type];
+      if (s)
+        this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse);
+      else
+        this_rd = RDCOST(x->rdmult, x->rddiv, r + s0, d);
+      if (is_inter && !xd->lossless[mbmi->segment_id] && !s)
+        this_rd = VPXMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, psse));
 
+      if (this_rd < ((best_tx_type == DCT_DCT) ? ext_tx_th : 1) * best_rd) {
+        best_rd = this_rd;
+        best_tx_type = mbmi->tx_type;
+      }
+    }
+  }
+  mbmi->tx_type = best_tx_type;
   txfm_rd_in_plane(x, rate, distortion, skip,
                    sse, ref_best_rd, 0, bs,
                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+  if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id] &&
+      *rate != INT_MAX) {
+    if (is_inter)
+      *rate += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+    else
+      *rate += cpi->intra_tx_type_costs[mbmi->tx_size]
+          [intra_mode_to_tx_type_context[mbmi->mode]]
+          [mbmi->tx_type];
+  }
 }
 
 static void choose_smallest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
@@ -632,87 +679,115 @@ static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
-  int r[TX_SIZES][2], s[TX_SIZES];
-  int64_t d[TX_SIZES], sse[TX_SIZES];
-  int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
-                             {INT64_MAX, INT64_MAX},
-                             {INT64_MAX, INT64_MAX},
-                             {INT64_MAX, INT64_MAX}};
+  int r, s;
+  int64_t d, sse;
+  int64_t rd = INT64_MAX;
   int n, m;
   int s0, s1;
-  int64_t best_rd = INT64_MAX;
+  int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
   TX_SIZE best_tx = max_tx_size;
   int start_tx, end_tx;
+  const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  const int is_inter = is_inter_block(mbmi);
 
   const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
   assert(skip_prob > 0);
   s0 = vp10_cost_bit(skip_prob, 0);
   s1 = vp10_cost_bit(skip_prob, 1);
 
-  if (cm->tx_mode == TX_MODE_SELECT) {
+  if (tx_select) {
     start_tx = max_tx_size;
     end_tx = 0;
   } else {
-    TX_SIZE chosen_tx_size = VPXMIN(max_tx_size,
-                                    tx_mode_to_biggest_tx_size[cm->tx_mode]);
+    const TX_SIZE chosen_tx_size =
+        VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]);
     start_tx = chosen_tx_size;
     end_tx = chosen_tx_size;
   }
 
-  for (n = start_tx; n >= end_tx; n--) {
-    int r_tx_size = 0;
-    for (m = 0; m <= n - (n == (int) max_tx_size); m++) {
-      if (m == n)
-        r_tx_size += vp10_cost_zero(tx_probs[m]);
-      else
-        r_tx_size += vp10_cost_one(tx_probs[m]);
-    }
-    txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
-                     &sse[n], ref_best_rd, 0, bs, n,
-                     cpi->sf.use_fast_coef_costing);
-    r[n][1] = r[n][0];
-    if (r[n][0] < INT_MAX) {
-      r[n][1] += r_tx_size;
-    }
-    if (d[n] == INT64_MAX || r[n][0] == INT_MAX) {
-      rd[n][0] = rd[n][1] = INT64_MAX;
-    } else if (s[n]) {
-      if (is_inter_block(mbmi)) {
-        rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
-        r[n][1] -= r_tx_size;
-      } else {
-        rd[n][0] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
-        rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size, sse[n]);
+  *distortion = INT64_MAX;
+  *rate       = INT_MAX;
+  *skip       = 0;
+  *psse       = INT64_MAX;
+
+  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+    last_rd = INT64_MAX;
+    for (n = start_tx; n >= end_tx; --n) {
+      int r_tx_size = 0;
+      for (m = 0; m <= n - (n == (int) max_tx_size); ++m) {
+        if (m == n)
+          r_tx_size += vp10_cost_zero(tx_probs[m]);
+        else
+          r_tx_size += vp10_cost_one(tx_probs[m]);
       }
-    } else {
-      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
-      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
-    }
 
-    if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] &&
-        !s[n] && sse[n] != INT64_MAX) {
-      rd[n][0] = VPXMIN(rd[n][0], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
-      rd[n][1] = VPXMIN(rd[n][1], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
-    }
+      if (n >= TX_32X32 && tx_type != DCT_DCT) {
+        continue;
+      }
+      mbmi->tx_type = tx_type;
+      txfm_rd_in_plane(x, &r, &d, &s,
+                       &sse, ref_best_rd, 0, bs, n,
+                       cpi->sf.use_fast_coef_costing);
+      if (n < TX_32X32 &&
+          !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+          r != INT_MAX) {
+        if (is_inter)
+          r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+        else
+          r += cpi->intra_tx_type_costs[mbmi->tx_size]
+              [intra_mode_to_tx_type_context[mbmi->mode]]
+              [mbmi->tx_type];
+      }
 
-    // Early termination in transform size search.
-    if (cpi->sf.tx_size_search_breakout &&
-        (rd[n][1] == INT64_MAX ||
-        (n < (int) max_tx_size && rd[n][1] > rd[n + 1][1]) ||
-        s[n] == 1))
-      break;
+      if (r == INT_MAX)
+        continue;
 
-    if (rd[n][1] < best_rd) {
-      best_tx = n;
-      best_rd = rd[n][1];
+      if (s) {
+        if (is_inter) {
+          rd = RDCOST(x->rdmult, x->rddiv, s1, sse);
+        } else {
+          rd =  RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, sse);
+        }
+      } else {
+        rd = RDCOST(x->rdmult, x->rddiv, r + s0 + r_tx_size * tx_select, d);
+      }
+
+      if (tx_select && !(s && is_inter))
+        r += r_tx_size;
+
+      if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !s)
+        rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, sse));
+
+      // Early termination in transform size search.
+      if (cpi->sf.tx_size_search_breakout &&
+          (rd == INT64_MAX ||
+           (s == 1 && tx_type != DCT_DCT && n < start_tx) ||
+           (n < (int) max_tx_size && rd > last_rd)))
+        break;
+
+      last_rd = rd;
+      if (rd <
+          (is_inter && best_tx_type == DCT_DCT ? ext_tx_th : 1) *
+          best_rd) {
+        best_tx = n;
+        best_rd = rd;
+        *distortion = d;
+        *rate       = r;
+        *skip       = s;
+        *psse       = sse;
+        best_tx_type = mbmi->tx_type;
+      }
     }
   }
-  mbmi->tx_size = best_tx;
 
-  *distortion = d[mbmi->tx_size];
-  *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
-  *skip       = s[mbmi->tx_size];
-  *psse       = sse[mbmi->tx_size];
+  mbmi->tx_size = best_tx;
+  mbmi->tx_type = best_tx_type;
+  if (mbmi->tx_size >= TX_32X32)
+    assert(mbmi->tx_type == DCT_DCT);
+  txfm_rd_in_plane(x, &r, &d, &s,
+                   &sse, ref_best_rd, 0, bs, best_tx,
+                   cpi->sf.use_fast_coef_costing);
 }
 
 static void super_block_yrd(VP10_COMP *cpi, MACROBLOCK *x, int *rate,
@@ -1065,6 +1140,7 @@ static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x,
   int this_rate, this_rate_tokenonly, s;
   int64_t this_distortion, this_rd;
   TX_SIZE best_tx = TX_4X4;
+  TX_TYPE best_tx_type = DCT_DCT;
   int *bmode_costs;
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi = xd->left_mi;
@@ -1091,6 +1167,7 @@ static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x,
       mode_selected   = mode;
       best_rd         = this_rd;
       best_tx         = mic->mbmi.tx_size;
+      best_tx_type    = mic->mbmi.tx_type;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
       *distortion     = this_distortion;
@@ -1100,6 +1177,7 @@ static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x,
 
   mic->mbmi.mode = mode_selected;
   mic->mbmi.tx_size = best_tx;
+  mic->mbmi.tx_type = best_tx_type;
 
   return best_rd;
 }
diff --git a/vp10/encoder/segmentation.c b/vp10/encoder/segmentation.c
index 6a20ee47d..677910fa3 100644
--- a/vp10/encoder/segmentation.c
+++ b/vp10/encoder/segmentation.c
@@ -273,7 +273,7 @@ void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd) {
   no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
 
   // Key frames cannot use temporal prediction
-  if (!frame_is_intra_only(cm)) {
+  if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
     // Work out probability tree for coding those segments not
     // predicted using the temporal method and the cost.
     calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs);
@@ -300,6 +300,7 @@ void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd) {
 
   // Now choose which coding method to use.
   if (t_pred_cost < no_pred_cost) {
+    assert(!cm->error_resilient_mode);
     seg->temporal_update = 1;
 #if !CONFIG_MISC_FIXES
     memcpy(segp->tree_probs, t_pred_tree, sizeof(t_pred_tree));
diff --git a/vp10/encoder/subexp.c b/vp10/encoder/subexp.c
index 67e820b1f..d4074775b 100644
--- a/vp10/encoder/subexp.c
+++ b/vp10/encoder/subexp.c
@@ -212,3 +212,12 @@ void vp10_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
     vpx_write(w, 0, upd);
   }
 }
+
+int vp10_cond_prob_diff_update_savings(vpx_prob *oldp,
+                                       const unsigned int ct[2]) {
+  const vpx_prob upd = DIFF_UPDATE_PROB;
+  vpx_prob newp = get_binary_prob(ct[0], ct[1]);
+  const int savings = vp10_prob_diff_update_savings_search(ct, *oldp, &newp,
+                                                           upd);
+  return savings;
+}
diff --git a/vp10/encoder/subexp.h b/vp10/encoder/subexp.h
index 04b96c0bd..091334f1f 100644
--- a/vp10/encoder/subexp.h
+++ b/vp10/encoder/subexp.h
@@ -37,6 +37,8 @@ int vp10_prob_diff_update_savings_search_model(const unsigned int *ct,
                                               vpx_prob upd,
                                               int stepsize);
 
+int vp10_cond_prob_diff_update_savings(vpx_prob *oldp,
+                                       const unsigned int ct[2]);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 77d1ff459..9545729fb 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -11,7 +11,7 @@
 
 #include "vp9/common/vp9_mvref_common.h"
 
-// This function searches the neighbourhood of a given MB/SB
+// This function searches the neighborhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                              MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
@@ -24,7 +24,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
   int different_ref_found = 0;
   int context_counter = 0;
-  const MV_REF *const  prev_frame_mvs = cm->use_prev_frame_mvs ?
+  const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ?
       cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
   const TileInfo *const tile = &xd->tile;
 
@@ -59,8 +59,8 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   for (; i < MVREF_NEIGHBOURS; ++i) {
     const POSITION *const mv_ref = &mv_ref_search[i];
     if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
-      const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
-                                                    xd->mi_stride]->mbmi;
+      const MB_MODE_INFO *const candidate =
+          &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
       different_ref_found = 1;
 
       if (candidate->ref_frame[0] == ref_frame)
@@ -71,7 +71,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   }
 
   // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
-  // on windows platform. The sync here is unncessary if use_perv_frame_mvs
+  // on windows platform. The sync here is unnecessary if use_prev_frame_mvs
   // is 0. But after removing it, there will be hang in the unit test on windows
   // due to several threads waiting for a thread's signal.
 #if defined(_WIN32) && !HAVE_PTHREAD_H
@@ -101,8 +101,8 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
     for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
       const POSITION *mv_ref = &mv_ref_search[i];
       if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
-        const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
-                                              * xd->mi_stride]->mbmi;
+        const MB_MODE_INFO *const candidate =
+            &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
 
         // If the candidate is INTRA we don't want to consider its mv.
         IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
@@ -156,16 +156,6 @@ void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                    mi_row, mi_col, sync, data, mode_context);
 }
 
-static void lower_mv_precision(MV *mv, int allow_hp) {
-  const int use_hp = allow_hp && vp9_use_mv_hp(mv);
-  if (!use_hp) {
-    if (mv->row & 1)
-      mv->row += (mv->row > 0 ? -1 : 1);
-    if (mv->col & 1)
-      mv->col += (mv->col > 0 ? -1 : 1);
-  }
-}
-
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
                            int_mv *mvlist, int_mv *nearest_mv,
                            int_mv *near_mv) {
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index bd216d433..22fbaf857 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -157,7 +157,7 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
 
 // This macro is used to add a motion vector mv_ref list if it isn't
 // already in the list.  If it's the second motion vector it will also
-// skip all additional processing and jump to done!
+// skip all additional processing and jump to Done!
 #define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done) \
   do { \
     if (refmv_count) { \
@@ -207,6 +207,16 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
                xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
+static INLINE void lower_mv_precision(MV *mv, int allow_hp) {
+  const int use_hp = allow_hp && vp9_use_mv_hp(mv);
+  if (!use_hp) {
+    if (mv->row & 1)
+      mv->row += (mv->row > 0 ? -1 : 1);
+    if (mv->col & 1)
+      mv->col += (mv->col > 0 ? -1 : 1);
+  }
+}
+
 typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index d8c14ecc8..37658dc94 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -190,6 +190,12 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
       // Co-ordinate of containing block to pixel precision.
       const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
       const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+#if CONFIG_BETTER_HW_COMPATIBILITY
+      assert(xd->mi[0]->mbmi.sb_type != BLOCK_4X8 &&
+             xd->mi[0]->mbmi.sb_type != BLOCK_8X4);
+      assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) &&
+             mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x)));
+#endif
       if (plane == 0)
         pre_buf->buf = xd->block_refs[ref]->buf->y_buffer;
       else if (plane == 1)
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 39e4dcfe3..e27634cdd 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -587,7 +587,12 @@ static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
     // Co-ordinate of containing block to pixel precision.
     int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
     int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
+#if CONFIG_BETTER_HW_COMPATIBILITY
+    assert(xd->mi[0]->mbmi.sb_type != BLOCK_4X8 &&
+           xd->mi[0]->mbmi.sb_type != BLOCK_8X4);
+    assert(mv_q4.row == mv->row * (1 << (1 - pd->subsampling_y)) &&
+           mv_q4.col == mv->col * (1 << (1 - pd->subsampling_x)));
+#endif
     // Co-ordinate of the block to 1/16th pixel precision.
     x0_16 = (x_start + x) << SUBPEL_BITS;
     y0_16 = (y_start + y) << SUBPEL_BITS;
@@ -714,6 +719,18 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
   const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
   const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
   const int is_compound = has_second_ref(&mi->mbmi);
+  int ref;
+
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    const MV_REFERENCE_FRAME frame = mi->mbmi.ref_frame[ref];
+    RefBuffer *ref_buf = &pbi->common.frame_refs[frame - LAST_FRAME];
+
+    xd->block_refs[ref] = ref_buf;
+    if (!vp9_is_valid_scale(&ref_buf->sf))
+      vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Reference frame has invalid dimensions");
+    vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf);
+  }
 
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     struct macroblockd_plane *const pd = &xd->plane[plane];
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d3ca7b3fe..42f554591 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -284,12 +284,19 @@ static int read_mv_component(vpx_reader *r,
   return sign ? -mag : mag;
 }
 
+// TODO(slavarnway): move to vp9_entropymv.h and replace vp9_use_mv_hp
+#define COMPANDED_MVREF_THRESH 8
+static int use_mv_hp(const MV *ref) {
+  return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
+         (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
+}
+
 static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
                            const nmv_context *ctx,
                            nmv_context_counts *counts, int allow_hp) {
   const MV_JOINT_TYPE joint_type =
       (MV_JOINT_TYPE)vpx_read_tree(r, vp9_mv_joint_tree, ctx->joints);
-  const int use_hp = allow_hp && vp9_use_mv_hp(ref);
+  const int use_hp = allow_hp && use_mv_hp(ref);
   MV diff = {0, 0};
 
   if (mv_joint_vertical(joint_type))
@@ -476,12 +483,203 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   }
 }
 
+static void dec_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist,
+                                  int_mv *nearest_mv, int_mv *near_mv,
+                                  int refmv_count) {
+  int i;
+
+  // Make sure all the candidates are properly clamped etc
+  for (i = 0; i < refmv_count; ++i) {
+    lower_mv_precision(&mvlist[i].as_mv, allow_hp);
+    clamp_mv2(&mvlist[i].as_mv, xd);
+  }
+  *nearest_mv = mvlist[0];
+  *near_mv = mvlist[1];
+}
+
 static void fpm_sync(void *const data, int mi_row) {
   VP9Decoder *const pbi = (VP9Decoder *)data;
   vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
                        mi_row << MI_BLOCK_SIZE_LOG2);
 }
 
+// This macro is used to add a motion vector to the mv_ref list if it isn't
+// already in the list.  If it's the second motion vector, or early_break is
+// set, it will also skip all additional processing and jump to Done!
+#define ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done) \
+  do { \
+    if (refmv_count) { \
+      if ((mv).as_int != (mv_ref_list)[0].as_int) { \
+        (mv_ref_list)[(refmv_count)] = (mv); \
+        refmv_count++; \
+        goto Done; \
+      } \
+    } else { \
+      (mv_ref_list)[(refmv_count)++] = (mv); \
+      if (early_break) \
+        goto Done; \
+    } \
+  } while (0)
+
+// If either reference frame is different, not INTRA, and they
+// are different from each other scale and add the mv to our list.
+#define IF_DIFF_REF_FRAME_ADD_MV_EB(mbmi, ref_frame, ref_sign_bias, \
+                                    refmv_count, mv_ref_list, Done) \
+  do { \
+    if (is_inter_block(mbmi)) { \
+      if ((mbmi)->ref_frame[0] != ref_frame) \
+        ADD_MV_REF_LIST_EB(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
+                           refmv_count, mv_ref_list, Done); \
+      if (has_second_ref(mbmi) && \
+          (mbmi)->ref_frame[1] != ref_frame && \
+          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
+        ADD_MV_REF_LIST_EB(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
+                           refmv_count, mv_ref_list, Done); \
+    } \
+  } while (0)
+
+// This function searches the neighborhood of a given MB/SB
+// to try and find candidate reference vectors.
+static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                            MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                            const POSITION *const mv_ref_search,
+                            int_mv *mv_ref_list,
+                            int mi_row, int mi_col,
+                            find_mv_refs_sync sync, void *const data) {
+  const int *ref_sign_bias = cm->ref_frame_sign_bias;
+  int i, refmv_count = 0;
+  int different_ref_found = 0;
+  const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
+  const TileInfo *const tile = &xd->tile;
+  // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop
+  // searching after the first mv is found.
+  const int early_break = (mi->mbmi.mode == NEARESTMV) ||
+                          (mi->mbmi.mode == NEWMV);
+
+  // Blank the reference vector list
+  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // Scan all the neighbors for motion vectors that use the same
+  // reference frame. There is no need to keep track of sub blocks or
+  // mode counts here.
+  for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MB_MODE_INFO *const candidate =
+          &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST_EB(candidate->mv[0], refmv_count, mv_ref_list, Done);
+      else if (candidate->ref_frame[1] == ref_frame)
+        ADD_MV_REF_LIST_EB(candidate->mv[1], refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
+  // on windows platform. The sync here is unnecessary if use_prev_frame_mvs
+  // is 0. But after removing it, there will be hang in the unit test on windows
+  // due to several threads waiting for a thread's signal.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+#endif
+
+  // Check the last frame's mode and mv info.
+  if (prev_frame_mvs) {
+    // Synchronize here for frame parallel decode if sync function is provided.
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+
+    if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
+    } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+      ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // Since we couldn't find 2 mvs from the same reference frame
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      const POSITION *mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MB_MODE_INFO *const candidate =
+            &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
+
+        // If the candidate is INTRA we don't want to consider its mv.
+        IF_DIFF_REF_FRAME_ADD_MV_EB(candidate, ref_frame, ref_sign_bias,
+                                    refmv_count, mv_ref_list, Done);
+      }
+    }
+  }
+
+  // Since we still don't have a candidate we'll try the last frame.
+  if (prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+        prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+      int_mv mv = prev_frame_mvs->mv[0];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done);
+    }
+
+    if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+        prev_frame_mvs->ref_frame[1] != ref_frame &&
+        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
+      int_mv mv = prev_frame_mvs->mv[1];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  if (mi->mbmi.mode == NEARMV)
+    refmv_count = MAX_MV_REF_CANDIDATES;
+  else
+    // we only care about the nearestmv for the remaining modes
+    refmv_count = 1;
+
+ Done:
+  // Clamp vectors
+  for (i = 0; i < refmv_count; ++i)
+    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
+
+  return refmv_count;
+}
+
+static uint8_t get_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                                const POSITION *const mv_ref_search,
+                                int mi_row, int mi_col) {
+  int i;
+  int context_counter = 0;
+  const TileInfo *const tile = &xd->tile;
+
+  // Get mode count from nearest 2 blocks
+  for (i = 0; i < 2; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
+                                                   xd->mi_stride];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      // Keep counts for entropy encoding.
+      context_counter += mode_2_counter[candidate->mode];
+    }
+  }
+
+  return counter_to_context[context_counter];
+}
+
 static void read_inter_block_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
                                        MODE_INFO *const mi,
@@ -491,26 +689,13 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
   int_mv nearestmv[2], nearmv[2];
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int ref, is_compound;
-  uint8_t inter_mode_ctx[MAX_REF_FRAMES];
+  uint8_t inter_mode_ctx;
+  const POSITION *const mv_ref_search = mv_ref_blocks[bsize];
 
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
   is_compound = has_second_ref(mbmi);
-
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-    RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-    xd->block_refs[ref] = ref_buf;
-    if ((!vp9_is_valid_scale(&ref_buf->sf)))
-      vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
-                         "Reference frame has invalid dimensions");
-    vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
-                         &ref_buf->sf);
-    vp9_find_mv_refs(cm, xd, mi, frame, ref_mvs[frame],
-                     mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
-  }
+  inter_mode_ctx = get_mode_context(cm, xd, mv_ref_search, mi_row, mi_col);
 
   if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
@@ -521,14 +706,27 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
     }
   } else {
     if (bsize >= BLOCK_8X8)
-      mbmi->mode = read_inter_mode(cm, xd, r,
-                                   inter_mode_ctx[mbmi->ref_frame[0]]);
-  }
-
-  if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      vp9_find_best_ref_mvs(xd, allow_hp, ref_mvs[mbmi->ref_frame[ref]],
-                            &nearestmv[ref], &nearmv[ref]);
+      mbmi->mode = read_inter_mode(cm, xd, r, inter_mode_ctx);
+    else
+      // Sub 8x8 blocks use the nearestmv as a ref_mv if the b_mode is NEWMV.
+      // Setting mode to NEARESTMV forces the search to stop after the nearestmv
+      // has been found. After b_modes have been read, mode will be overwritten
+      // by the last b_mode.
+      mbmi->mode = NEARESTMV;
+
+    if (mbmi->mode != ZEROMV) {
+      for (ref = 0; ref < 1 + is_compound; ++ref) {
+        int_mv ref_mvs[MAX_MV_REF_CANDIDATES];
+        const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+        int refmv_count;
+
+        refmv_count = dec_find_mv_refs(cm, xd, mi, frame, mv_ref_search,
+                                       ref_mvs, mi_row, mi_col, fpm_sync,
+                                       (void *)pbi);
+
+        dec_find_best_ref_mvs(xd, allow_hp, ref_mvs, &nearestmv[ref],
+                              &nearmv[ref], refmv_count);
+      }
     }
   }
 
@@ -546,7 +744,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         int_mv block[2];
         const int j = idy * 2 + idx;
-        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx[mbmi->ref_frame[0]]);
+        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx);
 
         if (b_mode == NEARESTMV || b_mode == NEARMV) {
           uint8_t dummy_mode_ctx[MAX_REF_FRAMES];
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 0def2cf1f..63db214d1 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -191,7 +191,8 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
                                        BLOCK_SIZE bsize,
                                        int64_t rate,
                                        int64_t dist,
-                                       int skip) {
+                                       int skip,
+                                       struct macroblock_plane *const p) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
@@ -199,12 +200,25 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
   const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
   const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_cols + mi_col;
-  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
-                                                      bsize);
+  int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
   int x = 0; int y = 0;
 
+  int is_skin = 0;
+  if (refresh_this_block == 0 &&
+      bsize <= BLOCK_16X16 &&
+      cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
+    is_skin = vp9_compute_skin_block(p[0].src.buf,
+                                     p[1].src.buf,
+                                     p[2].src.buf,
+                                     p[0].src.stride,
+                                     p[1].src.stride,
+                                     bsize);
+    if (is_skin)
+      refresh_this_block = 1;
+  }
+
   // If this block is labeled for refresh, check if we should reset the
   // segment_id.
   if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index a5b38138b..edf0a973e 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -14,6 +14,8 @@
 
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -93,7 +95,8 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i,
 void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
                                        MB_MODE_INFO *const mbmi,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                       int64_t rate, int64_t dist, int skip);
+                                       int64_t rate, int64_t dist, int skip,
+                                       struct macroblock_plane *const p);
 
 void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi,
                                              const MB_MODE_INFO *const mbmi,
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 93aa40ae9..6533902b3 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -333,20 +333,12 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
   int is_skin = 0;
 
   if (bs <= BLOCK_16X16 && denoiser->denoising_level >= kDenLow) {
-    // Take center pixel in block to determine is_skin.
-    const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
-    const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
-    const int uv_width_shift = y_width_shift >> 1;
-    const int uv_height_shift = y_height_shift >> 1;
-    const int stride = mb->plane[0].src.stride;
-    const int strideuv = mb->plane[1].src.stride;
-    const uint8_t ysource =
-      mb->plane[0].src.buf[y_height_shift * stride + y_width_shift];
-    const uint8_t usource =
-      mb->plane[1].src.buf[uv_height_shift * strideuv + uv_width_shift];
-    const uint8_t vsource =
-      mb->plane[2].src.buf[uv_height_shift * strideuv + uv_width_shift];
-    is_skin = vp9_skin_pixel(ysource, usource, vsource);
+    is_skin = vp9_compute_skin_block(mb->plane[0].src.buf,
+                                     mb->plane[1].src.buf,
+                                     mb->plane[2].src.buf,
+                                     mb->plane[0].src.stride,
+                                     mb->plane[1].src.stride,
+                                     bs);
   }
 
   mv_col = ctx->best_sse_mv.as_mv.col;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index cc4d1f14e..c07eee969 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1045,7 +1045,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
                                         mi_col, bsize, ctx->rate, ctx->dist,
-                                        x->skip);
+                                        x->skip, p);
     }
   }
 
@@ -1705,6 +1705,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = x->plane;
   const struct segmentation *const seg = &cm->seg;
   const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
   const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
@@ -1725,7 +1726,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
     } else {
     // Setting segmentation map for cyclic_refresh.
       vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize,
-                                        ctx->rate, ctx->dist, x->skip);
+                                        ctx->rate, ctx->dist, x->skip, p);
     }
     vp9_init_plane_quantizers(cpi, x);
   }
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index 4befbb066..008a40afc 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -145,10 +145,6 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
     const uint8_t *src_u = cpi->Source->u_buffer;
     const uint8_t *src_v = cpi->Source->v_buffer;
     const int src_uvstride = cpi->Source->uv_stride;
-    const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
-    const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
-    const int uv_width_shift = y_width_shift >> 1;
-    const int uv_height_shift = y_height_shift >> 1;
     int mi_row, mi_col;
     int num_low_motion = 0;
     int frame_low_motion = 1;
@@ -173,13 +169,12 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
           // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
           // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
           // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
-          const uint8_t ysource =
-            src_y[y_height_shift * src_ystride + y_width_shift];
-          const uint8_t usource =
-            src_u[uv_height_shift * src_uvstride + uv_width_shift];
-          const uint8_t vsource =
-            src_v[uv_height_shift * src_uvstride + uv_width_shift];
-          int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+          int is_skin = vp9_compute_skin_block(src_y,
+                                               src_u,
+                                               src_v,
+                                               src_ystride,
+                                               src_uvstride,
+                                               bsize);
           if (frame_low_motion &&
               cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
               cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index b929758ca..71b8bdba0 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1263,9 +1263,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     ref_frame = ref_mode_set[idx].ref_frame;
     if (cpi->use_svc) {
       ref_frame = ref_mode_set_svc[idx].ref_frame;
-      if (svc_force_zero_mode[ref_frame - 1] &&
-          frame_mv[this_mode][ref_frame].as_int != 0)
-        continue;
     }
 
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
@@ -1273,6 +1270,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (const_motion[ref_frame] && this_mode == NEARMV)
       continue;
 
+    if (cpi->use_svc) {
+      if (svc_force_zero_mode[ref_frame - 1] &&
+          frame_mv[this_mode][ref_frame].as_int != 0)
+        continue;
+    }
+
     if (!(frame_mv[this_mode][ref_frame].as_int == 0 &&
         ref_frame == LAST_FRAME)) {
       i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
@@ -1855,6 +1858,13 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     if (ref_frame_skip_mask & (1 << ref_frame))
       continue;
 
+#if CONFIG_BETTER_HW_COMPATIBILITY
+    if ((bsize == BLOCK_8X4 || bsize == BLOCK_4X8) &&
+        ref_frame > INTRA_FRAME &&
+        vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+      continue;
+#endif
+
     // TODO(jingning, agrange): Scaling reference frame not supported for
     // sub8x8 blocks. Is this supported now?
     if (ref_frame > INTRA_FRAME &&
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 2a6b70703..b8d17205d 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1349,11 +1349,25 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
-                                               pd->pre[ref].stride)];
+    const int bw = b_width_log2_lookup[BLOCK_8X8];
+    const int h = 4 * (i >> bw);
+    const int w = 4 * (i & ((1 << bw) - 1));
+    const struct scale_factors *sf = &xd->block_refs[ref]->sf;
+    int y_stride = pd->pre[ref].stride;
+    uint8_t *pre = pd->pre[ref].buf + (h * pd->pre[ref].stride + w);
+
+    if (vp9_is_scaled(sf)) {
+      const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+      const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+      y_stride = xd->block_refs[ref]->buf->y_stride;
+      pre = xd->block_refs[ref]->buf->y_buffer;
+      pre += scaled_buffer_offset(x_start + w, y_start + h,
+                                  y_stride, sf);
+    }
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_build_inter_predictor(pre, pd->pre[ref].stride,
+    vp9_highbd_build_inter_predictor(pre, y_stride,
                                      dst, pd->dst.stride,
                                      &mi->bmi[i].as_mv[ref].as_mv,
                                      &xd->block_refs[ref]->sf, width, height,
@@ -1361,7 +1375,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                                      mi_col * MI_SIZE + 4 * (i % 2),
                                      mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
   } else {
-    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+    vp9_build_inter_predictor(pre, y_stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
                               &xd->block_refs[ref]->sf, width, height, ref,
@@ -1370,7 +1384,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                               mi_row * MI_SIZE + 4 * (i / 2));
   }
 #else
-    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+    vp9_build_inter_predictor(pre, y_stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
                               &xd->block_refs[ref]->sf, width, height, ref,
@@ -3021,7 +3035,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
       // Skip checking missing references in both single and compound reference
-      // modes. Note that a mode will be skipped iff both reference frames
+      // modes. Note that a mode will be skipped if both reference frames
       // are masked out.
       ref_frame_skip_mask[0] |= (1 << ref_frame);
       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
@@ -3804,6 +3818,16 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
     ref_frame = vp9_ref_order[ref_index].ref_frame[0];
     second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
 
+#if CONFIG_BETTER_HW_COMPATIBILITY
+    // Forbid 8X4 and 4X8 partitions if any reference frame is scaled. Only
+    // references above INTRA_FRAME are looked up in cm->frame_refs[].
+    if ((bsize == BLOCK_8X4 || bsize == BLOCK_4X8) &&
+        ((ref_frame > INTRA_FRAME &&
+          vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) ||
+         (second_ref_frame > INTRA_FRAME &&
+          vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))))
+      continue;
+#endif
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
     if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
index c2763b7da..0ca166536 100644
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c
@@ -48,6 +48,20 @@ int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
     return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
 }
 
+// Classifies a block as skin or non-skin by sampling the single pixel at the
+// block centre of each plane (chroma offsets are half the luma offsets).
+int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                           int stride, int strideuv, int bsize) {
+  const int luma_col = (4 << b_width_log2_lookup[bsize]) >> 1;
+  const int luma_row = (4 << b_height_log2_lookup[bsize]) >> 1;
+  const int chroma_offset = (luma_row >> 1) * strideuv + (luma_col >> 1);
+  // Both chroma planes share one stride, hence one offset.
+  return vp9_skin_pixel(y[luma_row * stride + luma_col],
+                        u[chroma_offset],
+                        v[chroma_offset]);
+}
+
+
 #ifdef OUTPUT_YUV_SKINMAP
 // For viewing skin map on input source.
 void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h
index 0a87ef9f4..73f7c39d9 100644
--- a/vp9/encoder/vp9_skin_detection.h
+++ b/vp9/encoder/vp9_skin_detection.h
@@ -23,6 +23,9 @@ struct VP9_COMP;
 
 int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
 
+int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                           int stride, int strideuv, int bsize);
+
 #ifdef OUTPUT_YUV_SKINMAP
 // For viewing skin map on input source.
 void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 318d8100c..c5f0bad8f 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -394,7 +394,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
           sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
       } else {
         for (i = 0; i < BLOCK_SIZES; ++i)
-          if (i >= BLOCK_16X16)
+          if (i > BLOCK_16X16)
             sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
           else
             // Use H and V intra mode for block sizes <= 16X16.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index a4e7eb19e..30a7d1013 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -36,6 +36,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
   svc->current_superframe = 0;
   for (i = 0; i < REF_FRAMES; ++i)
     svc->ref_frame_index[i] = -1;
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    cpi->svc.ext_frame_flags[sl] = 0;
+    cpi->svc.ext_lst_fb_idx[sl] = 0;
+    cpi->svc.ext_gld_fb_idx[sl] = 1;
+    cpi->svc.ext_alt_fb_idx[sl] = 2;
+  }
 
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
     if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
@@ -566,6 +572,8 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
     // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is
     // needed to support the case where the frame flags may be passed in via
     // vpx_codec_encode(), which can be used for the temporal-only svc case.
+    // TODO(marpan): Consider adding an enc_config parameter to better handle
+    // this case.
     if (cpi->ext_refresh_frame_flags_pending == 0) {
       int sl;
       cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index e9e952c48..7958c6980 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -28,7 +28,7 @@ extern "C" {
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/
 
 
 #define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format. */
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index f71769918..6c6f15e51 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -288,13 +288,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d153_predictor_4x4/;
 
   add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_4x4/;
@@ -387,7 +387,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86_64_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc";
@@ -435,10 +435,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86_64_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_32x32/;
@@ -990,10 +990,10 @@ add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride,
 specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x8 msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 mmx neon msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc";
 
 #
 # Avg
@@ -1061,10 +1061,10 @@ add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stri
 specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
 
 #
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm
index b12d29c0a..c61b62104 100644
--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -17,24 +17,20 @@ pw_16: times 4 dd 16
 pw_32: times 4 dd 32
 
 SECTION .text
-INIT_MMX sse
+INIT_XMM sse2
 cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   movq                  m0, [aboveq]
   movq                  m2, [leftq]
-  DEFINE_ARGS dst, stride, one
-  mov                 oned, 0x0001
-  pxor                  m1, m1
-  movd                  m3, oned
-  pshufw                m3, m3, 0x0
   paddw                 m0, m2
-  pmaddwd               m0, m3
-  packssdw              m0, m1
-  pmaddwd               m0, m3
+  pshuflw               m1, m0, 0xe
+  paddw                 m0, m1
+  pshuflw               m1, m0, 0x1
+  paddw                 m0, m1
   paddw                 m0, [GLOBAL(pw_4)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq*2], m0
   lea                 dstq, [dstq+strideq*4]
@@ -122,30 +118,29 @@ cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
   RESTORE_GOT
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
-  pxor                  m1, m1
   mova                  m0, [aboveq]
   mova                  m2, [aboveq+16]
   mova                  m3, [aboveq+32]
   mova                  m4, [aboveq+48]
-  mova                  m5, [leftq]
-  mova                  m6, [leftq+16]
-  mova                  m7, [leftq+32]
-  mova                  m8, [leftq+48]
+  paddw                 m0, m2
+  paddw                 m3, m4
+  mova                  m2, [leftq]
+  mova                  m4, [leftq+16]
+  mova                  m5, [leftq+32]
+  mova                  m6, [leftq+48]
+  paddw                 m2, m4
+  paddw                 m5, m6
+  paddw                 m0, m3
+  paddw                 m2, m5
+  pxor                  m1, m1
+  paddw                 m0, m2
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 8
-  paddw                 m0, m2
-  paddw                 m0, m3
-  paddw                 m0, m4
-  paddw                 m0, m5
-  paddw                 m0, m6
-  paddw                 m0, m7
-  paddw                 m0, m8
   movhlps               m2, m0
   paddw                 m0, m2
   punpcklwd             m0, m1
@@ -181,9 +176,8 @@ cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
 
   RESTORE_GOT
   REP_RET
-%endif
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
   movq                  m0, [aboveq]
   movq    [dstq          ], m0
@@ -261,43 +255,44 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
   jnz .loop
   REP_RET
 
-INIT_MMX sse
-cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
+INIT_XMM sse2
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   movd                  m1, [aboveq-2]
   movq                  m0, [aboveq]
-  pshufw                m1, m1, 0x0
+  pshuflw               m1, m1, 0x0
+  movlhps               m0, m0         ; t1 t2 t3 t4 t1 t2 t3 t4
+  movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  movd                  m3, oned
+  pcmpeqw               m3, m3
   movd                  m4, bpsd
-  pshufw                m3, m3, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -2
-  mova                  m2, m3
+  psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
   psllw                 m3, m4
-  add                leftq, 8
-  psubw                 m3, m2 ; max possible value
-  pxor                  m4, m4 ; min possible value
-  psubw                 m0, m1
-.loop:
-  movq                  m1, [leftq+lineq*4]
-  movq                  m2, [leftq+lineq*4+2]
-  pshufw                m1, m1, 0x0
-  pshufw                m2, m2, 0x0
-  paddw                 m1, m0
+  pcmpeqw               m2, m2
+  pxor                  m4, m4         ; min possible value
+  pxor                  m3, m2         ; max possible value
+  mova                  m1, [leftq]
+  pshuflw               m2, m1, 0x0
+  pshuflw               m5, m1, 0x55
+  movlhps               m2, m5         ; l1 l1 l1 l1 l2 l2 l2 l2
   paddw                 m2, m0
   ;Clamp to the bit-depth
-  pminsw                m1, m3
   pminsw                m2, m3
-  pmaxsw                m1, m4
   pmaxsw                m2, m4
   ;Store the values
-  movq    [dstq          ], m1
-  movq    [dstq+strideq*2], m2
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
   lea                 dstq, [dstq+strideq*4]
-  inc                lineq
-  jnz .loop
-  REP_RET
+  pshuflw               m2, m1, 0xaa
+  pshuflw               m5, m1, 0xff
+  movlhps               m2, m5
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m2, m3
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
+  RET
 
 INIT_XMM sse2
 cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
@@ -343,63 +338,55 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
   jnz .loop
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
   movd                  m2, [aboveq-2]
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
   pshuflw               m2, m2, 0x0
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  pxor                  m7, m7
-  pxor                  m8, m8
-  pinsrw                m7, oned, 0
-  pinsrw                m8, bpsd, 0
-  pshuflw               m7, m7, 0x0
+  pcmpeqw               m3, m3         ; all ones
+  movd                  m4, bpsd       ; shift count
+  punpcklqdq            m2, m2         ; tl tl tl tl tl tl tl tl
+  psllw                 m3, m4         ; 0xffff << bps in every word
+  pcmpeqw               m5, m5         ; all ones
+  pxor                  m4, m4         ; min possible value
+  pxor                  m3, m5         ; max possible value, (1 << bps) - 1
   DEFINE_ARGS dst, stride, line, left
-  punpcklqdq            m7, m7
   mov                lineq, -8
-  mova                  m5, m7
-  punpcklqdq            m2, m2
-  psllw                 m7, m8
-  add                leftq, 32
-  psubw                 m7, m5 ; max possible value
-  pxor                  m8, m8 ; min possible value
   psubw                 m0, m2
   psubw                 m1, m2
 .loop:
-  movd                  m2, [leftq+lineq*4]
-  movd                  m3, [leftq+lineq*4+2]
-  pshuflw               m2, m2, 0x0
-  pshuflw               m3, m3, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m3, m3
-  paddw                 m4, m2, m0
-  paddw                 m5, m3, m0
+  movd                  m7, [leftq]    ; l1 l2 (two left pixels)
+  pshuflw               m5, m7, 0x0
+  pshuflw               m2, m7, 0x55
+  punpcklqdq            m5, m5         ; l1 l1 l1 l1 l1 l1 l1 l1
+  punpcklqdq            m2, m2         ; l2 l2 l2 l2 l2 l2 l2 l2
+  paddw                 m6, m5, m0     ; t1-tl+l1 to t8-tl+l1
+  paddw                 m5, m1         ; t9-tl+l1 to t16-tl+l1
+  pminsw                m6, m3
+  pminsw                m5, m3
+  pmaxsw                m6, m4         ; Clamp to the bit-depth
+  pmaxsw                m5, m4
+  mova   [dstq           ], m6
+  mova   [dstq        +16], m5
+  paddw                 m6, m2, m0     ; t1-tl+l2 to t8-tl+l2
   paddw                 m2, m1
-  paddw                 m3, m1
-  ;Clamp to the bit-depth
-  pminsw                m4, m7
-  pminsw                m5, m7
-  pminsw                m2, m7
-  pminsw                m3, m7
-  pmaxsw                m4, m8
-  pmaxsw                m5, m8
-  pmaxsw                m2, m8
-  pmaxsw                m3, m8
-  ;Store the values
-  mova   [dstq             ], m4
-  mova   [dstq+strideq*2   ], m5
-  mova   [dstq          +16], m2
-  mova   [dstq+strideq*2+16], m3
+  pminsw                m6, m3
+  pminsw                m2, m3
+  pmaxsw                m6, m4
+  pmaxsw                m2, m4
+  mova   [dstq+strideq*2 ], m6
+  mova [dstq+strideq*2+16], m2
   lea                 dstq, [dstq+strideq*4]
   inc                lineq
+  lea                leftq, [leftq+4]  ; next l1 l2; lea leaves inc's flags for jnz
+
   jnz .loop
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
   movd                  m0, [aboveq-2]
   mova                  m1, [aboveq]
   mova                  m2, [aboveq+16]
@@ -407,70 +394,60 @@ cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
   mova                  m4, [aboveq+48]
   pshuflw               m0, m0, 0x0
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  pxor                 m10, m10
-  pxor                 m11, m11
-  pinsrw               m10, oned, 0
-  pinsrw               m11, bpsd, 0
-  pshuflw              m10, m10, 0x0
+  pcmpeqw               m5, m5
+  movd                  m6, bpsd
+  psllw                 m5, m6
+  pcmpeqw               m7, m7
+  pxor                  m6, m6         ; min possible value
+  pxor                  m5, m7         ; max possible value
+  punpcklqdq            m0, m0
   DEFINE_ARGS dst, stride, line, left
-  punpcklqdq           m10, m10
   mov                lineq, -16
-  mova                  m5, m10
-  punpcklqdq            m0, m0
-  psllw                m10, m11
-  add                leftq, 64
-  psubw                m10, m5 ; max possible value
-  pxor                 m11, m11 ; min possible value
   psubw                 m1, m0
   psubw                 m2, m0
   psubw                 m3, m0
   psubw                 m4, m0
 .loop:
-  movd                  m5, [leftq+lineq*4]
-  movd                  m6, [leftq+lineq*4+2]
-  pshuflw               m5, m5, 0x0
-  pshuflw               m6, m6, 0x0
-  punpcklqdq            m5, m5
-  punpcklqdq            m6, m6
-  paddw                 m7, m5, m1
-  paddw                 m8, m5, m2
-  paddw                 m9, m5, m3
-  paddw                 m5, m4
-  ;Clamp these values to the bit-depth
-  pminsw                m7, m10
-  pminsw                m8, m10
-  pminsw                m9, m10
-  pminsw                m5, m10
-  pmaxsw                m7, m11
-  pmaxsw                m8, m11
-  pmaxsw                m9, m11
-  pmaxsw                m5, m11
-  ;Store these values
-  mova   [dstq           ], m7
-  mova   [dstq        +16], m8
-  mova   [dstq        +32], m9
-  mova   [dstq        +48], m5
-  paddw                 m7, m6, m1
-  paddw                 m8, m6, m2
-  paddw                 m9, m6, m3
-  paddw                 m6, m4
-  ;Clamp these values to the bit-depth
-  pminsw                m7, m10
-  pminsw                m8, m10
-  pminsw                m9, m10
-  pminsw                m6, m10
-  pmaxsw                m7, m11
-  pmaxsw                m8, m11
-  pmaxsw                m9, m11
-  pmaxsw                m6, m11
-  ;Store these values
-  mova   [dstq+strideq*2   ], m7
-  mova   [dstq+strideq*2+16], m8
-  mova   [dstq+strideq*2+32], m9
-  mova   [dstq+strideq*2+48], m6
+  movd                  m7, [leftq]
+  pshuflw               m7, m7, 0x0
+  punpcklqdq            m7, m7         ; l1 l1 l1 l1 l1 l1 l1 l1
+  paddw                 m0, m7, m1
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq           ], m0
+  paddw                 m0, m7, m2
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +16], m0
+  paddw                 m0, m7, m3
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +32], m0
+  paddw                 m0, m7, m4
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +48], m0
+  movd                  m7, [leftq+2]
+  pshuflw               m7, m7, 0x0
+  punpcklqdq            m7, m7         ; l2 l2 l2 l2 l2 l2 l2 l2
+  paddw                 m0, m7, m1
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2 ], m0
+  paddw                 m0, m7, m2
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+16], m0
+  paddw                 m0, m7, m3
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+32], m0
+  paddw                 m0, m7, m4
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+48], m0
   lea                 dstq, [dstq+strideq*4]
+  lea                leftq, [leftq+4]
   inc                lineq
   jnz .loop
   REP_RET
-%endif
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 22d52a2af..30ee81b68 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -79,20 +79,13 @@ SECTION .text
 
 %macro INC_SRC_BY_SRC_STRIDE  0
 %if ARCH_X86=1 && CONFIG_PIC=1
-  lea                srcq, [srcq + src_stridemp*2]
+  add                srcq, src_stridemp  ; two adds == the %else lea (srcq += 2*stride)
+  add                srcq, src_stridemp  ; presumably src_stridemp is a memory operand, unusable in lea - confirm
 %else
   lea                srcq, [srcq + src_strideq*2]
 %endif
 %endmacro
 
-%macro INC_SRC_BY_SRC_2STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  lea                srcq, [srcq + src_stridemp*4]
-%else
-  lea                srcq, [srcq + src_strideq*4]
-%endif
-%endmacro
-
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %define bilin_filter_m bilin_filter_m_sse2
 %define filter_idx_shift 5
@@ -984,8 +977,9 @@ SECTION .text
 .x_other_y_other_loop:
   movu                 m2, [srcq]
   movu                 m4, [srcq+2]
-  movu                 m3, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*2+2]
+  INC_SRC_BY_SRC_STRIDE
+  movu                 m3, [srcq]
+  movu                 m5, [srcq+2]
   pmullw               m2, filter_x_a
   pmullw               m4, filter_x_b
   paddw                m2, filter_rnd
@@ -1018,7 +1012,7 @@ SECTION .text
   SUM_SSE              m0, m2, m4, m3, m6, m7
   mova                 m0, m5
 
-  INC_SRC_BY_SRC_2STRIDE
+  INC_SRC_BY_SRC_STRIDE
   lea                dstq, [dstq + dst_strideq * 4]
 %if %2 == 1 ; avg
   add                secq, sec_str
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index b45331caa..81ec5dbdb 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -243,13 +243,18 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
 }
 
 #if CONFIG_USE_X86INC
+// The 2 trailing unused parameters are placeholders for PIC-enabled builds;
+// callers pass NULL for both. These declarations are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
 #define DECL(w, opt) \
   int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
                                                  ptrdiff_t src_stride, \
                                                  int x_offset, int y_offset, \
                                                  const uint16_t *dst, \
                                                  ptrdiff_t dst_stride, \
-                                                 int height, unsigned int *sse);
+                                                 int height, \
+                                                 unsigned int *sse, \
+                                                 void *unused0, void *unused);
 #define DECLS(opt1, opt2) \
   DECL(8, opt1); \
   DECL(16, opt1)
@@ -274,7 +279,7 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, h, \
-                                                       &sse); \
+                                                       &sse, NULL, NULL); \
   if (w > wf) { \
     unsigned int sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -282,19 +287,20 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 48, src_stride, x_offset, y_offset, \
-          dst + 48, dst_stride, h, &sse2); \
+          dst + 48, dst_stride, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -312,7 +318,7 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, \
-                                                       h, &sse); \
+                                                       h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -320,20 +326,21 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 48, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -359,27 +366,27 @@ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
         src + (start_row * src_stride), src_stride, \
         x_offset, y_offset, dst + (start_row * dst_stride), \
-        dst_stride, height, &sse2); \
+        dst_stride, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 16 + (start_row * src_stride), src_stride, \
           x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
-          dst_stride, height, &sse2); \
+          dst_stride, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
             src + 32 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
             src + 48 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       }\
@@ -410,6 +417,7 @@ FNS(sse2, sse);
 #undef FNS
 #undef FN
 
+// The 2 trailing unused parameters are placeholders for PIC-enabled builds.
 #define DECL(w, opt) \
 int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                    ptrdiff_t src_stride, \
@@ -419,7 +427,8 @@ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                    const uint16_t *sec, \
                                                    ptrdiff_t sec_stride, \
                                                    int height, \
-                                                   unsigned int *sse);
+                                                   unsigned int *sse, \
+                                                   void *unused0, void *unused);
 #define DECLS(opt1) \
 DECL(16, opt1) \
 DECL(8, opt1)
@@ -439,23 +448,23 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
   uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                src, src_stride, x_offset, \
-               y_offset, dst, dst_stride, sec, w, h, &sse); \
+               y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                   src + 16, src_stride, x_offset, y_offset, \
-                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 32, src_stride, x_offset, y_offset, \
-                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48, src_stride, x_offset, y_offset, \
-                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -475,14 +484,15 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src, src_stride, x_offset, \
                                             y_offset, dst, dst_stride, \
-                                            sec, w, h, &sse); \
+                                            sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src + 16, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 16, dst_stride, \
-                                            sec + 16, w, h, &sse2); \
+                                            sec + 16, w, h, &sse2, \
+                                            NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
@@ -490,14 +500,16 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
                                             src + 32, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 32, dst_stride, \
-                                            sec + 32, w, h, &sse2); \
+                                            sec + 32, w, h, &sse2, \
+                                            NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src + 48, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 48, dst_stride, \
-                                            sec + 48, w, h, &sse2); \
+                                            sec + 48, w, h, &sse2, \
+                                            NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -525,7 +537,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + (start_row * src_stride), src_stride, x_offset, \
                 y_offset, dst + (start_row * dst_stride), dst_stride, \
-                sec + (start_row * w), w, height, &sse2); \
+                sec + (start_row * w), w, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
@@ -533,7 +545,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
                 src + 16 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 16 + (start_row * dst_stride), dst_stride, \
-                sec + 16 + (start_row * w), w, height, &sse2); \
+                sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
@@ -541,14 +553,14 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
                 src + 32 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 32 + (start_row * dst_stride), dst_stride, \
-                sec + 32 + (start_row * w), w, height, &sse2); \
+                sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 48 + (start_row * dst_stride), dst_stride, \
-                sec + 48 + (start_row * w), w, height, &sse2); \
+                sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       } \
diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm
index 0defe1b6d..1ec906c23 100644
--- a/vpx_dsp/x86/sad_sse2.asm
+++ b/vpx_dsp/x86/sad_sse2.asm
@@ -17,7 +17,7 @@ SECTION .text
 %if %3 == 5
 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
-cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
 %else ; avg
@@ -25,7 +25,7 @@ cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
@@ -222,8 +222,8 @@ SAD8XN 16, 1 ; sad8x16_avg_sse2
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
 
-; unsigned int vpx_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
-;                                  uint8_t *ref, int ref_stride);
+; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+;                                   uint8_t *ref, int ref_stride);
 %macro SAD4XN 1-2 0
   SAD_FN 4, %1, 7, %2
   mov              n_rowsd, %1/4
@@ -236,31 +236,32 @@ SAD8XN  4, 1 ; sad8x4_avg_sse2
   movd                  m4, [refq+ref_stride3q]
   punpckldq             m1, m2
   punpckldq             m3, m4
+  movlhps               m1, m3
 %if %2 == 1
   pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m3, [second_predq+mmsize*1]
-  lea         second_predq, [second_predq+mmsize*2]
+  lea         second_predq, [second_predq+mmsize*1]
 %endif
   movd                  m2, [srcq]
   movd                  m5, [srcq+src_strideq]
   movd                  m4, [srcq+src_strideq*2]
-  movd                  m6, [srcq+src_stride3q]
+  movd                  m3, [srcq+src_stride3q]
   punpckldq             m2, m5
-  punpckldq             m4, m6
+  punpckldq             m4, m3
+  movlhps               m2, m4
   psadbw                m1, m2
-  psadbw                m3, m4
   lea                 refq, [refq+ref_strideq*4]
   paddd                 m0, m1
   lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m3
   dec              n_rowsd
   jg .loop
 
+  movhlps               m1, m0
+  paddd                 m0, m1
   movd                 eax, m0
   RET
 %endmacro
 
-INIT_MMX sse
+INIT_XMM sse2
 SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
 SAD4XN  8, 1 ; sad4x8_avg_sse