diff options
47 files changed, 1418 insertions, 630 deletions
@@ -76,7 +76,6 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc - ppc64-linux-gcc ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc diff --git a/build/make/configure.sh b/build/make/configure.sh index 876255bfe..f1d0e34c3 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -719,11 +719,8 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; - power*64*-*) - tgt_isa=ppc64 - ;; - power*) - tgt_isa=ppc + power*64le*-*) + tgt_isa=ppc64le ;; *mips64el*) tgt_isa=mips64 @@ -1221,7 +1218,7 @@ EOF check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; - ppc*) + ppc64le*) link_with_cc=gcc setup_gnu_toolchain check_gcc_machine_option "vsx" @@ -1491,7 +1488,7 @@ EOF # bionic includes basic pthread functionality, obviating -lpthread. ;; *) - check_header pthread.h && check_lib -lpthread <<EOF && enable_feature pthread_h && add_extralibs -lpthread + check_header pthread.h && check_lib -lpthread <<EOF && add_extralibs -lpthread || disable_feature pthread_h #include <pthread.h> #include <stddef.h> int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } @@ -116,7 +116,6 @@ all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" -all_platforms="${all_platforms} ppc64-linux-gcc" all_platforms="${all_platforms} ppc64le-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" @@ -585,7 +584,7 @@ EOF # Use both check_header and check_lib here, since check_lib # could be a stub that always returns true. 
- check_header pthread.h && check_lib -lpthread <<EOF && enable_feature pthread_h + check_header pthread.h && check_lib -lpthread <<EOF || disable_feature pthread_h #include <pthread.h> #include <stddef.h> int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } @@ -593,6 +592,10 @@ EOF check_header unistd.h # for sysconf(3) and friends. check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports + + if enabled neon && ! enabled external_build; then + check_header arm_neon.h || die "Unable to find arm_neon.h" + fi } process_toolchain() { @@ -76,12 +76,12 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, size_t frame_size = 0; if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) warn("Failed to read frame size\n"); + if (!feof(infile)) warn("Failed to read frame size"); } else { frame_size = mem_get_le32(raw_header); if (frame_size > 256 * 1024 * 1024) { - warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + warn("Read invalid frame size (%u)", (unsigned int)frame_size); frame_size = 0; } @@ -92,7 +92,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, *buffer = new_buffer; *buffer_size = 2 * frame_size; } else { - warn("Failed to allocate compressed data buffer\n"); + warn("Failed to allocate compressed data buffer"); frame_size = 0; } } @@ -100,7 +100,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, if (!feof(infile)) { if (fread(*buffer, 1, frame_size, infile) != frame_size) { - warn("Failed to read full frame\n"); + warn("Failed to read full frame"); return 1; } diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 3e9377a0e..8f6c5cd48 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1382,8 +1382,8 @@ INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest, #if HAVE_MMI const ConvolveFunctions convolve8_mmi( - vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_mmi, - vpx_convolve8_avg_horiz_c, 
vpx_convolve8_vert_mmi, + vpx_convolve_copy_c, vpx_convolve_avg_mmi, vpx_convolve8_horiz_mmi, + vpx_convolve8_avg_horiz_mmi, vpx_convolve8_vert_mmi, vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); diff --git a/test/dct_test.cc b/test/dct_test.cc index 10062150f..e8ad0cd5d 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -725,4 +725,14 @@ INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo wht_vsx_func_info = { + &fdct_wrapper<vp9_fwht4x4_c>, &idct_wrapper<vpx_iwht4x4_16_add_vsx>, 4, 1 +}; + +INSTANTIATE_TEST_CASE_P(VSX, TransWHT, + ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index 5862d2649..4b5d55469 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -16,7 +16,7 @@ #include "test/video_source.h" namespace libvpx_test { -const unsigned int kCodeBufferSize = 256 * 1024; +const unsigned int kCodeBufferSize = 256 * 1024 * 1024; const unsigned int kIvfFileHdrSize = 32; const unsigned int kIvfFrameHdrSize = 12; diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 910718b06..e211fb4d8 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -104,6 +104,13 @@ TEST_P(SumSquaresTest, ExtremeValues) { using ::testing::make_tuple; +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_neon))); +#endif // HAVE_NEON + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, SumSquaresTest, diff --git a/test/variance_test.cc 
b/test/variance_test.cc index f89c98523..725821ae6 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -20,24 +20,13 @@ #include "test/register_state_check.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/variance.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" namespace { -typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse); -typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - unsigned int *sse); -typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - uint32_t *sse, - const uint8_t *second_pred); typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride); typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); @@ -692,7 +681,7 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() { } template <> -void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() { +void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { if (!use_high_bit_depth()) { @@ -728,10 +717,10 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() { } typedef MainTestClass<Get4x4SseFunc> VpxSseTest; -typedef MainTestClass<VarianceMxNFunc> VpxMseTest; -typedef MainTestClass<VarianceMxNFunc> VpxVarianceTest; -typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest; -typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest; +typedef MainTestClass<vpx_variance_fn_t> VpxMseTest; +typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest; +typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest; +typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest; TEST_P(VpxSseTest, 
RefSse) { RefTestSse(); } TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); } @@ -756,14 +745,14 @@ INSTANTIATE_TEST_CASE_P(C, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_c))); -typedef TestParams<VarianceMxNFunc> MseParams; +typedef TestParams<vpx_variance_fn_t> MseParams; INSTANTIATE_TEST_CASE_P(C, VpxMseTest, ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c), MseParams(4, 3, &vpx_mse16x8_c), MseParams(3, 4, &vpx_mse8x16_c), MseParams(3, 3, &vpx_mse8x8_c))); -typedef TestParams<VarianceMxNFunc> VarianceParams; +typedef TestParams<vpx_variance_fn_t> VarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_c), @@ -780,7 +769,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); -typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams; +typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelVarianceTest, ::testing::Values( @@ -798,7 +787,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0), SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); -typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams; +typedef TestParams<vpx_subp_avg_variance_fn_t> SubpelAvgVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelAvgVarianceTest, ::testing::Values( @@ -817,10 +806,11 @@ INSTANTIATE_TEST_CASE_P( SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); #if CONFIG_VP9_HIGHBITDEPTH -typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest; -typedef MainTestClass<VarianceMxNFunc> VpxHBDVarianceTest; -typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest; -typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxHBDSubpelAvgVarianceTest; +typedef MainTestClass<vpx_variance_fn_t> VpxHBDMseTest; +typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest; +typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> 
VpxHBDSubpelVarianceTest; +typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> + VpxHBDSubpelAvgVarianceTest; TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } diff --git a/test/vp8_multi_resolution_encoder.sh b/test/vp8_multi_resolution_encoder.sh index a8b7fe78e..33fd5b0d8 100755 --- a/test/vp8_multi_resolution_encoder.sh +++ b/test/vp8_multi_resolution_encoder.sh @@ -46,19 +46,31 @@ vp8_multi_resolution_encoder_three_formats() { local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf" + local readonly layer_bitrates="150 80 50" + local readonly keyframe_insert="200" + local readonly temporal_layers="3 3 3" + local readonly framerate="30" if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then if [ "$(vp8_encode_available)" = "yes" ]; then # Param order: # Input width # Input height + # Framerate # Input file path # Output file names + # Layer bitrates + # Temporal layers + # Keyframe insert # Output PSNR vp8_mre "${YUV_RAW_INPUT_WIDTH}" \ "${YUV_RAW_INPUT_HEIGHT}" \ + "${framerate}" \ "${YUV_RAW_INPUT}" \ ${output_files} \ + ${layer_bitrates} \ + ${temporal_layers} \ + "${keyframe_insert}" \ 0 for output_file in ${output_files}; do diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 4740cadd9..f0bbedbfa 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -575,3 +575,10 @@ INSTANTIATE_TEST_CASE_P( &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, true))); } // namespace + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(VSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_vsx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index abc705363..4e7d99f50 100644 --- a/vp9/encoder/vp9_bitstream.c +++ 
b/vp9/encoder/vp9_bitstream.c @@ -459,7 +459,8 @@ static void write_modes_sb( write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, max_mv_magnitude, interp_filter_selected); break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize, max_mv_magnitude, interp_filter_selected); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, @@ -469,7 +470,6 @@ static void write_modes_sb( write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, subsize, max_mv_magnitude, interp_filter_selected); break; - default: assert(0); } } @@ -618,9 +618,10 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, return; } - case ONE_LOOP_REDUCED: { + default: { int updates = 0; int noupdates_before_first = 0; + assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED); for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { for (k = 0; k < COEF_BANDS; ++k) { @@ -670,7 +671,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, } return; } - default: assert(0); } } @@ -1149,8 +1149,10 @@ static void write_profile(BITSTREAM_PROFILE profile, case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break; case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break; case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break; - case PROFILE_3: vpx_wb_write_literal(wb, 6, 3); break; - default: assert(0); + default: + assert(profile == PROFILE_3); + vpx_wb_write_literal(wb, 6, 3); + break; } } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a283d92a8..091992dbd 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -385,16 +385,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { node->split[i] = &vt->split[i].part_variances.none; break; } - case BLOCK_4X4: { + default: { v4x4 *vt = (v4x4 *)data; + assert(bsize == BLOCK_4X4); 
node->part_variances = &vt->part_variances; for (i = 0; i < 4; i++) node->split[i] = &vt->split[i]; break; } - default: { - assert(0); - break; - } } } @@ -885,13 +882,13 @@ static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, set_block_size(cpi, x, xd, mi_row, mi_col, subsize); set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize); break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col); copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col); copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs); copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1004,7 +1001,8 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high, subsize_high); break; - case PARTITION_SPLIT: + default: + assert(partition_high == PARTITION_SPLIT); if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, mi_row_high, mi_col_high)) return 1; @@ -1020,7 +1018,6 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, mi_col_high + bs_high)) return 1; break; - default: assert(0); } } @@ -1067,13 +1064,13 @@ static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, prev_part[start_pos] = subsize; if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); update_partition_svc(cpi, subsize, mi_row, mi_col); update_partition_svc(cpi, subsize, mi_row + bs, mi_col); update_partition_svc(cpi, subsize, mi_row, mi_col + bs); update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1108,13 +1105,13 @@ static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, prev_part[start_pos] = subsize; if (mi_col + bs < cm->mi_cols) 
prev_part[start_pos + bs] = subsize; break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); update_prev_partition_helper(cpi, subsize, mi_row, mi_col); update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1387,7 +1384,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); } else { - y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); x->sb_use_mv_part = 1; x->sb_mvcol_part = mi->mv[0].as_mv.col; x->sb_mvrow_part = mi->mv[0].as_mv.row; @@ -2181,7 +2180,8 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->leaf_split[0]); @@ -2196,7 +2196,6 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -2522,7 +2521,8 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->split[0]); @@ -2533,7 +2533,6 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, 
subsize, pc_tree->split[3]); break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -2672,7 +2671,8 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.rdcost += tmp_rdc.rdcost; } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, subsize, pc_tree->leaf_split[0], INT64_MAX); @@ -2702,7 +2702,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.dist += tmp_rdc.dist; } break; - default: assert(0); break; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -4208,7 +4207,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, output_enabled, rd_cost, @@ -4238,7 +4238,6 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, rd_cost->dist += this_rdc.dist; } break; - default: assert(0 && "Invalid partition type."); break; } } @@ -4327,7 +4326,8 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, output_enabled, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); if (bsize == BLOCK_8X8) { nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, @@ -4348,7 +4348,6 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, dummy_cost, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -4452,7 +4451,8 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, 
&dummy_rdc, td->pc_root); break; - case REFERENCE_PARTITION: + default: + assert(partition_search_type == REFERENCE_PARTITION); x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); // Use nonrd_pick_partition on scene-cut for VBR mode. @@ -4484,7 +4484,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } break; - default: assert(0); break; } // Update ref_frame usage for inter frame if this group is ARF group. @@ -4551,16 +4550,12 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10" - " or VPX_BITS_12"); - return -1; } } else { vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 970077d89..bc2765728 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -358,13 +358,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -388,13 +388,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); 
vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } } @@ -434,13 +434,13 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); } return; } @@ -462,12 +462,12 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); break; } } @@ -511,14 +511,14 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -544,13 +544,13 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - 
default: assert(0); break; } } @@ -634,14 +634,14 @@ static void encode_block(int plane, int block, int row, int col, vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - default: assert(0 && "Invalid transform size"); } return; } @@ -657,13 +657,13 @@ static void encode_block(int plane, int block, int row, int col, case TX_8X8: vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - default: assert(0 && "Invalid transform size"); break; } } @@ -848,7 +848,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, xd->bd); } break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); @@ -876,7 +877,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } } break; - default: assert(0); return; } if (*eob) *(args->skip) = 0; return; @@ -930,7 +930,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (!x->skip_encode && *eob) vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride); @@ -955,7 +956,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } 
break; - default: assert(0); break; } if (*eob) *(args->skip) = 0; } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 2f92456f2..3384de7ea 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -483,14 +483,10 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { *hr = 3; *hs = 5; break; - case ONETWO: - *hr = 1; - *hs = 2; - break; default: + assert(mode == ONETWO); *hr = 1; - *hs = 1; - assert(0); + *hs = 2; break; } } @@ -1726,7 +1722,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x4x4d_bits10) break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, @@ -1805,11 +1802,6 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits12) break; - - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); } } } @@ -3031,9 +3023,17 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { // Keep track of frame index for each reference frame. SVC *const svc = &cpi->svc; if (cm->frame_type == KEY_FRAME) { + int i; svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; + // On key frame update all reference frame slots. + for (i = 0; i < REF_FRAMES; i++) { + // LAST/GOLDEN/ALTREF is already updated above. 
+ if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && + i != cpi->alt_fb_idx) + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + } } else { if (cpi->refresh_last_frame) svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; @@ -3042,6 +3042,8 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { if (cpi->refresh_alt_ref_frame) svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; } + // Copy flags from encoder to SVC struct. + vp9_copy_flags_ref_update_idx(cpi); } } @@ -3284,11 +3286,9 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { case VPX_BITS_10: dc_quant_devisor = 16.0; break; - case VPX_BITS_12: - dc_quant_devisor = 64.0; - break; default: - assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + dc_quant_devisor = 64.0; break; } #else @@ -3730,28 +3730,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); // For SVC on non-zero spatial layer: check for disabling inter-layer - // (spatial) prediction, if svc.disable_inter_layer_pred is set. - // if the previous spatial layer was dropped then disable the prediction from - // this (scaled) reference. 
- if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { - if ((cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY && - !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || - cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF || - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id - 1]) { - MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { - const struct scale_factors *const scale_fac = - &cm->frame_refs[ref_frame - 1].sf; - if (vp9_is_scaled(scale_fac)) - cpi->ref_frame_flags &= (~flag_list[ref_frame]); - } - } - } - } + // prediction. + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) + vp9_svc_constrain_inter_layer_pred(cpi); // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. @@ -4612,6 +4593,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cpi->last_frame_dropped = 0; cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0; + // Keep track of the frame buffer index updated/refreshed for the + // current encoded TL0 superframe. 
+ if (cpi->svc.temporal_layer_id == 0) { + if (cpi->refresh_last_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx; + else if (cpi->refresh_golden_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx; + else if (cpi->refresh_alt_ref_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx; + } // Disable segmentation if it decrease rate/distortion ratio if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 303c00a6f..1e0ed70fb 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -645,6 +645,8 @@ typedef struct VP9_COMP { int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; + // Indices are: max_tx_size-1, tx_size_ctx, tx_size + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; int multi_arf_allowed; int multi_arf_enabled; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 3302fde08..453879fb8 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -494,11 +494,10 @@ static int scale_sse_threshold(VP9_COMMON *cm, int thresh) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = thresh; break; case VPX_BITS_10: ret_val = thresh << 4; break; - case VPX_BITS_12: ret_val = thresh << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = thresh << 8; + break; } } #else @@ -520,11 +519,10 @@ static int get_ul_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break; case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break; - case VPX_BITS_12: ret_val = UL_INTRA_THRESH << 4; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + 
assert(cm->bit_depth == VPX_BITS_12); + ret_val = UL_INTRA_THRESH << 4; + break; } } #else @@ -541,11 +539,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break; case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break; - case VPX_BITS_12: ret_val = SMOOTH_INTRA_THRESH << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = SMOOTH_INTRA_THRESH << 8; + break; } } #else @@ -971,12 +968,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, switch (cm->bit_depth) { case VPX_BITS_8: break; case VPX_BITS_10: this_error >>= 4; break; - case VPX_BITS_12: this_error >>= 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); - return; + assert(cm->bit_depth == VPX_BITS_12); + this_error >>= 8; + break; } } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -2446,8 +2441,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + if ((rc->frames_since_key + i - 1) > 1) { + zero_motion_accumulator *= get_zero_motion_factor(cpi, &next_frame); + } // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -2469,8 +2465,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Break out conditions. // Break at maximum of active_max_gf_interval unless almost totally static. - if (((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && - (i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || + // + // Note that the addition of a test of rc->source_alt_ref_active is + // deliberate. 
The effect of this is that after a normal altref group even + // if the material is static there will be one normal length GF group + // before allowing longer GF groups. The reason for this is that in cases + // such as slide shows where slides are separated by a complex transition + // such as a fade, the arf group spanning the transition may not be coded + // at a very high quality and hence this frame (with its overlay) is a + // poor golden frame to use for an extended group. + if (((i >= active_max_gf_interval) && + ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) || ( // Don't break out with a very short interval. (i >= active_min_gf_interval) && @@ -2490,7 +2495,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if ((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && allow_alt_ref && + if ((zero_motion_accumulator < 0.995) && allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { const int forward_frames = (rc->frames_to_key - i >= i - 1) ? i - 1 @@ -2518,11 +2523,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); #endif - // Set the interval until the next gf. rc->baseline_gf_interval = - (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) - ? (i - (is_key_frame || rc->source_alt_ref_pending)) - : i; + ((twopass->kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) && + (i >= rc->frames_to_key)) + ? i + : (i - (is_key_frame || rc->source_alt_ref_pending)); rc->frames_till_gf_update_due = rc->baseline_gf_interval; @@ -2769,6 +2774,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; + rc->frames_since_key = 0; // Reset the GF group data structures. 
vp9_zero(*gf_group); @@ -2913,7 +2919,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + // The zero motion test here insures that if we mark a kf group as static + // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES. + // It also allows for a larger boost on long static groups. + if ((i <= KF_BOOST_SCAN_MAX_FRAMES) || (zero_motion_accumulator >= 0.99)) { double frame_boost; double zm_factor; @@ -3025,12 +3034,13 @@ static void configure_buffer_updates(VP9_COMP *cpi) { cpi->refresh_alt_ref_frame = 0; cpi->rc.is_src_frame_alt_ref = 1; break; - case ARF_UPDATE: + default: + assert(twopass->gf_group.update_type[twopass->gf_group.index] == + ARF_UPDATE); cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; cpi->refresh_alt_ref_frame = 1; break; - default: assert(0); break; } } diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 1cb978667..ba72c0be5 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1793,7 +1793,7 @@ static const MV search_pos[4] = { unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col) { + int mi_col, const MV *ref_mv) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; @@ -1815,6 +1815,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, const int norm_factor = 3 + (bw >> 5); const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + MvLimits subpel_mv_limits; if (scaled_ref_frame) { int i; @@ -1917,6 +1918,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, tmp_mv->row *= 8; tmp_mv->col *= 8; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + clamp_mv(tmp_mv, 
subpel_mv_limits.col_min, subpel_mv_limits.col_max, + subpel_mv_limits.row_min, subpel_mv_limits.row_max); + if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; @@ -2210,7 +2215,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list, fn_ptr, 1, ref_mv, tmp_mv); break; - case NSTEP: + default: + assert(method == NSTEP); var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, MAX_MVSEARCH_STEPS - 1 - step_param, 1, cost_list, fn_ptr, ref_mv, tmp_mv); @@ -2236,7 +2242,6 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } } break; - default: assert(0 && "Invalid search method."); } if (method != NSTEP && rd && var < var_max) diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index b8db2c353..b4787fe1f 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -66,7 +66,8 @@ int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, // Perform integral projection based motion estimation. 
unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col); + int mi_row, int mi_col, + const MV *ref_mv); typedef uint32_t(fractional_mv_step_fp)( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 1c2c55b9e..4e9649065 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -169,14 +169,10 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, case VPX_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); break; - default: - assert(0 && - "bit_depth should be VPX_BITS_8, VPX_BITS_10 " - "or VPX_BITS_12"); - return; } #else int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 3aee46636..f9d7a6db8 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -726,13 +726,13 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } *skippable &= (*eob == 0); eob_cost += 1; @@ -1502,8 +1502,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int flag_svc_subpel = 0; int svc_mv_col = 0; int svc_mv_row = 0; + int no_scaling = 0; unsigned int thresh_svc_skip_golden = 500; - if (cpi->svc.spatial_layer_id > 0 && cpi->svc.high_source_sad_superframe) + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + int layer = 
LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id - 1, + cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1; + } + if (cpi->svc.spatial_layer_id > 0 && + (cpi->svc.high_source_sad_superframe || no_scaling)) thresh_svc_skip_golden = 0; // Lower the skip threshold if lower spatial layer is better quality relative // to current layer. @@ -1517,7 +1526,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, thresh_svc_skip_golden = 1000; init_ref_frame_cost(cm, xd, ref_frame_cost); - memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); if (reuse_inter_pred) { @@ -1764,7 +1772,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != NEARESTMV) { + frame_mv[this_mode][ref_frame].as_int != 0) { continue; } @@ -1875,7 +1883,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->sf.adaptive_rd_thresh_row_mt && rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, &rd_thresh_freq_fact[mode_index]))) - continue; + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; if (this_mode == NEWMV && !force_gf_mv) { if (ref_frame > LAST_FRAME && !cpi->use_svc && @@ -1886,7 +1894,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (bsize < BLOCK_16X16) continue; - tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + tmp_sad = vp9_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv); if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue; if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 09f61ead2..276022a56 100644 --- a/vp9/encoder/vp9_quantize.c +++ 
b/vp9/encoder/vp9_quantize.c @@ -204,10 +204,9 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); - case VPX_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + return q == 0 ? 64 : (quant < 2368 ? 84 : 80); } #else (void)bit_depth; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 9fbb48817..599337f80 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -48,18 +48,16 @@ #define MAX_BPB_FACTOR 50 #if CONFIG_VP9_HIGHBITDEPTH -#define ASSIGN_MINQ_TABLE(bit_depth, name) \ - do { \ - switch (bit_depth) { \ - case VPX_BITS_8: name = name##_8; break; \ - case VPX_BITS_10: name = name##_10; break; \ - case VPX_BITS_12: name = name##_12; break; \ - default: \ - assert(0 && \ - "bit_depth should be VPX_BITS_8, VPX_BITS_10" \ - " or VPX_BITS_12"); \ - name = NULL; \ - } \ +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case VPX_BITS_8: name = name##_8; break; \ + case VPX_BITS_10: name = name##_10; break; \ + default: \ + assert(bit_depth == VPX_BITS_12); \ + name = name##_12; \ + break; \ + } \ } while (0) #else #define ASSIGN_MINQ_TABLE(bit_depth, name) \ @@ -167,10 +165,9 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; case VPX_BITS_10: return vp9_ac_quant(qindex, 0, bit_depth) / 16.0; - case VPX_BITS_12: return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1.0; + assert(bit_depth == VPX_BITS_12); + return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; } #else return vp9_ac_quant(qindex, 0, 
bit_depth) / 4.0; diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 6b2306ce9..3407e74c6 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -69,10 +69,12 @@ static void fill_mode_costs(VP9_COMP *cpi) { const FRAME_CONTEXT *const fc = cpi->common.fc; int i, j; - for (i = 0; i < INTRA_MODES; ++i) - for (j = 0; j < INTRA_MODES; ++j) + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES; ++j) { vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], vp9_intra_mode_tree); + } + } vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); for (i = 0; i < INTRA_MODES; ++i) { @@ -82,9 +84,28 @@ static void fill_mode_costs(VP9_COMP *cpi) { fc->uv_mode_prob[i], vp9_intra_mode_tree); } - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { vp9_cost_tokens(cpi->switchable_interp_costs[i], fc->switchable_interp_prob[i], vp9_switchable_interp_tree); + } + + for (i = TX_8X8; i < TX_SIZES; ++i) { + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) { + const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs); + int k; + for (k = 0; k <= i; ++k) { + int cost = 0; + int m; + for (m = 0; m <= k - (k == i); ++m) { + if (m == k) + cost += vp9_cost_zero(tx_probs[m]); + else + cost += vp9_cost_one(tx_probs[m]); + } + cpi->tx_size_cost[i - 1][j][k] = cost; + } + } + } } static void fill_token_costs(vp9_coeff_cost *c, @@ -153,10 +174,10 @@ int64_t vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { switch (cpi->common.bit_depth) { case VPX_BITS_8: rdmult = 88 * q * q / 24; break; case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break; - case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(cpi->common.bit_depth == VPX_BITS_12); + rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); + break; } #else int64_t rdmult = 88 * q * 
q / 24; @@ -185,10 +206,10 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break; case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break; - case VPX_BITS_12: q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; break; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; + break; } #else (void)bit_depth; @@ -209,12 +230,11 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) { x->sadperbit16 = sad_per_bit16lut_10[qindex]; x->sadperbit4 = sad_per_bit4lut_10[qindex]; break; - case VPX_BITS_12: + default: + assert(cpi->common.bit_depth == VPX_BITS_12); x->sadperbit16 = sad_per_bit16lut_12[qindex]; x->sadperbit4 = sad_per_bit4lut_12[qindex]; break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); } #else (void)cpi; @@ -471,13 +491,13 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, for (i = 0; i < num_4x4_h; i += 4) t_left[i] = !!*(const uint32_t *)&left[i]; break; - case TX_32X32: + default: + assert(tx_size == TX_32X32); for (i = 0; i < num_4x4_w; i += 8) t_above[i] = !!*(const uint64_t *)&above[i]; for (i = 0; i < num_4x4_h; i += 8) t_left[i] = !!*(const uint64_t *)&left[i]; break; - default: assert(0 && "Invalid transform size."); break; } } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 39cd1d41c..e39df033a 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -543,8 +543,9 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = p->eobs[block]; - if (x->block_tx_domain) { + if (x->block_tx_domain && 
eob) { const int ss_txfrm_size = tx_size << 1; int64_t this_sse; const int shift = tx_size == TX_32X32 ? 0 : 2; @@ -584,14 +585,13 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const uint8_t *src = &p->src.buf[src_idx]; const uint8_t *dst = &pd->dst.buf[dst_idx]; const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const uint16_t *eob = &p->eobs[block]; unsigned int tmp; tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, blk_col, plane_bsize, tx_bsize); *out_sse = (int64_t)tmp * 16; - if (*eob) { + if (eob) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(16, uint16_t, recon16[1024]); uint8_t *recon = (uint8_t *)recon16; @@ -604,22 +604,22 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd); break; - case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); + default: + assert(tx_size == TX_32X32); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd); break; - default: assert(0 && "Invalid transform size"); } } recon = CONVERT_TO_BYTEPTR(recon16); @@ -627,16 +627,16 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #endif // CONFIG_VP9_HIGHBITDEPTH vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { - 
case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; - case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; - case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, *eob); break; - case TX_4X4: + case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break; + case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break; + case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break; + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is significant (not just an optimization) for // the lossless case. - x->inv_txfm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, eob); break; - default: assert(0 && "Invalid transform size"); break; } #if CONFIG_VP9_HIGHBITDEPTH } @@ -845,21 +845,20 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX } }; - int n, m; + int n; int s0, s1; int64_t best_rd = INT64_MAX; TX_SIZE best_tx = max_tx_size; int start_tx, end_tx; - - const vpx_prob *tx_probs = - get_tx_probs(max_tx_size, get_tx_size_context(xd), &cm->fc->tx_probs); + const int tx_size_ctx = get_tx_size_context(xd); assert(skip_prob > 0); s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); if (cm->tx_mode == TX_MODE_SELECT) { start_tx = max_tx_size; - end_tx = 0; + end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0); + if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx); } else { TX_SIZE chosen_tx_size = VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -868,13 +867,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, } for (n = start_tx; n >= end_tx; n--) { - int r_tx_size = 0; - for (m = 0; m <= n - (n == (int)max_tx_size); m++) { - if (m == n) - r_tx_size += vp9_cost_zero(tx_probs[m]); - else - r_tx_size += vp9_cost_one(tx_probs[m]); - } + const int r_tx_size = 
cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n]; txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0, bs, n, cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; @@ -1469,11 +1462,11 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, if (is_compound) this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int; break; - case ZEROMV: + default: + assert(mode == ZEROMV); this_mv[0].as_int = 0; if (is_compound) this_mv[1].as_int = 0; break; - default: break; } mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index c0f985cbd..90da68726 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -817,6 +817,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; sf->tx_size_search_breakout = 1; + sf->tx_size_search_depth = 2; sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 15e8dacbd..946bf0545 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -272,6 +272,9 @@ typedef struct SPEED_FEATURES { // for intra and model coefs for the rest. TX_SIZE_SEARCH_METHOD tx_size_search_method; + // How many levels of tx size to search, starting from the largest. + int tx_size_search_depth; + // Low precision 32x32 fdct keeps everything in 16 bits and thus is less // precise but significantly faster than the non lp version. 
int use_lp32x32fdct; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 42a197769..07d1995a8 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -46,9 +46,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->last_layer_dropped[sl] = 0; svc->drop_spatial_layer[sl] = 0; svc->ext_frame_flags[sl] = 0; - svc->ext_lst_fb_idx[sl] = 0; - svc->ext_gld_fb_idx[sl] = 1; - svc->ext_alt_fb_idx[sl] = 2; + svc->lst_fb_idx[sl] = 0; + svc->gld_fb_idx[sl] = 1; + svc->alt_fb_idx[sl] = 2; svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; @@ -407,6 +407,40 @@ void get_layer_resolution(const int width_org, const int height_org, *height_out = h; } +void reset_fb_idx_unused(VP9_COMP *const cpi) { + // If a reference frame is not referenced or refreshed, then set the + // fb_idx for that reference to the first one used/referenced. + // This is to avoid setting fb_idx for a reference to a slot that is not + // used/needed (i.e., since that reference is not referenced or refreshed). 
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME first_ref = 0; + int first_fb_idx = 0; + int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + first_ref = ref_frame; + first_fb_idx = fb_idx[ref_frame - 1]; + break; + } + } + if (first_ref > 0) { + if (first_ref != LAST_FRAME && + !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && + !cpi->ext_refresh_last_frame) + cpi->lst_fb_idx = first_fb_idx; + else if (first_ref != GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !cpi->ext_refresh_golden_frame) + cpi->gld_fb_idx = first_fb_idx; + else if (first_ref != ALTREF_FRAME && + !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && + !cpi->ext_refresh_alt_ref_frame) + cpi->alt_fb_idx = first_fb_idx; + } +} + // The function sets proper ref_frame_flags, buffer indices, and buffer update // variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering // scheme. 
@@ -510,6 +544,8 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -569,6 +605,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -601,6 +639,28 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering( } else { cpi->gld_fb_idx = 0; } + + reset_fb_idx_unused(cpi); +} + +void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int sl = svc->spatial_layer_id; + svc->lst_fb_idx[sl] = cpi->lst_fb_idx; + svc->gld_fb_idx[sl] = cpi->gld_fb_idx; + svc->alt_fb_idx[sl] = cpi->alt_fb_idx; + + svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame; + svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; + svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; + svc->reference_last[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); + svc->reference_golden[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); + svc->reference_altref[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { @@ -637,18 +697,30 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; sl = cpi->svc.spatial_layer_id; vp9_apply_encoding_flags(cpi, cpi->svc.ext_frame_flags[sl]); - cpi->lst_fb_idx = cpi->svc.ext_lst_fb_idx[sl]; - cpi->gld_fb_idx = 
cpi->svc.ext_gld_fb_idx[sl]; - cpi->alt_fb_idx = cpi->svc.ext_alt_fb_idx[sl]; + cpi->lst_fb_idx = cpi->svc.lst_fb_idx[sl]; + cpi->gld_fb_idx = cpi->svc.gld_fb_idx[sl]; + cpi->alt_fb_idx = cpi->svc.alt_fb_idx[sl]; } } // Reset the drop flags for all spatial layers, on the base layer. if (cpi->svc.spatial_layer_id == 0) { - int i; - for (i = 0; i < cpi->svc.number_spatial_layers; i++) { - cpi->svc.drop_spatial_layer[i] = 0; + vp9_zero(cpi->svc.drop_spatial_layer); + // TODO(jianj/marpan): Investigate why setting cpi->svc.lst/gld/alt_fb_idx + // causes an issue with frame dropping and temporal layers, when the frame + // flags are passed via the encode call (bypass mode). Issue is that we're + // resetting ext_refresh_frame_flags_pending to 0 on frame drops. + if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + memset(&cpi->svc.lst_fb_idx, -1, sizeof(cpi->svc.lst_fb_idx)); + memset(&cpi->svc.gld_fb_idx, -1, sizeof(cpi->svc.lst_fb_idx)); + memset(&cpi->svc.alt_fb_idx, -1, sizeof(cpi->svc.lst_fb_idx)); } + vp9_zero(cpi->svc.update_last); + vp9_zero(cpi->svc.update_golden); + vp9_zero(cpi->svc.update_altref); + vp9_zero(cpi->svc.reference_last); + vp9_zero(cpi->svc.reference_golden); + vp9_zero(cpi->svc.reference_altref); } lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id * @@ -714,6 +786,15 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (cpi->svc.spatial_layer_id == 0) cpi->svc.high_source_sad_superframe = 0; + if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id]) { + // For fixed/non-flexible mode, if the previous frame (same spatial layer + // from previous superframe) was dropped, make sure the lst_fb_idx + // for this frame corresponds to the buffer index updated on (last) encoded + // TL0 frame (with same spatial layer). 
+ cpi->lst_fb_idx = cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id]; + } + if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; @@ -799,3 +880,63 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { } } } + +void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + // Check for disabling inter-layer (spatial) prediction, if + // svc.disable_inter_layer_pred is set. If the previous spatial layer was + // dropped then disable the prediction from this (scaled) reference. + if ((cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || + cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF || + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id - 1]) { + MV_REFERENCE_FRAME ref_frame; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + const struct scale_factors *const scale_fac = + &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) + cpi->ref_frame_flags &= (~flag_list[ref_frame]); + } + } + } + // Check for disabling inter-layer prediction if + // INTER_LAYER_PRED_ON_CONSTRAINED is enabled. + // If the reference for inter-layer prediction (the reference that is scaled) + // is not the previous spatial layer from the same superframe, then we + // disable inter-layer prediction. + if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_ON_CONSTRAINED) { + // We only use LAST and GOLDEN for prediction in real-time mode, so we + // check both here. 
+ MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) { + struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) { + // If this reference was updated on the previous spatial layer of the + // current superframe, then we keep this reference (don't disable). + // Otherwise we disable the inter-layer prediction. + // This condition is verified by checking if the current frame buffer + // index is equal to any of the slots for the previous spatial layer, + // and if so, check if that slot was updated/refreshed. If that is the + // case, then this reference is valid for inter-layer prediction under + // the mode INTER_LAYER_PRED_ON_CONSTRAINED. + int fb_idx = + ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx; + int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG; + int sl = cpi->svc.spatial_layer_id; + int disable = 1; + if ((fb_idx == cpi->svc.lst_fb_idx[sl - 1] && + cpi->svc.update_last[sl - 1]) || + (fb_idx == cpi->svc.gld_fb_idx[sl - 1] && + cpi->svc.update_golden[sl - 1]) || + (fb_idx == cpi->svc.alt_fb_idx[sl - 1] && + cpi->svc.update_altref[sl - 1])) + disable = 0; + if (disable) cpi->ref_frame_flags &= (~ref_flag); + } + } + } +} diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 022fd00f7..617717049 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -20,9 +20,16 @@ extern "C" { #endif typedef enum { + // Inter-layer prediction is on on all frames. INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. INTER_LAYER_PRED_OFF, - INTER_LAYER_PRED_OFF_NONKEY + // Inter-layer prediction is off on non-key frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. 
+ INTER_LAYER_PRED_ON_CONSTRAINED } INTER_LAYER_PRED; typedef struct { @@ -86,9 +93,9 @@ typedef struct SVC { // Frame flags and buffer indexes for each spatial layer, set by the // application (external settings). int ext_frame_flags[VPX_MAX_LAYERS]; - int ext_lst_fb_idx[VPX_MAX_LAYERS]; - int ext_gld_fb_idx[VPX_MAX_LAYERS]; - int ext_alt_fb_idx[VPX_MAX_LAYERS]; + int lst_fb_idx[VPX_MAX_LAYERS]; + int gld_fb_idx[VPX_MAX_LAYERS]; + int alt_fb_idx[VPX_MAX_LAYERS]; int ref_frame_index[REF_FRAMES]; int force_zero_mode_spatial_ref; int current_superframe; @@ -123,6 +130,18 @@ typedef struct SVC { // currently checked for each superframe prior to encoding, on the full // resolution source. int high_source_sad_superframe; + + // Flags used to get SVC pattern info. + uint8_t update_last[VPX_SS_MAX_LAYERS]; + uint8_t update_golden[VPX_SS_MAX_LAYERS]; + uint8_t update_altref[VPX_SS_MAX_LAYERS]; + uint8_t reference_last[VPX_SS_MAX_LAYERS]; + uint8_t reference_golden[VPX_SS_MAX_LAYERS]; + uint8_t reference_altref[VPX_SS_MAX_LAYERS]; + + // Keep track of the frame buffer index updated/refreshed on the base + // temporal superframe. 
+ uint8_t fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; } SVC; struct VP9_COMP; @@ -170,6 +189,8 @@ struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, // Start a frame and initialize svc parameters int vp9_svc_start_frame(struct VP9_COMP *const cpi); +void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); + int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); @@ -178,6 +199,8 @@ void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); +void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e006606f1..5eaa7a18a 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1487,6 +1487,25 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); + int sl; + for (sl = 0; sl <= cpi->svc.spatial_layer_id; sl++) { + data->update_last[sl] = cpi->svc.update_last[sl]; + data->update_golden[sl] = cpi->svc.update_golden[sl]; + data->update_alt_ref[sl] = cpi->svc.update_altref[sl]; + data->reference_last[sl] = cpi->svc.reference_last[sl]; + data->reference_golden[sl] = cpi->svc.reference_golden[sl]; + data->reference_alt_ref[sl] = cpi->svc.reference_altref[sl]; + data->lst_fb_idx[sl] = cpi->svc.lst_fb_idx[sl]; + data->gld_fb_idx[sl] = cpi->svc.gld_fb_idx[sl]; + data->alt_fb_idx[sl] = cpi->svc.alt_fb_idx[sl]; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -1494,9 +1513,9 @@ static vpx_codec_err_t 
ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, int sl; for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl]; - cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl]; - cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl]; - cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl]; + cpi->svc.lst_fb_idx[sl] = data->lst_fb_idx[sl]; + cpi->svc.gld_fb_idx[sl] = data->gld_fb_idx[sl]; + cpi->svc.alt_fb_idx[sl] = data->alt_fb_idx[sl]; } return VPX_CODEC_OK; } @@ -1613,6 +1632,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, + { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index f409844b5..b201d96f4 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -620,6 +620,13 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_SET_SVC_FRAME_DROP_LAYER, + + /*!\brief Codec control function to get the refresh and reference flags and + * the buffer indices, up to the last encoded spatial layer. + * + * Supported in codecs: VP9 + */ + VP9E_GET_SVC_REF_FRAME_CONFIG, }; /*!\brief vpx 1-D scaling mode @@ -757,10 +764,18 @@ typedef struct vpx_svc_layer_id { * */ typedef struct vpx_svc_ref_frame_config { - int frame_flags[VPX_TS_MAX_LAYERS]; /**< Frame flags. */ - int lst_fb_idx[VPX_TS_MAX_LAYERS]; /**< Last buffer index. */ - int gld_fb_idx[VPX_TS_MAX_LAYERS]; /**< Golden buffer index. */ - int alt_fb_idx[VPX_TS_MAX_LAYERS]; /**< Altref buffer index. */ + // TODO(jianj/marpan): Remove the usage of frame_flags, instead use the + // update and reference flags. + int frame_flags[VPX_SS_MAX_LAYERS]; /**< Frame flags. */ + int lst_fb_idx[VPX_SS_MAX_LAYERS]; /**< Last buffer index. */ + int gld_fb_idx[VPX_SS_MAX_LAYERS]; /**< Golden buffer index. */ + int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. 
*/ + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ + int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ } vpx_svc_ref_frame_config_t; /*!\brief VP9 svc frame dropping mode. @@ -927,6 +942,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *) #define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER +VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) +#define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG + /*!\endcond */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 4017e5719..8c08017b6 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -63,7 +63,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (11 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (12 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! 
\brief Encoder capabilities bitfield * diff --git a/vpx_dsp/arm/avg_pred_neon.c b/vpx_dsp/arm/avg_pred_neon.c index 1370ec2d2..5afdece0a 100644 --- a/vpx_dsp/arm/avg_pred_neon.c +++ b/vpx_dsp/arm/avg_pred_neon.c @@ -17,8 +17,8 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { if (width > 8) { - int x, y; - for (y = 0; y < height; ++y) { + int x, y = height; + do { for (x = 0; x < width; x += 16) { const uint8x16_t p = vld1q_u8(pred + x); const uint8x16_t r = vld1q_u8(ref + x); @@ -28,28 +28,38 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, comp += width; pred += width; ref += ref_stride; - } + } while (--y); + } else if (width == 8) { + int i = width * height; + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); } else { - int i; - for (i = 0; i < width * height; i += 16) { + int i = width * height; + assert(width == 4); + do { const uint8x16_t p = vld1q_u8(pred); uint8x16_t r; - if (width == 4) { - r = load_unaligned_u8q(ref, ref_stride); - ref += 4 * ref_stride; - } else { - const uint8x8_t r_0 = vld1_u8(ref); - const uint8x8_t r_1 = vld1_u8(ref + ref_stride); - assert(width == 8); - r = vcombine_u8(r_0, r_1); - ref += 2 * ref_stride; - } + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; r = vrhaddq_u8(r, p); vst1q_u8(comp, r); pred += 16; comp += 16; - } + i -= 16; + } while (i); } } diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 12c0a54c8..6745464d7 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -101,9 +101,9 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { if (stride == 4) return vld1_u8(buf); memcpy(&a, buf, 4); buf 
+= stride; - a_u32 = vld1_lane_u32(&a, a_u32, 0); + a_u32 = vset_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); - a_u32 = vld1_lane_u32(&a, a_u32, 1); + a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); } @@ -127,16 +127,16 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 0); + a_u32 = vsetq_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 1); + a_u32 = vsetq_lane_u32(a, a_u32, 1); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 2); + a_u32 = vsetq_lane_u32(a, a_u32, 2); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 3); + a_u32 = vsetq_lane_u32(a, a_u32, 3); return vreinterpretq_u8_u32(a_u32); } diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index b04de3aff..535ec0f0d 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -10,64 +10,152 @@ #include <arm_neon.h> +#include <assert.h> #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" +static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, + const void *const buf1) { + uint32_t a; + uint32x2_t aa = vdup_n_u32(0); + memcpy(&a, buf0, 4); + aa = vset_lane_u32(a, aa, 0); + memcpy(&a, buf1, 4); + aa = vset_lane_u32(a, aa, 1); + return vreinterpret_u8_u32(aa); +} + +static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride, + const uint8_t *const ref[4], const int ref_stride, + const int height, uint32_t *const res) { + int i; + uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + uint16x4_t a[2]; + uint32x4_t r; + + assert(!((intptr_t)src % sizeof(uint32_t))); + assert(!(src_stride % sizeof(uint32_t))); + + for (i = 0; i < height; ++i) { + const uint8x8_t s = vreinterpret_u8_u32( + vld1_dup_u32((const 
uint32_t *)(src + i * src_stride))); + const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride, + ref[1] + i * ref_stride); + const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride, + ref[3] + i * ref_stride); + abs[0] = vabal_u8(abs[0], s, ref01); + abs[1] = vabal_u8(abs[1], s, ref23); + } + + a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); + a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); + r = vpaddlq_u16(vcombine_u16(a[0], a[1])); + vst1q_u32(res, r); +} + void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); - for (i = 0; i < 4; ++i) { - const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); - } + sad4x_4d(src, src_stride, ref, ref_stride, 4, res); } void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); - const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); - for (i = 0; i < 4; ++i) { - const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); - const uint8x16_t ref_1 = - load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); - uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); - abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); - abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); - abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); - res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); - } + sad4x_4d(src, src_stride, ref, ref_stride, 8, res); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) +static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); + vst1q_u32(res, r); } -static INLINE void sad8x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +// Can handle 1024 pixels' sad sum (such as 32x32) +static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); + const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); + const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) +static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const 
uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); + const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); + const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); + const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); + const uint32x2_t c0 = vpadd_u32(b0, b1); + const uint32x2_t c1 = vpadd_u32(b2, b3); + vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 4096 pixels' sad sum (such as 64x64) +static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x4_t a4 = vpaddlq_u16(sum[4]); + const uint32x4_t a5 = vpaddlq_u16(sum[5]); + const uint32x4_t a6 = vpaddlq_u16(sum[6]); + const uint32x4_t a7 = vpaddlq_u16(sum[7]); + const uint32x4_t b0 = vaddq_u32(a0, a1); + const uint32x4_t b1 = vaddq_u32(a2, a3); + const uint32x4_t b2 = vaddq_u32(a4, a5); + const uint32x4_t b3 = vaddq_u32(a6, a7); + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + vst1q_u32(res, vcombine_u32(d0, d1)); +} + +static INLINE void sad8x_4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res, const int height) { int i, j; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(a); - a += a_stride; + const uint8x8_t s = vld1_u8(src); + 
src += src_stride; for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], a_u8, b_u8); + const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); + ref_loop[j] += ref_stride; + sum[j] = vabal_u8(sum[j], s, b_u8); } } - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, @@ -88,28 +176,33 @@ void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, sad8x_4d(src, src_stride, ref, ref_stride, res, 16); } -static INLINE void sad16x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src, + uint16x8_t *const sum) { + const uint8x16_t r = vld1q_u8(ref); + *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r)); + *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r)); +} + +static INLINE void sad16x_4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res, const int height) { int i, j; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - a += a_stride; + const uint8x16_t s = vld1q_u8(src); + src += src_stride; for (j = 0; j < 4; ++j) { - const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + sad16_neon(ref_loop[j], s, &sum[j]); + ref_loop[j] += ref_stride; } } - for (j = 0; j < 4; ++j) { - result[j] = 
vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, @@ -130,113 +223,152 @@ void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, sad16x_4d(src, src_stride, ref, ref_stride, res, 32); } -static INLINE void sad32x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { - int i, j; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad32x_4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + const int height, uint16x8_t *const sum) { + int i; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + + sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - a += a_stride; - for (j = 0; j < 4; ++j) { - const uint8x16_t b_0 = vld1q_u8(b_loop[j]); - const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); - } - } + uint8x16_t s; - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + s = vld1q_u8(src + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + 
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + src += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; } } void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 16); + uint16x8_t sum[4]; + sad32x_4d(src, src_stride, ref, ref_stride, 16, sum); + sad_512_pel_final_neon(sum, res); } void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 32); + uint16x8_t sum[4]; + sad32x_4d(src, src_stride, ref, ref_stride, 32, sum); + sad_1024_pel_final_neon(sum, res); } void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 64); + uint16x8_t sum[4]; + sad32x_4d(src, src_stride, ref, ref_stride, 64, sum); + sad_2048_pel_final_neon(sum, res); } -static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, - const uint8x16_t b_0, const uint8x16_t b_1, - uint16x8_t *sum) { - *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); - *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); - *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); - *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); -} +//////////////////////////////////////////////////////////////////////////////// -static INLINE void sad64x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { int i; - uint16x8_t sum_0 = vdupq_n_u16(0); - uint16x8_t sum_1 = vdupq_n_u16(0); - uint16x8_t sum_2 = vdupq_n_u16(0); - uint16x8_t 
sum_3 = vdupq_n_u16(0); - uint16x8_t sum_4 = vdupq_n_u16(0); - uint16x8_t sum_5 = vdupq_n_u16(0); - uint16x8_t sum_6 = vdupq_n_u16(0); - uint16x8_t sum_7 = vdupq_n_u16(0); - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - a += a_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); - sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), - &sum_1); - b_loop[0] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); - sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), - &sum_3); - b_loop[1] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); - sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), - &sum_5); - b_loop[2] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); - sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48), - &sum_7); - b_loop[3] += b_stride; - } + for (i = 0; i < 32; ++i) { + uint8x16_t s; - result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); - result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); - result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); - result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); -} + s = vld1q_u8(src + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); -void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const 
ref[4], int ref_stride, - uint32_t *res) { - sad64x_4d(src, src_stride, ref, ref_stride, res, 32); + s = vld1q_u8(src + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + s = vld1q_u8(src + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); + + s = vld1q_u8(src + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); + + src += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + sad_2048_pel_final_neon(sum, res); } void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad64x_4d(src, src_stride, ref, ref_stride, res, 64); + int i; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0) }; + + for (i = 0; i < 64; ++i) { + uint8x16_t s; + + s = vld1q_u8(src + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); + + s = vld1q_u8(src + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); + + s = vld1q_u8(src + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); + 
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); + + s = vld1q_u8(src + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); + + src += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + sad_4096_pel_final_neon(sum, res); } diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c index ce81fb630..7e4610d2e 100644 --- a/vpx_dsp/arm/subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -9,71 +9,72 @@ */ #include <arm_neon.h> +#include <assert.h> #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; + int r = rows, c; if (cols > 16) { - for (r = 0; r < rows; ++r) { + do { for (c = 0; c < cols; c += 32) { - const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); - const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); - const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = - vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = - vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = - vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = - vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); - vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); - vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); - vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + const uint8x16_t s0 = 
vld1q_u8(&src[c + 0]); + const uint8x16_t s1 = vld1q_u8(&src[c + 16]); + const uint8x16_t p0 = vld1q_u8(&pred[c + 0]); + const uint8x16_t p1 = vld1q_u8(&pred[c + 16]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0)); + const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1)); + const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3)); } diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 8) { - for (r = 0; r < rows; ++r) { - const uint8x16_t v_src = vld1q_u8(&src[0]); - const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = - vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = - vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); - vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + do { + const uint8x16_t s = vld1q_u8(&src[0]); + const uint8x16_t p = vld1q_u8(&pred[0]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 4) { - for (r = 0; r < rows; ++r) { - const uint8x8_t v_src = vld1_u8(&src[0]); - const uint8x8_t v_pred = vld1_u8(&pred[0]); - const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + do { + const uint8x8_t s = vld1_u8(&src[0]); + const uint8x8_t p = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(s, p); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += 
pred_stride; src += src_stride; - } + } while (--r); } else { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } + assert(cols == 4); + do { + const uint8x8_t s = load_unaligned_u8(src, src_stride); + const uint8x8_t p = load_unaligned_u8(pred, pred_stride); + const uint16x8_t d = vsubl_u8(s, p); + vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d))); + vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d))); + diff += 2 * diff_stride; + pred += 2 * pred_stride; + src += 2 * src_stride; + r -= 2; + } while (r); } } diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index d74fe0cde..c09841223 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -30,15 +30,6 @@ static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) { vreinterpret_u32_u64(vget_high_u64(c))); } -static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a, - const uint16x8_t b) { - const uint32x4_t c = vpaddlq_u16(a); - const uint32x4_t d = vpadalq_u16(c, b); - const uint64x2_t e = vpaddlq_u32(d); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)), - vreinterpret_u32_u64(vget_high_u64(e))); -} - static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { const uint64x2_t b = vpaddlq_u32(a); return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), diff --git a/vpx_dsp/arm/sum_squares_neon.c b/vpx_dsp/arm/sum_squares_neon.c new file mode 100644 index 000000000..8942ba83b --- /dev/null +++ b/vpx_dsp/arm/sum_squares_neon.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" + +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) { + int64x1_t s2; + + if (size == 4) { + int16x4_t s[4]; + int32x4_t s0; + uint32x2_t s1; + + s[0] = vld1_s16(src + 0 * stride); + s[1] = vld1_s16(src + 1 * stride); + s[2] = vld1_s16(src + 2 * stride); + s[3] = vld1_s16(src + 3 * stride); + s0 = vmull_s16(s[0], s[0]); + s0 = vmlal_s16(s0, s[1], s[1]); + s0 = vmlal_s16(s0, s[2], s[2]); + s0 = vmlal_s16(s0, s[3], s[3]); + s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)), + vget_high_u32(vreinterpretq_u32_s32(s0))); + s2 = vpaddl_u32(s1); + } else { + int r = size; + uint64x2_t s1 = vdupq_n_u64(0); + + do { + int c = size; + int32x4_t s0 = vdupq_n_s32(0); + const int16_t *src_t = src; + + do { + int16x8_t s[8]; + + s[0] = vld1q_s16(src_t + 0 * stride); + s[1] = vld1q_s16(src_t + 1 * stride); + s[2] = vld1q_s16(src_t + 2 * stride); + s[3] = vld1q_s16(src_t + 3 * stride); + s[4] = vld1q_s16(src_t + 4 * stride); + s[5] = vld1q_s16(src_t + 5 * stride); + s[6] = vld1q_s16(src_t + 6 * stride); + s[7] = vld1q_s16(src_t + 7 * stride); + s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0])); + s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1])); + s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2])); + s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3])); + s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4])); + s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5])); + s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6])); + s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7])); + s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0])); + s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1])); + s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2])); + s0 = vmlal_s16(s0, vget_high_s16(s[3]), 
vget_high_s16(s[3])); + s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4])); + s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5])); + s0 = vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6])); + s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7])); + src_t += 8; + c -= 8; + } while (c); + + s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0))); + s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0))); + src += 8 * stride; + r -= 8; + } while (r); + + s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1)); + } + + return vget_lane_u64(s2, 0); +} diff --git a/vpx_dsp/mips/vpx_convolve8_mmi.c b/vpx_dsp/mips/vpx_convolve8_mmi.c index 0cfb81e4d..ba9ceb866 100644 --- a/vpx_dsp/mips/vpx_convolve8_mmi.c +++ b/vpx_dsp/mips/vpx_convolve8_mmi.c @@ -254,6 +254,89 @@ static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, ); } +static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[2]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], 
%[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t 
dst_stride, const InterpKernel *filter, int y0_q4, @@ -362,52 +445,63 @@ void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - double ftmp[4]; - uint32_t tmp[2]; - src_stride -= w; - dst_stride -= w; + int x, y; + (void)filter; (void)x0_q4; (void)x_step_q4; (void)y0_q4; (void)y_step_q4; - __asm__ volatile( - "move %[tmp1], %[width] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "li %[tmp0], 0x10001 \n\t" - MMI_MTC1(%[tmp0], %[ftmp3]) - "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" - "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t" - "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" - "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" - "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" - "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" - "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" - "swc1 %[ftmp1], 0x00(%[dst]) \n\t" - MMI_ADDIU(%[width], %[width], -0x04) - MMI_ADDIU(%[dst], %[dst], 0x04) - MMI_ADDIU(%[src], %[src], 0x04) - "bnez %[width], 1b \n\t" - "move %[width], %[tmp1] \n\t" - MMI_ADDU(%[dst], %[dst], %[dst_stride]) - MMI_ADDU(%[src], %[src], %[src_stride]) - MMI_ADDIU(%[height], %[height], -0x01) - "bnez %[height], 1b \n\t" - : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), - [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), - [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), - [src]"+&r"(src), [dst]"+&r"(dst), - [width]"+&r"(w), [height]"+&r"(h) - : [src_stride]"r"((mips_reg)src_stride), - [dst_stride]"r"((mips_reg)dst_stride) - : "memory" - ); + if (w & 0x03) { + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; + dst += dst_stride; + } + } else { + double ftmp[4]; + uint32_t tmp[2]; + src_stride -= w; + 
dst_stride -= w; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp3]) + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + } } static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, @@ -481,6 +575,29 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum 
+= src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int32_t x_step_q4, int y0_q4, @@ -553,6 +670,21 @@ void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h); } +void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); + else + convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -580,8 +712,5 @@ void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - if (w & 0x03) - vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); - else - vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); + vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } diff --git a/vpx_dsp/ppc/inv_txfm_vsx.c b/vpx_dsp/ppc/inv_txfm_vsx.c index f095cb0a4..6603b85ac 100644 --- a/vpx_dsp/ppc/inv_txfm_vsx.c +++ b/vpx_dsp/ppc/inv_txfm_vsx.c @@ -76,6 +76,8 @@ static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; +static uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 
0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; #define ROUND_SHIFT_INIT \ const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ const uint32x4_t shift14 = vec_splat_u32(14); @@ -107,6 +109,15 @@ static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; out1 = vec_sub(step0, step1); \ out1 = vec_perm(out1, out1, mask0); +#define PACK_STORE(v0, v1) \ + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); \ + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); \ + output_v = vec_packsu(tmp16_0, tmp16_1); \ + \ + vec_vsx_st(output_v, 0, tmp_dest); \ + for (i = 0; i < 4; i++) \ + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; @@ -114,13 +125,10 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; - uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; int16x8_t v0 = load_tran_low(0, input); int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); int16x8_t t0 = vec_mergeh(v0, v1); int16x8_t t1 = vec_mergel(v0, v1); - uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -130,6 +138,7 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + uint8x16_t output_v; uint8_t tmp_dest[16]; ROUND_SHIFT_INIT @@ -148,13 +157,8 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, PIXEL_ADD4(v0, t_out0); PIXEL_ADD4(v1, t_out1); - tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); - tmp16_1 = 
vec_add(vec_perm(d_u2, d_u3, mask1), v1); - output_v = vec_packsu(tmp16_0, tmp16_1); - vec_vsx_st(output_v, 0, tmp_dest); - for (i = 0; i < 4; i++) - for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + PACK_STORE(v0, v1); } #define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ @@ -1062,3 +1066,67 @@ void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, ADD_STORE_BLOCK(src2, 16); ADD_STORE_BLOCK(src3, 24); } + +#define TRANSFORM_COLS \ + v32_a = vec_add(v32_a, v32_c); \ + v32_d = vec_sub(v32_d, v32_b); \ + v32_e = vec_sub(v32_a, v32_d); \ + v32_e = vec_sra(v32_e, one); \ + v32_b = vec_sub(v32_e, v32_b); \ + v32_c = vec_sub(v32_e, v32_c); \ + v32_a = vec_sub(v32_a, v32_b); \ + v32_d = vec_add(v32_d, v32_c); \ + v_a = vec_packs(v32_a, v32_b); \ + v_c = vec_packs(v32_c, v32_d); + +#define TRANSPOSE_WHT \ + tmp_a = vec_mergeh(v_a, v_c); \ + tmp_c = vec_mergel(v_a, v_c); \ + v_a = vec_mergeh(tmp_a, tmp_c); \ + v_c = vec_mergel(tmp_a, tmp_c); + +void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t v_a = load_tran_low(0, input); + int16x8_t v_c = load_tran_low(8 * sizeof(*input), input); + int16x8_t tmp_a, tmp_c; + uint16x8_t two = vec_splat_u16(2); + uint32x4_t one = vec_splat_u32(1); + int16x8_t tmp16_0, tmp16_1; + int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0); + int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1); + int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2); + int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + int i, j; + + v_a = vec_sra(v_a, two); + v_c = vec_sra(v_c, two); + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + + v32_d = 
vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + PACK_STORE(v_a, v_c); +} diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c new file mode 100644 index 000000000..e037f89e3 --- /dev/null +++ b/vpx_dsp/ppc/quantize_vsx.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vec_sra(b, vec_shift_sign_s16); + return vec_xor(vec_add(a, mask), mask); +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and return the high 16 bits of the intermediate integers. +static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { + // madds does ((A * B) >>15) + C, we need >> 16, so we perform an extra right + // shift. 
+ return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_s16); +} + +static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs, + int16x8_t round, int16x8_t quant, + int16x8_t quant_shift, bool16x8_t mask) { + int16x8_t rounded, qcoeff; + rounded = vec_vaddshs(coeff_abs, round); + qcoeff = vec_mulhi(rounded, quant); + qcoeff = vec_add(qcoeff, rounded); + qcoeff = vec_mulhi(qcoeff, quant_shift); + qcoeff = vec_sign(qcoeff, coeff); + return vec_and(qcoeff, mask); +} + +static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask, + const int16_t *iscan_ptr) { + bool16x8_t zero_coeff; + int16x8_t scan = vec_vsx_ld(0, iscan_ptr); + zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); + scan = vec_sub(scan, mask); + return vec_andc(scan, zero_coeff); +} + +// Compare packed 16-bit integers across a, and return the maximum value in +// every element. Returns a vector containing the biggest value across vector a. +static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + int16x8_t qcoeff, dqcoeff, eob; + + // First set of 8 coeff starts with DC + 7 AC + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff_abs = vec_abs(coeff); + bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin); + + (void)scan_ptr; + 
(void)skip_block; + assert(!skip_block); + + qcoeff = + quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask); + vec_vsx_st(qcoeff, 0, qcoeff_ptr); + + dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff, 0, dqcoeff_ptr); + + eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr); + + // All other sets of 8 coeffs will only contain AC + zbin = vec_splat(zbin, 1); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + dequant = vec_splat(dequant, 1); + quant_shift = vec_splat(quant_shift, 1); + + n_coeffs -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan_ptr += 8; + + coeff = vec_vsx_ld(0, coeff_ptr); + coeff_abs = vec_abs(coeff); + zero_mask = vec_cmpge(coeff_abs, zbin); + qcoeff = + quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask); + vec_vsx_st(qcoeff, 0, qcoeff_ptr); + + dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff, 0, dqcoeff_ptr); + + eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr)); + + n_coeffs -= 8; + } while (n_coeffs > 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h index f611d02d2..e2af55463 100644 --- a/vpx_dsp/ppc/types_vsx.h +++ b/vpx_dsp/ppc/types_vsx.h @@ -19,6 +19,7 @@ typedef vector signed short int16x8_t; typedef vector unsigned short uint16x8_t; typedef vector signed int int32x4_t; typedef vector unsigned int uint32x4_t; +typedef vector bool short bool16x8_t; #ifdef __clang__ static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, @@ -65,4 +66,17 @@ static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, #endif #endif +static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 }; +static const uint16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 }; +static const uint8x16_t vec_perm64 = { 0x08, 0x09, 
0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07 }; +static const uint8x16_t vec_perm32 = { 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03 }; +static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D, + 0x0E, 0x0F, 0x00, 0x01 }; + #endif // VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/vpx_dsp/sum_squares.c b/vpx_dsp/sum_squares.c index 7c535ac2d..b80cd588e 100644 --- a/vpx_dsp/sum_squares.c +++ b/vpx_dsp/sum_squares.c @@ -10,8 +10,7 @@ #include "./vpx_dsp_rtcd.h" -uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, - int size) { +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) { int r, c; uint64_t ss = 0; @@ -20,7 +19,7 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, const int16_t v = src[c]; ss += v * v; } - src += src_stride; + src += stride; } return ss; diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 167011034..cb06a476f 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -286,6 +286,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c +DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif @@ -312,6 +313,7 @@ ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c +DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4dbee088b..c36ec84b0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -363,7 +363,7 @@ add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t 
src_stride, specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/; +specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; @@ -378,7 +378,7 @@ add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; @@ -626,7 +626,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; - specialize qw/vpx_iwht4x4_16_add sse2/; + specialize qw/vpx_iwht4x4_16_add sse2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. 
@@ -699,7 +699,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b neon sse2 ssse3 avx/; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/; @@ -922,7 +922,7 @@ add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; -specialize qw/vpx_sum_squares_2d_i16 sse2 msa/; +specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; # # Structured Similarity (SSIM) diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 2ce738fb7..419f17863 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -15,6 +15,11 @@ #include "./vpx_config.h" +static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c index 026d0ca2f..9eaf6ee1b 
100644 --- a/vpx_dsp/x86/sum_squares_sse2.c +++ b/vpx_dsp/x86/sum_squares_sse2.c @@ -10,120 +10,96 @@ #include <assert.h> #include <emmintrin.h> -#include <stdio.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/mem_sse2.h" -static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, - int stride) { - const __m128i v_val_0_w = - _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); - const __m128i v_val_1_w = - _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); - const __m128i v_val_2_w = - _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); - const __m128i v_val_3_w = - _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - - const __m128i v_sum_d = - _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); - - return (uint64_t)_mm_cvtsi128_si32(v_sum_d); -} - -// TODO(jingning): Evaluate the performance impact here. -#ifdef __GNUC__ -// This prevents GCC/Clang from inlining this function into -// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack -// maintenance instructions in the common case of 4x4. 
-__attribute__((noinline)) -#endif -static uint64_t -vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) { - int r, c; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); - __m128i v_acc_q = _mm_setzero_si128(); - - for (r = 0; r < size; r += 8) { - __m128i v_acc_d = _mm_setzero_si128(); - - for (c = 0; c < size; c += 8) { - const int16_t *b = src + c; - const __m128i v_val_0_w = - _mm_load_si128((const __m128i *)(b + 0 * stride)); - const __m128i v_val_1_w = - _mm_load_si128((const __m128i *)(b + 1 * stride)); - const __m128i v_val_2_w = - _mm_load_si128((const __m128i *)(b + 2 * stride)); - const __m128i v_val_3_w = - _mm_load_si128((const __m128i *)(b + 3 * stride)); - const __m128i v_val_4_w = - _mm_load_si128((const __m128i *)(b + 4 * stride)); - const __m128i v_val_5_w = - _mm_load_si128((const __m128i *)(b + 5 * stride)); - const __m128i v_val_6_w = - _mm_load_si128((const __m128i *)(b + 6 * stride)); - const __m128i v_val_7_w = - _mm_load_si128((const __m128i *)(b + 7 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); - const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); - const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); - const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); - const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); - - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); - - v_acc_d = 
_mm_add_epi32(v_acc_d, v_sum_0123_d); - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); - } - - v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { + // Over 75% of all calls are with size == 4. + if (size == 4) { + __m128i s[2], sq[2], ss; + + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + s[0] = loadh_epi64(s[0], src + 1 * stride); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + s[1] = loadh_epi64(s[1], src + 3 * stride); + sq[0] = _mm_madd_epi16(s[0], s[0]); + sq[1] = _mm_madd_epi16(s[1], s[1]); + sq[0] = _mm_add_epi32(sq[0], sq[1]); + ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8)); + ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32)); + + return (uint64_t)_mm_cvtsi128_si32(ss); + } else { + // Generic case + int r = size; + const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); - src += 8 * stride; - } + assert(size % 8 == 0); - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + do { + int c = 0; + __m128i v_acc_d = _mm_setzero_si128(); + + do { + const int16_t *const b = src + c; + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const 
__m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); + c += 8; + } while (c < size); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 8 * stride; + r -= 8; + } while (r); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); #if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(v_acc_q); + return (uint64_t)_mm_cvtsi128_si64(v_acc_q); #else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_acc_q); - return tmp; - } + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); + return tmp; + } #endif -} - -uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { - // 4 elements per row only requires half an XMM register, so this - // must be a special case, but also note that over 75% of all calls - // are with size == 4, so it is also the common case. 
- if (size == 4) { - return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); - } else { - // Generic case - assert(size % 8 == 0); - return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); } } |