13 files changed, 431 insertions, 115 deletions
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index 499c42ac3..13c46a57e 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -15,6 +15,75 @@
 #include "vpx/vpx_integer.h"
 
 //------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *above, const uint8_t *left,
+                          int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    sum_top = vcombine_u16(p1, p1);
+  }
+
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);  // left border
+    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    sum_left = vcombine_u16(p1, p1);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 3);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 2);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 2);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 4; ++i) {
+      vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+    }
+  }
+}
+
+void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_4x4(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  dc_4x4(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  dc_4x4(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_4x4(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
 // DC 8x8
 
 // 'do_above' and 'do_left' facilitate branch removal when inlined.
@@ -161,6 +230,144 @@ void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
   dc_16x16(dst, stride, NULL, NULL, 0, 0);
 }
 
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x16_t A0 = vld1q_u8(above);  // top row
+    const uint8x16_t A1 = vld1q_u8(above + 16);
+    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
+    const uint16x8_t p1 = vpaddlq_u8(A1);
+    const uint16x8_t p2 = vaddq_u16(p0, p1);
+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+    const uint16x4_t p4 = vpadd_u16(p3, p3);
+    const uint16x4_t p5 = vpadd_u16(p4, p4);
+    sum_top = vcombine_u16(p5, p5);
+  }
+
+  if (do_left) {
+    const uint8x16_t L0 = vld1q_u8(left);  // left row
+    const uint8x16_t L1 = vld1q_u8(left + 16);
+    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
+    const uint16x8_t p1 = vpaddlq_u8(L1);
+    const uint16x8_t p2 = vaddq_u16(p0, p1);
+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+    const uint16x4_t p4 = vpadd_u16(p3, p3);
+    const uint16x4_t p5 = vpadd_u16(p4, p4);
+    sum_left = vcombine_u16(p5, p5);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 6);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 5);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 5);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 32; ++i) {
+      vst1q_u8(dst + i * stride, dc);
+      vst1q_u8(dst + i * stride + 16, dc);
+    }
+  }
+}
+
+void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_32x32(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  dc_32x32(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  dc_32x32(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_32x32(dst, stride, NULL, NULL, 0, 0);
+}
+
+// -----------------------------------------------------------------------------
+
+void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);
+  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  (void)left;
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+  dst[3 * stride + 3] = above[7];
+}
+
+// -----------------------------------------------------------------------------
+
+void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
+  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+  const uint32x2_t zero = vdup_n_u32(0);
+  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
+  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
+
 #if !HAVE_NEON_ASM
 
 void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index b256d4af5..14654a5ab 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -12,8 +12,8 @@
 
 #include "vp9/common/vp9_filter.h"
 
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_bilinear_filters[SUBPEL_SHIFTS]) = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+                bilinear_filters[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
   { 0, 0, 0, 120,   8, 0, 0, 0 },
   { 0, 0, 0, 112,  16, 0, 0, 0 },
@@ -33,8 +33,8 @@ DECLARE_ALIGNED(256, const InterpKernel,
 };
 
 // Lagrangian interpolation filter
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8[SUBPEL_SHIFTS]) = {
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
   { -1,   3, -10, 122,  18,  -6,   2,  0},
@@ -54,8 +54,8 @@ DECLARE_ALIGNED(256, const InterpKernel,
 };
 
 // DCT based filter
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
   {-2,   5, -13, 125,  17,  -6,   3, -1},
@@ -75,8 +75,8 @@ DECLARE_ALIGNED(256, const InterpKernel,
 };
 
 // freqmultiplier = 0.5
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
   { 0,  0,  0, 128,  0,  0,  0,  0},
   {-3, -1, 32,  64, 38,  1, -3,  0},
   {-2, -2, 29,  63, 41,  2, -3,  0},
@@ -96,15 +96,15 @@ DECLARE_ALIGNED(256, const InterpKernel,
 };
 
 
-static const InterpKernel* vp9_filter_kernels[4] = {
-  vp9_sub_pel_filters_8,
-  vp9_sub_pel_filters_8lp,
-  vp9_sub_pel_filters_8s,
-  vp9_bilinear_filters
+static const InterpKernel* filter_kernels[4] = {
+  sub_pel_filters_8,
+  sub_pel_filters_8lp,
+  sub_pel_filters_8s,
+  bilinear_filters
 };
 
 const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter) {
   assert(filter != SWITCHABLE);
-  return vp9_filter_kernels[filter];
+  return filter_kernels[filter];
 }
 
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 650f4adde..1e9acb8d5 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -533,8 +533,8 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
 }
 intra_pred_no_4x4(d117)
 
-void vp9_d135_predictor_4x4(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left) {
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
   const int I = left[0];
   const int J = left[1];
   const int K = left[2];
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 2a3b054ad..dac142397 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -60,7 +60,7 @@ add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_4x4 neon/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
@@ -72,7 +72,7 @@ add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vp9_d117_predictor_4x4/;
 
 add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d135_predictor_4x4/;
+specialize qw/vp9_d135_predictor_4x4 neon/;
 
 add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
@@ -84,16 +84,16 @@ add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, cons
 specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_4x4 dspr2 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -201,16 +201,16 @@ add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64";
 
 add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 #
 # Loopfilter
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index df70d48f8..0e4d8638b 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -39,6 +39,8 @@ struct CYCLIC_REFRESH {
   int rdmult;
   // Cyclic refresh map.
   signed char *map;
+  // Map of the last q a block was coded at.
+  uint8_t *last_coded_q_map;
   // Thresholds applied to the projected rate/distortion of the coding block,
   // when deciding whether block should be refreshed.
   int64_t thresh_rate_sb;
@@ -51,11 +53,11 @@ struct CYCLIC_REFRESH {
   // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
   double rate_boost_fac;
   double low_content_avg;
-  int qindex_delta_seg1;
-  int qindex_delta_seg2;
+  int qindex_delta[3];
 };
 
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  size_t last_coded_q_map_size;
   CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
   if (cr == NULL)
     return NULL;
@@ -65,12 +67,21 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
     vpx_free(cr);
     return NULL;
   }
+  last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
+  cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size);
+  if (cr->last_coded_q_map == NULL) {
+    vpx_free(cr);
+    return NULL;
+  }
+  assert(MAXQ <= 255);
+  memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
 
   return cr;
 }
 
 void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   vpx_free(cr->map);
+  vpx_free(cr->last_coded_q_map);
   vpx_free(cr);
 }
 
@@ -159,11 +170,11 @@ int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
                              correction_factor, cm->bit_depth) +
                              weight_segment1 *
       vp9_estimate_bits_at_q(cm->frame_type,
-                             cm->base_qindex + cr->qindex_delta_seg1, mbs,
+                             cm->base_qindex + cr->qindex_delta[1], mbs,
                              correction_factor, cm->bit_depth) +
                              weight_segment2 *
       vp9_estimate_bits_at_q(cm->frame_type,
-                             cm->base_qindex + cr->qindex_delta_seg2, mbs,
+                             cm->base_qindex + cr->qindex_delta[2], mbs,
                              correction_factor, cm->bit_depth));
   return estimated_bits;
 }
@@ -248,9 +259,16 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
   // copy mbmi->segment_id into global segmentation map.
   for (y = 0; y < ymis; y++)
     for (x = 0; x < xmis; x++) {
-      cr->map[block_index + y * cm->mi_cols + x] = new_map_value;
-      cpi->segmentation_map[block_index + y * cm->mi_cols + x] =
-          mbmi->segment_id;
+      int map_offset = block_index + y * cm->mi_cols + x;
+      cr->map[map_offset] = new_map_value;
+      cpi->segmentation_map[map_offset] = mbmi->segment_id;
+      // Inter skip blocks were clearly not coded at the current qindex, so
+      // don't update the map for them. For cases where motion is non-zero or
+      // the reference frame isn't the previous frame, the previous value in
+      // the map for this spatial location is not entirely correct.
+      if (!is_inter_block(mbmi) || !skip)
+        cr->last_coded_q_map[map_offset] = clamp(
+            cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
     }
 }
 
@@ -384,6 +402,10 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
     int sb_col_index = i - sb_row_index * sb_cols;
     int mi_row = sb_row_index * MI_BLOCK_SIZE;
     int mi_col = sb_col_index * MI_BLOCK_SIZE;
+    int qindex_thresh =
+        cpi->oxcf.content == VP9E_CONTENT_SCREEN
+            ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+            : 0;
     assert(mi_row >= 0 && mi_row < cm->mi_rows);
     assert(mi_col >= 0 && mi_col < cm->mi_cols);
     bl_index = mi_row * cm->mi_cols + mi_col;
@@ -399,7 +421,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
         // for possible boost/refresh (segment 1). The segment id may get
         // reset to 0 later if block gets coded anything other than ZEROMV.
         if (cr->map[bl_index2] == 0) {
-          sum_map++;
+          if (cr->last_coded_q_map[bl_index2] > qindex_thresh)
+            sum_map++;
         } else if (cr->map[bl_index2] < 0) {
           cr->map[bl_index2]++;
         }
@@ -466,8 +489,11 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
     unsigned char *const seg_map = cpi->segmentation_map;
     memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
     vp9_disable_segmentation(&cm->seg);
-    if (cm->frame_type == KEY_FRAME)
+    if (cm->frame_type == KEY_FRAME) {
+      memset(cr->last_coded_q_map, MAXQ,
+             cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
       cr->sb_index = 0;
+    }
     return;
   } else {
     int qindex_delta = 0;
@@ -505,7 +531,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
 
     // Set the q delta for segment BOOST1.
     qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
-    cr->qindex_delta_seg1 = qindex_delta;
+    cr->qindex_delta[1] = qindex_delta;
 
     // Compute rd-mult for segment BOOST1.
     qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
@@ -518,7 +544,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
     qindex_delta = compute_deltaq(cpi, cm->base_qindex,
                                   MIN(CR_MAX_RATE_TARGET_RATIO,
                                   cr->rate_boost_fac * cr->rate_ratio_qdelta));
-    cr->qindex_delta_seg2 = qindex_delta;
+    cr->qindex_delta[2] = qindex_delta;
     vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
 
     // Update the segmentation and refresh map.
@@ -529,3 +555,10 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
 int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
   return cr->rdmult;
 }
+
+void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+  cr->sb_index = 0;
+}
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 99bb98ec8..29d2a91bc 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -75,6 +75,8 @@ void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi);
 
 int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
 
+void vp9_cyclic_refresh_reset_resize(struct VP9_COMP *const cpi);
+
 static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
   return segment_id == CR_SEGMENT_ID_BOOST1 ||
          segment_id == CR_SEGMENT_ID_BOOST2;
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index 437356681..3ef3882d2 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -115,33 +115,41 @@ void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
   }
 }
 
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
 int16_t vp9_satd_c(const int16_t *coeff, int length) {
   int i;
   int satd = 0;
   for (i = 0; i < length; ++i)
     satd += abs(coeff[i]);
 
+  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
   return (int16_t)satd;
 }
 
 // Integer projection onto row vectors.
-void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
+// height: value range {16, 32, 64}.
+void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
                        const int ref_stride, const int height) {
   int idx;
-  const int norm_factor = MAX(8, height >> 1);
+  const int norm_factor = height >> 1;
   for (idx = 0; idx < 16; ++idx) {
     int i;
     hbuf[idx] = 0;
+    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
     for (i = 0; i < height; ++i)
       hbuf[idx] += ref[i * ref_stride];
+    // hbuf[idx]: 9 bit, dynamic range [0, 510].
     hbuf[idx] /= norm_factor;
     ++ref;
   }
 }
 
+// width: value range {16, 32, 64}.
 int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
   int idx;
   int16_t sum = 0;
+  // sum: 14 bit, dynamic range [0, 16320]
   for (idx = 0; idx < width; ++idx)
     sum += ref[idx];
   return sum;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5299244d9..dcddefc28 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2220,66 +2220,6 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
   *max_block_size = max_size;
 }
 
-static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
-                                 MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col,
-                                 BLOCK_SIZE *min_block_size,
-                                 BLOCK_SIZE *max_block_size) {
-  VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO **mi_8x8 = xd->mi;
-  const int left_in_image = xd->left_available && mi_8x8[-1];
-  const int above_in_image = xd->up_available && mi_8x8[-xd->mi_stride];
-  int row8x8_remaining = tile->mi_row_end - mi_row;
-  int col8x8_remaining = tile->mi_col_end - mi_col;
-  int bh, bw;
-  BLOCK_SIZE min_size = BLOCK_32X32;
-  BLOCK_SIZE max_size = BLOCK_8X8;
-  int bsl = mi_width_log2_lookup[BLOCK_64X64];
-  const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
-                       get_chessboard_index(cm->current_video_frame)) & 0x1;
-  // Trap case where we do not have a prediction.
-  if (search_range_ctrl &&
-      (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
-    int block;
-    MODE_INFO **mi;
-    BLOCK_SIZE sb_type;
-
-    // Find the min and max partition sizes used in the left SB64.
-    if (left_in_image) {
-      MODE_INFO *cur_mi;
-      mi = &mi_8x8[-1];
-      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
-        cur_mi = mi[block * xd->mi_stride];
-        sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0;
-        min_size = MIN(min_size, sb_type);
-        max_size = MAX(max_size, sb_type);
-      }
-    }
-    // Find the min and max partition sizes used in the above SB64.
-    if (above_in_image) {
-      mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
-      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
-        sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0;
-        min_size = MIN(min_size, sb_type);
-        max_size = MAX(max_size, sb_type);
-      }
-    }
-
-    min_size = min_partition_size[min_size];
-    max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining,
-                                   &bh, &bw);
-    min_size = MIN(min_size, max_size);
-    min_size = MAX(min_size, BLOCK_8X8);
-    max_size = MIN(max_size, BLOCK_32X32);
-  } else {
-    min_size = BLOCK_8X8;
-    max_size = BLOCK_32X32;
-  }
-
-  *min_block_size = min_size;
-  *max_block_size = max_size;
-}
-
 // TODO(jingning) refactor functions setting partition search range
 static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
                                 int mi_row, int mi_col, BLOCK_SIZE bsize,
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index ecd60d772..d708b8319 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1596,6 +1596,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
                   sizeof(*cm->frame_contexts)));
 
   cpi->use_svc = 0;
+  cpi->resize_state = 0;
+  cpi->resize_avg_qp = 0;
+  cpi->resize_buffer_underflow = 0;
   cpi->common.buffer_pool = pool;
 
   init_config(cpi, oxcf);
@@ -3032,6 +3035,31 @@ static void set_frame_size(VP9_COMP *cpi) {
                          oxcf->scaled_frame_height);
   }
 
+  if (oxcf->pass == 0 &&
+      oxcf->rc_mode == VPX_CBR &&
+      !cpi->use_svc &&
+      oxcf->resize_mode == RESIZE_DYNAMIC) {
+      if (cpi->resize_state == 1) {
+        oxcf->scaled_frame_width =
+            (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+        oxcf->scaled_frame_height =
+            (cm->height * cpi->resize_scale_num) /cpi->resize_scale_den;
+      } else if (cpi->resize_state == -1) {
+        // Go back up to original size.
+        oxcf->scaled_frame_width = oxcf->width;
+        oxcf->scaled_frame_height = oxcf->height;
+      }
+      if (cpi->resize_state != 0) {
+        // There has been a change in frame size.
+        vp9_set_size_literal(cpi,
+                             oxcf->scaled_frame_width,
+                             oxcf->scaled_frame_height);
+
+        // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+        set_mv_search_params(cpi);
+      }
+  }
+
   if ((oxcf->pass == 2) &&
       (!cpi->use_svc ||
           (is_two_pass_svc(cpi) &&
@@ -3962,7 +3990,6 @@ static void check_src_altref(VP9_COMP *cpi,
 extern double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
                                  const unsigned char *img2, int img2_pitch,
                                  int width, int height);
-#endif
 
 static void adjust_image_stat(double y, double u, double v, double all,
                               ImageStat *s) {
@@ -3972,6 +3999,7 @@ static void adjust_image_stat(double y, double u, double v, double all,
   s->stat[ALL] += all;
   s->worst = MIN(s->worst, all);
 }
+#endif  // CONFIG_INTERNAL_STATS
 
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 4d2a186e8..2b0da103f 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -479,6 +479,12 @@ typedef struct VP9_COMP {
 #endif
 
   int resize_pending;
+  int resize_state;
+  int resize_scale_num;
+  int resize_scale_den;
+  int resize_avg_qp;
+  int resize_buffer_underflow;
+  int resize_count;
 
   // VAR_BASED_PARTITION thresholds
   // 0 - threshold_64x64; 1 - threshold_32x32;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 234272697..081b99f9f 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -286,20 +286,20 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
   bestmv->row *= 8;                                                        \
   bestmv->col *= 8;
 
-static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
-                                              const MV *bestmv,
-                                              const MV *ref_mv,
-                                              int error_per_bit,
-                                              const vp9_variance_fn_ptr_t *vfp,
-                                              const uint8_t *const src,
-                                              const int src_stride,
-                                              const uint8_t *const y,
-                                              int y_stride,
-                                              const uint8_t *second_pred,
-                                              int w, int h, int offset,
-                                              int *mvjcost, int *mvcost[2],
-                                              unsigned int *sse1,
-                                              int *distortion) {
+static unsigned int setup_center_error(const MACROBLOCKD *xd,
+                                       const MV *bestmv,
+                                       const MV *ref_mv,
+                                       int error_per_bit,
+                                       const vp9_variance_fn_ptr_t *vfp,
+                                       const uint8_t *const src,
+                                       const int src_stride,
+                                       const uint8_t *const y,
+                                       int y_stride,
+                                       const uint8_t *second_pred,
+                                       int w, int h, int offset,
+                                       int *mvjcost, int *mvcost[2],
+                                       unsigned int *sse1,
+                                       int *distortion) {
   unsigned int besterr;
 #if CONFIG_VP9_HIGHBITDEPTH
   if (second_pred != NULL) {
@@ -610,7 +610,7 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
   return besterr;
 }
 
-const MV search_step_table[12] = {
+static const MV search_step_table[12] = {
     // left, right, up, down
     {0, -4}, {0, 4}, {-4, 0}, {4, 0},
     {0, -2}, {0, 2}, {-2, 0}, {2, 0},
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 32682fe74..425073fcd 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1596,6 +1596,7 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
     target = calc_pframe_target_size_one_pass_cbr(cpi);
 
   vp9_rc_set_frame_target(cpi, target);
+  cpi->resize_state = vp9_resize_one_pass_cbr(cpi);
 }
 
 int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
@@ -1756,3 +1757,92 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
     vbr_rate_correction(cpi, &target_rate);
   vp9_rc_set_frame_target(cpi, target_rate);
 }
+
+// Check if we should resize, based on average QP from past x frames.
+// Only allow for resize at most one scale down for now, scaling factor is 2.
+int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int resize_now = 0;
+  cpi->resize_scale_num = 1;
+  cpi->resize_scale_den = 1;
+  // Don't resize on key frame; reset the counters on key frame.
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->resize_avg_qp = 0;
+    cpi->resize_count = 0;
+    return 0;
+  }
+  // Resize based on average QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
+    const int window = (int)(5 * cpi->framerate);
+    cpi->resize_avg_qp += cm->base_qindex;
+    if (cpi->rc.buffer_level < 0)
+      ++cpi->resize_buffer_underflow;
+    ++cpi->resize_count;
+    // Check for resize action every "window" frames.
+    if (cpi->resize_count == window) {
+      int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+      // Resize down if buffer level has underflowed sufficent amount in past
+      // window, and we are at original resolution.
+      // Resize back up if average QP is low, and we are currently in a resized
+      // down state.
+      if (cpi->resize_state == 0 &&
+          cpi->resize_buffer_underflow > (cpi->resize_count >> 3)) {
+        resize_now = 1;
+      } else if (cpi->resize_state == 1 &&
+                 avg_qp < 40 * cpi->rc.worst_quality / 100) {
+        resize_now = -1;
+      }
+      // Reset for next window measurement.
+      cpi->resize_avg_qp = 0;
+      cpi->resize_count = 0;
+      cpi->resize_buffer_underflow = 0;
+    }
+  }
+  // If decision is to resize, reset some quantities, and check is we should
+  // reduce rate correction factor,
+  if (resize_now != 0) {
+    int target_bits_per_frame;
+    int active_worst_quality;
+    int qindex;
+    int tot_scale_change;
+    // For now, resize is by 1/2 x 1/2.
+    cpi->resize_scale_num = 1;
+    cpi->resize_scale_den = 2;
+    tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+        (cpi->resize_scale_num * cpi->resize_scale_num);
+    // Reset buffer level to optimal, update target size.
+    rc->buffer_level = rc->optimal_buffer_level;
+    rc->bits_off_target = rc->optimal_buffer_level;
+    rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+    // Reset cyclic refresh parameters.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
+      vp9_cyclic_refresh_reset_resize(cpi);
+    // Get the projected qindex, based on the scaled target frame size (scaled
+    // so target_bits_per_mb in vp9_rc_regulate_q will be correct target).
+    target_bits_per_frame = (resize_now == 1) ?
+        rc->this_frame_target * tot_scale_change :
+        rc->this_frame_target / tot_scale_change;
+    active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+    qindex = vp9_rc_regulate_q(cpi,
+                               target_bits_per_frame,
+                               rc->best_quality,
+                               active_worst_quality);
+    // If resize is down, check if projected q index is close to worst_quality,
+    // and if so, reduce the rate correction factor (since likely can afford
+    // lower q for resized frame).
+    if (resize_now == 1 &&
+        qindex > 90 * cpi->rc.worst_quality / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+    }
+    // If resize is back up, check if projected q index is too much above the
+    // current base_qindex, and if so, reduce the rate correction factor
+    // (since prefer to keep q for resized frame at least close to previous q).
+    if (resize_now == -1 &&
+       qindex > 130 * cm->base_qindex / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
+    }
+  }
+  return resize_now;
+}
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index e12d200be..a10836c74 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -245,6 +245,8 @@ void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi,
 
 void vp9_set_target_rate(struct VP9_COMP *cpi);
 
+int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif