diff options
-rwxr-xr-x | configure | 1 | ||||
-rw-r--r-- | vp9/common/vp9_blockd.h | 22 | ||||
-rw-r--r-- | vp9/common/vp9_enums.h | 1 | ||||
-rw-r--r-- | vp9/common/vp9_loopfilter_filters.c | 34 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 57 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_intrin_mmx.c | 16 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 16 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_mmx.asm | 343 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_sse2.asm | 366 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_x86.h | 8 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodframe.c | 4 | ||||
-rw-r--r-- | vp9/decoder/vp9_detokenize.c | 7 | ||||
-rw-r--r-- | vp9/encoder/vp9_block.h | 8 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 781 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 297 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.h | 16 | ||||
-rw-r--r-- | vp9/encoder/vp9_onyx_if.c | 40 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 199 | ||||
-rw-r--r-- | vp9/encoder/vp9_variance.h | 46 | ||||
-rw-r--r-- | vp9/encoder/vp9_variance_c.c | 256 |
21 files changed, 1113 insertions, 1407 deletions
@@ -247,6 +247,7 @@ EXPERIMENT_LIST=" multiple_arf non420 ab4x4 + comp_inter_joint_search " CONFIG_LIST=" external_build diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index ab9e28dcc..004054d10 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -70,17 +70,17 @@ typedef enum { } INTERPOLATIONFILTERTYPE; typedef enum { - DC_PRED, /* average of above and left pixels */ - V_PRED, /* vertical prediction */ - H_PRED, /* horizontal prediction */ - D45_PRED, /* Directional 45 deg prediction [anti-clockwise from 0 deg hor] */ - D135_PRED, /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */ - D117_PRED, /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */ - D153_PRED, /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */ - D27_PRED, /* Directional 22 deg prediction [anti-clockwise from 0 deg hor] */ - D63_PRED, /* Directional 67 deg prediction [anti-clockwise from 0 deg hor] */ - TM_PRED, /* Truemotion prediction */ - I4X4_PRED, /* 4x4 based prediction, each 4x4 has its own mode */ + DC_PRED, // Average of above and left pixels + V_PRED, // Vertical + H_PRED, // Horizontal + D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi) + D135_PRED, // Directional 135 deg = 180 - 45 + D117_PRED, // Directional 117 deg = 180 - 63 + D153_PRED, // Directional 153 deg = 180 - 27 + D27_PRED, // Directional 27 deg = round(arctan(1/2) * 180/pi) + D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) + TM_PRED, // True-motion + I4X4_PRED, // Each 4x4 subblock has its own mode NEARESTMV, NEARMV, ZEROMV, diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 166319565..2f6707487 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -36,6 +36,7 @@ typedef enum BLOCK_SIZE_TYPE { BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64, + BLOCK_SIZE_TYPES } BLOCK_SIZE_TYPE; typedef enum PARTITION_TYPE { diff --git a/vp9/common/vp9_loopfilter_filters.c 
b/vp9/common/vp9_loopfilter_filters.c index 15785f581..bf97589a9 100644 --- a/vp9/common/vp9_loopfilter_filters.c +++ b/vp9/common/vp9_loopfilter_filters.c @@ -282,29 +282,6 @@ static INLINE void simple_filter(int8_t mask, *op0 = signed_char_clamp(p0 + filter2) ^ 0x80; } -void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, int p, - const uint8_t *blimit) { - int i = 0; - - do { - const int8_t mask = simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p], - s[0 * p], s[1 * p]); - simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); - ++s; - } while (++i < 16); -} - -void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s, int p, - const uint8_t *blimit) { - int i = 0; - - do { - const int8_t mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); - simple_filter(mask, s - 2, s - 1, s, s + 1); - s += p; - } while (++i < 16); -} - /* Vertical MB Filtering */ void vp9_loop_filter_mbv_c(uint8_t *y_ptr, uint8_t *u_ptr, uint8_t *v_ptr, int y_stride, int uv_stride, @@ -392,11 +369,6 @@ void vp9_loop_filter_bh8x8_c(uint8_t *y, uint8_t *u, uint8_t *v, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp9_loop_filter_bhs_c(uint8_t *y, int y_stride, const uint8_t *blimit) { - vp9_loop_filter_simple_horizontal_edge_c(y + 4 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_c(y + 8 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_c(y + 12 * y_stride, y_stride, blimit); -} void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v, int y_stride, int uv_stride, @@ -413,12 +385,6 @@ void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp9_loop_filter_bvs_c(uint8_t *y, int y_stride, const uint8_t *blimit) { - vp9_loop_filter_simple_vertical_edge_c(y + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_c(y + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_c(y + 12, y_stride, blimit); -} - static INLINE void wide_mbfilter(int8_t mask, uint8_t 
hev, uint8_t flat, uint8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 75e36040c..02d32530a 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -128,30 +128,6 @@ specialize vp9_loop_filter_bh sse2 prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi" specialize vp9_loop_filter_bh8x8 sse2 -prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_mbv mmx sse2 -vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c -vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx -vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2 - -prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_mbh mmx sse2 -vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c -vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx -vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2 - -prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_bv mmx sse2 -vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c -vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx -vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2 - -prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_bh mmx sse2 -vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c -vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx -vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2 - prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi" specialize vp9_lpf_mbh_w sse2 @@ 
-337,41 +313,74 @@ vp9_variance4x4_mmx=vp9_variance4x4_mmx prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance64x64 sse2 +prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance64x64 + prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance32x64 +prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance32x64 + prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance64x32 +prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance64x32 + prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance32x16 +prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance32x16 + prototype unsigned int 
vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x32 +prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance16x32 + prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance32x32 sse2 +prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance32x32 + prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 +prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance16x16 + prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance8x16 sse2 mmx vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt +prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance8x16 + prototype unsigned int 
vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt +prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance16x8 + prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance8x8 sse2 mmx vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt +prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance8x8 + prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance4x4 sse2 mmx vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt +prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance4x4 + prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad64x64 sse2 diff --git a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c index 2be9e3179..7e6c4be2c 100644 --- 
a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c @@ -35,16 +35,6 @@ void vp9_loop_filter_bh_mmx(unsigned char *y_ptr, } -void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, - y_stride, blimit); -} - /* Vertical B Filtering */ void vp9_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -66,9 +56,3 @@ void vp9_loop_filter_bv_mmx(unsigned char *y_ptr, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); -} diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 08447a62d..7982ca6a2 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -1115,16 +1115,6 @@ void vp9_loop_filter_bh_sse2(unsigned char *y_ptr, v_ptr + 4 * uv_stride); } -void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, - y_stride, blimit); -} - /* Vertical B Filtering */ void vp9_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -1143,9 +1133,3 @@ void vp9_loop_filter_bv_sse2(unsigned char *y_ptr, v_ptr + 
4); } -void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); -} diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm index ceffdf558..4ebb51b77 100644 --- a/vp9/common/x86/vp9_loopfilter_mmx.asm +++ b/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -593,349 +593,6 @@ sym(vp9_loop_filter_vertical_edge_mmx): pop rbp ret - -;void vp9_loop_filter_simple_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE -sym(vp9_loop_filter_simple_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - mov rcx, 2 ; count -.nexts8_h: - mov rdx, arg(2) ;blimit ; get blimit - movq mm3, [rdx] ; - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movq mm1, [rsi+2*rax] ; p1 - movq mm0, [rdi] ; q1 - movq mm2, mm1 - movq mm7, mm0 - movq mm4, mm0 - psubusb mm0, mm1 ; q1-=p1 - psubusb mm1, mm4 ; p1-=q1 - por mm1, mm0 ; abs(p1-q1) - pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm1, 1 ; abs(p1-q1)/2 - - movq mm5, [rsi+rax] ; p0 - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - movq mm6, mm5 ; p0 - psubusb mm5, mm4 ; p0-=q0 - psubusb mm4, mm6 ; q0-=p0 - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm3, mm3 - pcmpeqb mm5, mm3 - - ; start work on filters - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) - pand mm5, mm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - movq mm1, mm5 ; get a copy of filters - psraw mm1, 11 ; arithmetic shift right 11 - psllw mm1, 8 ; shift left 8 to put it back - - por mm0, mm1 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - 
psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .nexts8_h - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_simple_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE -sym(vp9_loop_filter_simple_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - lea rsi, [rsi + rax*4- 2]; ; - mov rcx, 2 ; count -.nexts8_v: - - lea rdi, [rsi + rax]; - movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 - - movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - - movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 - movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 - - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - movq mm5, mm4 ; 53 43 52 42 51 41 50 40 - - punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 - punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 - - neg rax - - movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 - movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 - - punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 - movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 - - movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 - punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 - - movq mm2, mm0 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 13 03 12 02 11 01 10 00 - - punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 - movq mm3, mm2 ; 33 23 13 03 32 22 12 02 - - punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 - punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 - - punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 - - - ; calculate mask - movq mm6, mm0 ; p1 - movq mm7, mm3 ; q1 - psubusb mm7, mm6 ; q1-=p1 - psubusb mm6, mm3 ; p1-=q1 - por mm6, mm7 ; abs(p1-q1) - pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm6, 1 ; abs(p1-q1)/2 - - movq mm5, mm1 ; p0 - movq mm4, mm2 ; q0 - - psubusb mm5, mm2 ; p0-=q0 - psubusb mm4, mm1 ; q0-=p0 - - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] - - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm7, mm7 - pcmpeqb mm5, mm7 ; mm5 = mask - - ; start work on filters - movq t0, mm0 - movq t1, mm3 - - pxor mm0, 
[GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm0, mm3 ; p1 - q1 - movq mm6, mm1 ; p0 - - movq mm7, mm2 ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm7 ; offseted ; q0 - - psubsb mm7, mm6 ; q0 - p0 - paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) - - pand mm5, mm0 ; mask filter values we don't care about - - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - movq mm7, mm5 ; get a copy of filters - psraw mm7, 11 ; arithmetic shift right 11 - psllw mm7, 8 ; shift left 8 to put it back - - por mm0, mm7 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0sz add - pxor mm3, [GLOBAL(t80)] ; unoffset - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - - movq mm0, t0 - movq mm4, t1 - - ; mm0 = 70 60 50 40 30 20 10 00 - ; mm6 = 71 61 51 41 31 21 11 01 - ; mm3 = 72 62 52 42 32 22 12 02 - ; mm4 = 73 63 53 43 33 23 13 03 - ; transpose back to write out - - movq mm1, mm0 ; - punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 - - punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 - movq mm2, mm3 ; - - punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 - movq mm5, mm1 ; 71 70 61 60 51 50 41 40 - - punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 - movq mm6, mm0 ; 31 30 21 20 11 10 01 00 - - punpcklwd mm0, mm2 ; 13 12 11 10 03 02 
01 00 - punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 - - movd [rsi+rax*4], mm0 ; write 03 02 01 00 - punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 - - psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 - punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 - - movd [rdi+rax*4], mm0 ; write 13 12 11 10 - movd [rsi+rax*2], mm6 ; write 23 22 21 20 - - psrlq mm6, 32 ; 33 32 31 30 - movd [rsi], mm1 ; write 43 42 41 40 - - movd [rsi + rax], mm6 ; write 33 32 31 30 - neg rax - - movd [rsi + rax*2], mm5 ; write 63 62 61 60 - psrlq mm1, 32 ; 53 52 51 50 - - movd [rdi], mm1 ; write out 53 52 51 50 - psrlq mm5, 32 ; 73 72 71 70 - - movd [rdi + rax*2], mm5 ; write 73 72 71 70 - - lea rsi, [rsi+rax*8] ; next 8 - - dec rcx - jnz .nexts8_v - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - -;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, -; int y_stride, -; loop_filter_info *lfi) -;{ -; -; -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -;} - SECTION_RODATA align 16 tfe: diff --git a/vp9/common/x86/vp9_loopfilter_sse2.asm b/vp9/common/x86/vp9_loopfilter_sse2.asm index ae4c60f53..74236cfbb 100644 --- a/vp9/common/x86/vp9_loopfilter_sse2.asm +++ b/vp9/common/x86/vp9_loopfilter_sse2.asm @@ -845,372 +845,6 @@ sym(vp9_loop_filter_vertical_edge_uv_sse2): pop rbp ret -;void vp9_loop_filter_simple_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE -sym(vp9_loop_filter_simple_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) 
;src_pixel_step ; destination pitch? - mov rdx, arg(2) ;blimit - movdqa xmm3, XMMWORD PTR [rdx] - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movdqa xmm1, [rsi+2*rax] ; p1 - movdqa xmm0, [rdi] ; q1 - movdqa xmm2, xmm1 - movdqa xmm7, xmm0 - movdqa xmm4, xmm0 - psubusb xmm0, xmm1 ; q1-=p1 - psubusb xmm1, xmm4 ; p1-=q1 - por xmm1, xmm0 ; abs(p1-q1) - pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw xmm1, 1 ; abs(p1-q1)/2 - - movdqa xmm5, [rsi+rax] ; p0 - movdqa xmm4, [rsi] ; q0 - movdqa xmm0, xmm4 ; q0 - movdqa xmm6, xmm5 ; p0 - psubusb xmm5, xmm4 ; p0-=q0 - psubusb xmm4, xmm6 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm3, xmm3 - pcmpeqb xmm5, xmm3 - - ; start work on filters - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb xmm2, xmm7 ; p1 - q1 - - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) - pand xmm5, xmm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - movdqa xmm1, xmm5 ; get a copy of filters - psraw xmm1, 11 ; arithmetic shift right 11 - psllw xmm1, 8 ; shift left 8 to put it back - - por xmm0, xmm1 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0 add - pxor xmm3, [GLOBAL(t80)] ; unoffset - movdqa [rsi], xmm3 ; write back - - ; now do 
+3 side - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result - - - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset - movdqa [rsi+rax], xmm6 ; write back - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_simple_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE -sym(vp9_loop_filter_simple_vertical_edge_sse2): - push rbp ; save old base pointer value. - mov rbp, rsp ; set new base pointer value. - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx ; save callee-saved reg - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - lea rsi, [rsi - 2 ] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 - movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 - movd xmm2, [rdi] ; 13 12 11 10 - movd xmm3, [rcx] ; 53 52 51 50 - punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 - punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 - - movd xmm4, [rsi + rax*2] ; 23 22 21 20 - movd xmm5, [rdx + rax*2] ; 63 62 61 60 - movd xmm6, [rdi + rax*2] ; 33 32 31 30 - movd xmm7, [rcx + rax*2] ; 73 72 71 70 - punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 - punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 - - punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 - punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 - - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - - movdqa xmm2, xmm0 - punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - - movdqa t0, xmm0 ; save to t0 - movdqa t1, xmm2 ; save to t1 - - lea rsi, [rsi + rax*8] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm4, [rsi] ; 83 82 81 80 - movd xmm1, [rdx] ; c3 c2 c1 c0 - movd xmm6, [rdi] ; 93 92 91 90 - movd xmm3, [rcx] ; d3 d2 d1 d0 - punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 - punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 - - movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 - movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 - movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 - movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 - punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 - punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 - - punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 - punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 - - movdqa xmm1, xmm4 - punpcklwd xmm4, xmm0 ; b3 a3 
93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 - - movdqa xmm6, xmm4 - punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - - movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - - punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - ; calculate mask - movdqa xmm6, xmm0 ; p1 - movdqa xmm7, xmm3 ; q1 - psubusb xmm7, xmm0 ; q1-=p1 - psubusb xmm6, xmm3 ; p1-=q1 - por xmm6, xmm7 ; abs(p1-q1) - pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw xmm6, 1 ; abs(p1-q1)/2 - - movdqa xmm5, xmm1 ; p0 - movdqa xmm4, xmm2 ; q0 - psubusb xmm5, xmm2 ; p0-=q0 - psubusb xmm4, xmm1 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit - movdqa xmm7, XMMWORD PTR [rdx] - - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm7, xmm7 - pcmpeqb xmm5, xmm7 ; mm5 = mask - - ; start work on filters - movdqa t0, xmm0 - movdqa t1, xmm3 - - pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb xmm0, xmm3 ; p1 - q1 - movdqa xmm6, xmm1 ; p0 - - movdqa xmm7, xmm2 ; q0 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm7 ; offseted ; q0 - - psubsb xmm7, xmm6 ; q0 - p0 - paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb xmm0, xmm7 ; p1 - q1 + 2 * 
(q0 - p0) - paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) - - pand xmm5, xmm0 ; mask filter values we don't care about - - - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - - movdqa xmm7, xmm5 ; get a copy of filters - psraw xmm7, 11 ; arithmetic shift right 11 - - psllw xmm7, 8 ; shift left 8 to put it back - por xmm0, xmm7 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0sz add - pxor xmm3, [GLOBAL(t80)] ; unoffset q0 - - ; now do +3 side - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - movdqa xmm0, xmm5 ; get a copy of filters - - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result - - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset p0 - - movdqa xmm0, t0 ; p1 - movdqa xmm4, t1 ; q1 - - ; transpose back to write out - ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - - movdqa xmm5, xmm3 - punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 - punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - - 
; write out order: xmm0 xmm2 xmm1 xmm3 - lea rdx, [rsi + rax*4] - - movd [rsi], xmm1 ; write the second 8-line result - psrldq xmm1, 4 - movd [rdi], xmm1 - psrldq xmm1, 4 - movd [rsi + rax*2], xmm1 - psrldq xmm1, 4 - movd [rdi + rax*2], xmm1 - - movd [rdx], xmm3 - psrldq xmm3, 4 - movd [rcx], xmm3 - psrldq xmm3, 4 - movd [rdx + rax*2], xmm3 - psrldq xmm3, 4 - movd [rcx + rax*2], xmm3 - - neg rax - lea rsi, [rsi + rax*8] - neg rax - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd [rsi], xmm0 ; write the first 8-line result - psrldq xmm0, 4 - movd [rdi], xmm0 - psrldq xmm0, 4 - movd [rsi + rax*2], xmm0 - psrldq xmm0, 4 - movd [rdi + rax*2], xmm0 - - movd [rdx], xmm2 - psrldq xmm2, 4 - movd [rcx], xmm2 - psrldq xmm2, 4 - movd [rdx + rax*2], xmm2 - psrldq xmm2, 4 - movd [rcx + rax*2], xmm2 - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - SECTION_RODATA align 16 tfe: diff --git a/vp9/common/x86/vp9_loopfilter_x86.h b/vp9/common/x86/vp9_loopfilter_x86.h index 46a6202d2..fb5af05f7 100644 --- a/vp9/common/x86/vp9_loopfilter_x86.h +++ b/vp9/common/x86/vp9_loopfilter_x86.h @@ -23,10 +23,6 @@ extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx); extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx); extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx); extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx); #endif #if HAVE_SSE2 @@ -34,10 +30,6 @@ extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2); extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2); extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2); extern 
prototype_loopfilter_block(vp9_loop_filter_bh_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2); #endif #endif // LOOPFILTER_X86_H diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 4be36774a..70db06dc1 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -811,12 +811,12 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { int mi_row, mi_col; for (mi_row = pc->cur_tile_mi_row_start; - mi_row < pc->cur_tile_mi_row_end; mi_row += 8) { + mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) { // For a SB there are 2 left contexts, each pertaining to a MB row within vpx_memset(&pc->left_context, 0, sizeof(pc->left_context)); vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context)); for (mi_col = pc->cur_tile_mi_col_start; - mi_col < pc->cur_tile_mi_col_end; mi_col += 8) + mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64); } } diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index ce2a86b4f..1d7e093cf 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -62,7 +62,7 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); #define INCREMENT_COUNT(token) \ do { \ - coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \ + coef_counts[type][ref][band] \ [pt][token]++; \ token_cache[scan[c]] = token; \ } while (0) @@ -96,6 +96,7 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, ENTROPY_CONTEXT above_ec, left_ec; FRAME_CONTEXT *const fc = &dx->common.fc; int pt, c = 0, pad, default_eob; + int band; vp9_coeff_probs *coef_probs; vp9_prob *prob; vp9_coeff_count *coef_counts; @@ -162,7 +163,6 @@ static int 
decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, while (1) { int val; - int band; const uint8_t *cat6 = cat6_prob; if (c >= seg_eob) break; @@ -249,8 +249,7 @@ SKIP_START: } if (c < seg_eob) - coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] - [pt][DCT_EOB_TOKEN]++; + coef_counts[type][ref][band][pt][DCT_EOB_TOKEN]++; for (pt = 0; pt < (1 << txfm_size); pt++) { A[pt] = L[pt] = c > 0; diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 6bc42c7ff..44261481c 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -138,8 +138,8 @@ struct macroblock { int optimize; - // Structure to hold context for each of the 4 MBs within a SB: - // when encoded as 4 independent MBs: + // TODO(jingning): Need to refactor the structure arrays that buffers the + // coding mode decisions of each partition type. PICK_MODE_CONTEXT sb8_context[4][4][4]; PICK_MODE_CONTEXT sb8x16_context[4][4][2]; PICK_MODE_CONTEXT sb16x8_context[4][4][2]; @@ -153,6 +153,10 @@ struct macroblock { PICK_MODE_CONTEXT sb64_context; int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; + BLOCK_SIZE_TYPE mb_partitioning[4][4]; + BLOCK_SIZE_TYPE sb_partitioning[4]; + BLOCK_SIZE_TYPE sb64_partitioning; + void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch); void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch); void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 418f60edc..3345f8965 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -731,6 +731,9 @@ static void set_block_index(MACROBLOCKD *xd, int idx, } } +// TODO(jingning): the variables used here are little complicated. need further +// refactoring on organizing the the temporary buffers, when recursive +// partition down to 4x4 block size is enabled. 
static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD *const xd = &x->e_mbd; @@ -762,6 +765,72 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, } } +static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { + switch (subsize) { + case BLOCK_SIZE_SB64X32: + case BLOCK_SIZE_SB32X64: + case BLOCK_SIZE_SB32X32: + return &xd->sb_index; + case BLOCK_SIZE_SB32X16: + case BLOCK_SIZE_SB16X32: + case BLOCK_SIZE_MB16X16: + return &xd->mb_index; + case BLOCK_SIZE_SB16X8: + case BLOCK_SIZE_SB8X16: + case BLOCK_SIZE_SB8X8: + return &xd->b_index; + default: + assert(0); + return NULL; + } +} + +static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x, + BLOCK_SIZE_TYPE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + switch (bsize) { + case BLOCK_SIZE_SB64X64: + return &x->sb64_partitioning; + case BLOCK_SIZE_SB32X32: + return &x->sb_partitioning[xd->sb_index]; + case BLOCK_SIZE_MB16X16: + return &x->mb_partitioning[xd->sb_index][xd->mb_index]; + default: + assert(0); + return NULL; + } +} + +static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, + ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], + PARTITION_CONTEXT sa[8], + PARTITION_CONTEXT sl[8], + BLOCK_SIZE_TYPE bsize) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + int p; + int bwl = b_width_log2(bsize), bw = 1 << bwl; + int bhl = b_height_log2(bsize), bh = 1 << bhl; + int mwl = mi_width_log2(bsize), mw = 1 << mwl; + int mhl = mi_height_log2(bsize), mh = 1 << mhl; + for (p = 0; p < MAX_MB_PLANE; p++) { + vpx_memcpy(cm->above_context[p] + + ((mi_col * 2) >> xd->plane[p].subsampling_x), + a + bw * p, + sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); + vpx_memcpy(cm->left_context[p] + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + l + bh * p, + sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); + } + 
vpx_memcpy(cm->above_seg_context + mi_col, sa, + sizeof(PARTITION_CONTEXT) * mw); + vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl, + sizeof(PARTITION_CONTEXT) * mh); +} + static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) { @@ -788,27 +857,28 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, - BLOCK_SIZE_TYPE level, - BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4], - BLOCK_SIZE_TYPE c3[4][4] - ) { + BLOCK_SIZE_TYPE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - const int bsl = mi_width_log2(level), bs = 1 << (bsl - 1); - const int bwl = mi_width_log2(c1), bhl = mi_height_log2(c1); + BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8; + const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1); + int bwl, bhl; int UNINITIALIZED_IS_SAFE(pl); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (level > BLOCK_SIZE_SB8X8) { + if (bsize > BLOCK_SIZE_SB8X8) { set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, level); + pl = partition_plane_context(xd, bsize); + c1 = *(get_sb_partitioning(x, bsize)); } + bwl = mi_width_log2(c1), bhl = mi_height_log2(c1); + if (bsl == bwl && bsl == bhl) { - if (output_enabled && level > BLOCK_SIZE_SB8X8) + if (output_enabled && bsize > BLOCK_SIZE_SB8X8) cpi->partition_count[pl][PARTITION_NONE]++; encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); } else if (bsl == bhl && bsl > bwl) { @@ -826,12 +896,12 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int i; assert(bwl < bsl && bhl < bsl); - if (level == BLOCK_SIZE_SB64X64) { + if (bsize == BLOCK_SIZE_SB64X64) { subsize = BLOCK_SIZE_SB32X32; - } else if (level == BLOCK_SIZE_SB32X32) { + } else if (bsize == BLOCK_SIZE_SB32X32) { subsize = BLOCK_SIZE_MB16X16; } else { - assert(level == 
BLOCK_SIZE_MB16X16); + assert(bsize == BLOCK_SIZE_MB16X16); subsize = BLOCK_SIZE_SB8X8; } @@ -843,554 +913,200 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, set_block_index(xd, i, subsize); encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, - output_enabled, subsize, - c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL); + output_enabled, subsize); } } - if (level > BLOCK_SIZE_SB8X8 && - (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) { + if (bsize > BLOCK_SIZE_SB8X8 && + (bsize == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) { set_partition_seg_context(cpi, mi_row, mi_col); - update_partition_context(xd, c1, level); + update_partition_context(xd, c1, bsize); } } -static void encode_sb_row(VP9_COMP *cpi, - int mi_row, - TOKENEXTRA **tp, - int *totalrate) { + +// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are +// unlikely to be selected depending on previously rate-distortion optimization +// results, for encoding speed-up. +static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, + int mi_row, int mi_col, + BLOCK_SIZE_TYPE bsize, + int *rate, int *dist) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - int mi_col, pl; - - // Initialize the left context for the new SB row - vpx_memset(&cm->left_context, 0, sizeof(cm->left_context)); - vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); - - // Code each SB in the row - for (mi_col = cm->cur_tile_mi_col_start; - mi_col < cm->cur_tile_mi_col_end; mi_col += 8) { - int i, p; - BLOCK_SIZE_TYPE mb_partitioning[4][4]; - BLOCK_SIZE_TYPE sb_partitioning[4]; - BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32; - int sb64_rate = 0, sb64_dist = 0; - int sb64_skip = 0; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT seg_l[64 / MI_SIZE], seg_a[64 / MI_SIZE]; - TOKENEXTRA *tp_orig = *tp; - - for (p = 0; p < MAX_MB_PLANE; p++) { - memcpy(a + 16 * p, 
cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x); - memcpy(l + 16 * p, cm->left_context[p], - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y); - } - vpx_memcpy(&seg_a, cm->above_seg_context + mi_col, sizeof(seg_a)); - vpx_memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l)); + int bsl = b_width_log2(bsize), bs = 1 << bsl; + int msl = mi_height_log2(bsize), ms = 1 << msl; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + PARTITION_CONTEXT sl[8], sa[8]; + TOKENEXTRA *tp_orig = *tp; + int i, p, pl; + BLOCK_SIZE_TYPE subsize; + int srate = INT_MAX, sdist = INT_MAX; + + assert(mi_height_log2(bsize) == mi_width_log2(bsize)); + + // buffer the above/left context information of the block in search. + for (p = 0; p < MAX_MB_PLANE; ++p) { + vpx_memcpy(a + bs * p, cm->above_context[p] + + (mi_col * 2 >> xd->plane[p].subsampling_x), + sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_x); + vpx_memcpy(l + bs * p, cm->left_context[p] + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_y); + } + vpx_memcpy(sa, cm->above_seg_context + mi_col, + sizeof(PARTITION_CONTEXT) * ms); + vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK), + sizeof(PARTITION_CONTEXT) * ms); + + // PARTITION_SPLIT + if (bsize >= BLOCK_SIZE_MB16X16) { + int r4 = 0, d4 = 0; + subsize = get_subsize(bsize, PARTITION_SPLIT); + *(get_sb_partitioning(x, bsize)) = subsize; + + for (i = 0; i < 4; ++i) { + int x_idx = (i & 1) * (ms >> 1); + int y_idx = (i >> 1) * (ms >> 1); + int r, d; - // FIXME(rbultje): this function should probably be rewritten to be - // recursive at some point in the future. 
- for (i = 0; i < 4; i++) { - const int x_idx = (i & 1) << 2; - const int y_idx = (i & 2) << 1; - int sb32_rate = 0, sb32_dist = 0; - int splitmodes_used = 0; - int sb32_skip = 0; - int j; - ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl32[32 / MI_SIZE], sa32[32 / MI_SIZE]; - - sb_partitioning[i] = BLOCK_SIZE_MB16X16; - if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - xd->sb_index = i; - - /* Function should not modify L & A contexts; save and restore on exit */ - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(l2 + 8 * p, - cm->left_context[p] + - (y_idx * 2 >> xd->plane[p].subsampling_y), - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y); - vpx_memcpy(a2 + 8 * p, - cm->above_context[p] + - ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x); - } - vpx_memcpy(&sa32, cm->above_seg_context + mi_col + x_idx, sizeof(sa32)); - vpx_memcpy(&sl32, cm->left_seg_context + y_idx, sizeof(sl32)); - - /* Encode MBs in raster order within the SB */ - for (j = 0; j < 4; j++) { - const int x_idx_m = x_idx + ((j & 1) << 1); - const int y_idx_m = y_idx + ((j >> 1) << 1); - int r, d; - int r2, d2, mb16_rate = 0, mb16_dist = 0, k; - ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl16[16 / MI_SIZE], sa16[16 / MI_SIZE]; - - mb_partitioning[i][j] = BLOCK_SIZE_SB8X8; - - if (mi_row + y_idx_m >= cm->mi_rows || - mi_col + x_idx_m >= cm->mi_cols) { - // MB lies outside frame, move on - continue; - } - - // Index of the MB in the SB 0..3 - xd->mb_index = j; - - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(l3 + 4 * p, - cm->left_context[p] + - (y_idx_m * 2 >> xd->plane[p].subsampling_y), - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y); - vpx_memcpy(a3 + 4 * p, - cm->above_context[p] + - ((mi_col + x_idx_m) * 2 >> 
xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x); - } - vpx_memcpy(&sa16, cm->above_seg_context + mi_col + x_idx_m, - sizeof(sa16)); - vpx_memcpy(&sl16, cm->left_seg_context + y_idx_m, sizeof(sl16)); - - for (k = 0; k < 4; k++) { - xd->b_index = k; - - // try 8x8 coding - pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1), - mi_col + x_idx_m + (k & 1), - tp, &r, &d, BLOCK_SIZE_SB8X8, - &x->sb8_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - mb16_rate += r; - mb16_dist += d; - update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index] - [xd->b_index], - BLOCK_SIZE_SB8X8, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx_m + (k >> 1), - mi_col + x_idx_m + (k & 1), - BLOCK_SIZE_SB8X8); - } - set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m); - pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16); - mb16_rate += x->partition_cost[pl][PARTITION_SPLIT]; - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx_m * 2 >> xd->plane[p].subsampling_y), - l3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x), - a3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x); - } - vpx_memcpy(cm->above_seg_context + mi_col + x_idx_m, - sa16, sizeof(sa16)); - vpx_memcpy(cm->left_seg_context + y_idx_m, sl16, sizeof(sl16)); - - // try 8x16 coding - r2 = 0; - d2 = 0; - xd->b_index = 0; - pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m, - tp, &r, &d, BLOCK_SIZE_SB8X16, - &x->sb8x16_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - r2 += r; - d2 += d; - update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index] - [xd->b_index], - BLOCK_SIZE_SB8X16, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx_m, mi_col + x_idx_m, - BLOCK_SIZE_SB8X16); - xd->b_index = 1; - pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1, - tp, &r, &d, BLOCK_SIZE_SB8X16, - 
&x->sb8x16_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - r2 += r; - d2 += d; - set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m); - pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16); - r2 += x->partition_cost[pl][PARTITION_VERT]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) { - mb16_rate = r2; - mb16_dist = d2; - mb_partitioning[i][j] = BLOCK_SIZE_SB8X16; - } - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx_m * 2 >> xd->plane[p].subsampling_y), - l3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x), - a3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x); - } - - // try 16x8 coding - r2 = 0; - d2 = 0; - xd->b_index = 0; - pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m, - tp, &r, &d, BLOCK_SIZE_SB16X8, - &x->sb16x8_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - r2 += r; - d2 += d; - update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index] - [xd->b_index], - BLOCK_SIZE_SB16X8, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx_m, mi_col + x_idx_m, - BLOCK_SIZE_SB16X8); - xd->b_index = 1; - pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m, - tp, &r, &d, BLOCK_SIZE_SB16X8, - &x->sb16x8_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - r2 += r; - d2 += d; - set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m); - pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16); - r2 += x->partition_cost[pl][PARTITION_HORZ]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) { - mb16_rate = r2; - mb16_dist = d2; - mb_partitioning[i][j] = BLOCK_SIZE_SB16X8; - } - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx_m * 2 >> xd->plane[p].subsampling_y), - l3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> 
xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x), - a3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x); - } - - // try as 16x16 - pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m, - tp, &r, &d, BLOCK_SIZE_MB16X16, - &x->mb_context[xd->sb_index][xd->mb_index]); - set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m); - pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16); - r += x->partition_cost[pl][PARTITION_NONE]; - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) { - mb16_rate = r; - mb16_dist = d; - mb_partitioning[i][j] = BLOCK_SIZE_MB16X16; - } - sb32_rate += mb16_rate; - sb32_dist += mb16_dist; - - // Dummy encode, do not do the tokenization - encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0, - BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL); - } - - /* Restore L & A coding context to those in place on entry */ - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx * 2 >> xd->plane[p].subsampling_y), - l2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x), - a2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x); - } - // restore partition information context - vpx_memcpy(cm->above_seg_context + mi_col + x_idx, sa32, sizeof(sa32)); - vpx_memcpy(cm->left_seg_context + y_idx, sl32, sizeof(sl32)); - - set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx); - pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32); - sb32_rate += x->partition_cost[pl][PARTITION_SPLIT]; - - if (cpi->sf.splitmode_breakout) { - sb32_skip = splitmodes_used; - sb64_skip += splitmodes_used; - } - - // check 32x16 - if (mi_col + x_idx + 4 <= cm->mi_cols) { - int r, d; - - xd->mb_index = 0; - pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, - tp, &r, &d, 
BLOCK_SIZE_SB32X16, - &x->sb32x16_context[xd->sb_index][xd->mb_index]); - if (mi_row + y_idx + 2 < cm->mi_rows) { - int r2, d2; - - update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index], - BLOCK_SIZE_SB32X16, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx, mi_col + x_idx, - BLOCK_SIZE_SB32X16); - xd->mb_index = 1; - pick_sb_modes(cpi, mi_row + y_idx + 2, - mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16, - &x->sb32x16_context[xd->sb_index][xd->mb_index]); - r += r2; - d += d2; - } - - set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx); - pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32); - r += x->partition_cost[pl][PARTITION_HORZ]; - - /* is this better than MB coding? */ - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) { - sb32_rate = r; - sb32_dist = d; - sb_partitioning[i] = BLOCK_SIZE_SB32X16; - } - - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx * 2 >> xd->plane[p].subsampling_y), - l2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x), - a2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x); - } - } - - // check 16x32 - if (mi_row + y_idx + 4 <= cm->mi_rows) { - int r, d; - - xd->mb_index = 0; - pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, - tp, &r, &d, BLOCK_SIZE_SB16X32, - &x->sb16x32_context[xd->sb_index][xd->mb_index]); - if (mi_col + x_idx + 2 < cm->mi_cols) { - int r2, d2; - - update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index], - BLOCK_SIZE_SB16X32, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx, mi_col + x_idx, - BLOCK_SIZE_SB16X32); - xd->mb_index = 1; - pick_sb_modes(cpi, mi_row + y_idx, - mi_col + x_idx + 2, - tp, &r2, &d2, BLOCK_SIZE_SB16X32, - &x->sb16x32_context[xd->sb_index][xd->mb_index]); - r += r2; - d += d2; - } - - set_partition_seg_context(cpi, mi_row + y_idx, mi_col + 
x_idx); - pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32); - r += x->partition_cost[pl][PARTITION_VERT]; - - /* is this better than MB coding? */ - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) { - sb32_rate = r; - sb32_dist = d; - sb_partitioning[i] = BLOCK_SIZE_SB16X32; - } - - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx * 2 >> xd->plane[p].subsampling_y), - l2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x), - a2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x); - } - } - - if (!sb32_skip && - mi_col + x_idx + 4 <= cm->mi_cols && - mi_row + y_idx + 4 <= cm->mi_rows) { - int r, d; - - /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */ - pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, - tp, &r, &d, BLOCK_SIZE_SB32X32, - &x->sb32_context[xd->sb_index]); - - set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx); - pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32); - r += x->partition_cost[pl][PARTITION_NONE]; - - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) { - sb32_rate = r; - sb32_dist = d; - sb_partitioning[i] = BLOCK_SIZE_SB32X32; - } - } - - // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled). - if (cpi->sf.mb16_breakout && sb_partitioning[i] != BLOCK_SIZE_SB32X32) { - ++sb64_skip; - } - - sb64_rate += sb32_rate; - sb64_dist += sb32_dist; - - /* Encode SB using best computed mode(s) */ - // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb - // for each level that we go up, we can just keep tokens and recon - // pixels of the lower level; also, inverting SB/MB order (big->small - // instead of small->big) means we can use as threshold for small, which - // may enable breakouts if RD is not good enough (i.e. 
faster) - encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0, - BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i], - NULL); + *(get_sb_index(xd, subsize)) = i; + rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &r, &d); + r4 += r; + d4 += d; } - - for (p = 0; p < MAX_MB_PLANE; p++) { - memcpy(cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - a + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x); - memcpy(cm->left_context[p], l + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y); - } - memcpy(cm->above_seg_context + mi_col, &seg_a, sizeof(seg_a)); - memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l)); - set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); - sb64_rate += x->partition_cost[pl][PARTITION_SPLIT]; - - // check 64x32 - if (mi_col + 8 <= cm->mi_cols && !(cm->mb_rows & 1)) { - int r, d; - - xd->sb_index = 0; - pick_sb_modes(cpi, mi_row, mi_col, - tp, &r, &d, BLOCK_SIZE_SB64X32, - &x->sb64x32_context[xd->sb_index]); - if (mi_row + 4 != cm->mi_rows) { - int r2, d2; - - update_state(cpi, &x->sb64x32_context[xd->sb_index], - BLOCK_SIZE_SB64X32, 0); - encode_superblock(cpi, tp, - 0, mi_row, mi_col, BLOCK_SIZE_SB64X32); - xd->sb_index = 1; - pick_sb_modes(cpi, mi_row + 4, mi_col, - tp, &r2, &d2, BLOCK_SIZE_SB64X32, - &x->sb64x32_context[xd->sb_index]); - r += r2; - d += d2; - } + pl = partition_plane_context(xd, bsize); + r4 += x->partition_cost[pl][PARTITION_SPLIT]; - set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); - r += x->partition_cost[pl][PARTITION_HORZ]; - - /* is this better than MB coding? 
*/ - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) { - sb64_rate = r; - sb64_dist = d; - sb64_partitioning = BLOCK_SIZE_SB64X32; - } + srate = r4; + sdist = d4; + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - for (p = 0; p < MAX_MB_PLANE; p++) { - memcpy(cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - a + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x); - memcpy(cm->left_context[p], l + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y); - } + // PARTITION_HORZ + if ((mi_col + ms <= cm->mi_cols) && (mi_row + (ms >> 1) <= cm->mi_rows) && + (bsize >= BLOCK_SIZE_MB16X16)) { + int r2, d2; + int mb_skip = 0; + subsize = get_subsize(bsize, PARTITION_HORZ); + *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, + get_block_context(x, subsize)); + + if (mi_row + ms <= cm->mi_rows) { + int r, d; + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize, + get_block_context(x, subsize)); + r2 += r; + d2 += d; + } else { + if (mi_row + (ms >> 1) != cm->mi_rows) + mb_skip = 1; } + set_partition_seg_context(cpi, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + r2 += x->partition_cost[pl][PARTITION_HORZ]; + + if ((RDCOST(x->rdmult, x->rddiv, r2, d2) < + RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) { + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - // check 32x64 - if (mi_row + 8 <= cm->mi_rows && !(cm->mb_cols & 1)) { + // PARTITION_VERT + if ((mi_row + ms <= cm->mi_rows) && (mi_col + (ms >> 1) <= cm->mi_cols) && + (bsize >= BLOCK_SIZE_MB16X16)) { + int r2, d2; + int mb_skip = 0; + subsize = get_subsize(bsize, PARTITION_VERT); 
+ *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, + get_block_context(x, subsize)); + if (mi_col + ms <= cm->mi_cols) { int r, d; + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize, + get_block_context(x, subsize)); + r2 += r; + d2 += d; + } else { + if (mi_col + (ms >> 1) != cm->mi_cols) + mb_skip = 1; + } + set_partition_seg_context(cpi, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + r2 += x->partition_cost[pl][PARTITION_VERT]; + + if ((RDCOST(x->rdmult, x->rddiv, r2, d2) < + RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) { + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - xd->sb_index = 0; - pick_sb_modes(cpi, mi_row, mi_col, - tp, &r, &d, BLOCK_SIZE_SB32X64, - &x->sb32x64_context[xd->sb_index]); - if (mi_col + 4 != cm->mi_cols) { - int r2, d2; - - update_state(cpi, &x->sb32x64_context[xd->sb_index], - BLOCK_SIZE_SB32X64, 0); - encode_superblock(cpi, tp, - 0, mi_row, mi_col, BLOCK_SIZE_SB32X64); - xd->sb_index = 1; - pick_sb_modes(cpi, mi_row, mi_col + 4, - tp, &r2, &d2, BLOCK_SIZE_SB32X64, - &x->sb32x64_context[xd->sb_index]); - r += r2; - d += d2; - } - + // PARTITION_NONE + if (mi_row + ms <= cm->mi_rows && mi_col + ms <= cm->mi_cols) { + int r, d; + pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, + get_block_context(x, bsize)); + if (bsize >= BLOCK_SIZE_MB16X16) { set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); - r += x->partition_cost[pl][PARTITION_VERT]; - - /* is this better than MB coding? 
*/ - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) { - sb64_rate = r; - sb64_dist = d; - sb64_partitioning = BLOCK_SIZE_SB32X64; - } + pl = partition_plane_context(xd, bsize); + r += x->partition_cost[pl][PARTITION_NONE]; + } - for (p = 0; p < MAX_MB_PLANE; p++) { - memcpy(cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - a + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x); - memcpy(cm->left_context[p], l + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y); - } + if (RDCOST(x->rdmult, x->rddiv, r, d) < + RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + srate = r; + sdist = d; + if (bsize >= BLOCK_SIZE_MB16X16) + *(get_sb_partitioning(x, bsize)) = bsize; } + } - if (!sb64_skip && - mi_col + 8 <= cm->mi_cols && - mi_row + 8 <= cm->mi_rows) { - int r, d; + assert(srate < INT_MAX && sdist < INT_MAX); + *rate = srate; + *dist = sdist; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, - BLOCK_SIZE_SB64X64, &x->sb64_context); + encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); - set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); - r += x->partition_cost[pl][PARTITION_NONE]; + if (bsize == BLOCK_SIZE_SB64X64) + assert(tp_orig < *tp); + else + assert(tp_orig == *tp); +} - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) { - sb64_rate = r; - sb64_dist = d; - sb64_partitioning = BLOCK_SIZE_SB64X64; - } - } +static void encode_sb_row(VP9_COMP *cpi, int mi_row, + TOKENEXTRA **tp, int *totalrate) { + VP9_COMMON *const cm = &cpi->common; + int mi_col; - assert(tp_orig == *tp); - encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64, - sb64_partitioning, sb_partitioning, mb_partitioning); - assert(tp_orig < *tp); + // Initialize the left context for the new SB row + vpx_memset(&cm->left_context, 0, sizeof(cm->left_context)); + 
vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); + + // Code each SB in the row + for (mi_col = cm->cur_tile_mi_col_start; + mi_col < cm->cur_tile_mi_col_end; mi_col += 8) { + int dummy_rate, dummy_dist; + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist); } } @@ -1559,9 +1275,8 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_get_tile_col_offsets(cm, tile_col); for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; - mi_row += 8) { + mi_row += 8) encode_sb_row(cpi, mi_row, &tp, &totalrate); - } cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index ff0725fd0..72238514a 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -632,7 +632,7 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, BLOCK_SIZE_MB16X16); - vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16); + vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16); sum_mvr += mv.as_mv.row; sum_mvr_abs += abs(mv.as_mv.row); sum_mvc += mv.as_mv.col; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 74caba5a0..aff5637e1 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -413,6 +413,201 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, return besterr; } + +#if CONFIG_COMP_INTER_JOINT_SEARCH +#undef DIST +/* returns subpixel variance error function */ +#define DIST(r, c) \ + vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \ + z, src_stride, &sse, second_pred) + +int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1, + const uint8_t *second_pred, int w, int h) { + uint8_t *z = x->plane[0].src.buf; + int src_stride = 
x->plane[0].src.stride; + MACROBLOCKD *xd = &x->e_mbd; + + int rr, rc, br, bc, hstep; + int tr, tc; + unsigned int besterr = INT_MAX; + unsigned int left, right, up, down, diag; + unsigned int sse; + unsigned int whichdir; + unsigned int halfiters = 4; + unsigned int quarteriters = 4; + unsigned int eighthiters = 4; + int thismse; + int maxc, minc, maxr, minr; + int y_stride; + int offset; + int usehp = xd->allow_high_precision_mv; + + uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t)); + uint8_t *y = xd->plane[0].pre[0].buf + + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + + bestmv->as_mv.col; + + y_stride = xd->plane[0].pre[0].stride; + + rr = ref_mv->as_mv.row; + rc = ref_mv->as_mv.col; + br = bestmv->as_mv.row << 3; + bc = bestmv->as_mv.col << 3; + hstep = 4; + minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - + ((1 << MV_MAX_BITS) - 1)); + maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + + ((1 << MV_MAX_BITS) - 1)); + minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - + ((1 << MV_MAX_BITS) - 1)); + maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + + ((1 << MV_MAX_BITS) - 1)); + + tr = br; + tc = bc; + + + offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + + // central mv + bestmv->as_mv.row <<= 3; + bestmv->as_mv.col <<= 3; + + // calculate central point error + // TODO(yunqingwang): central pointer error was already calculated in full- + // pixel search, and can be passed in this function. 
+ comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, + error_per_bit, xd->allow_high_precision_mv); + + // Each subsequent iteration checks at least one point in + // common with the last iteration could be 2 ( if diag selected) + while (--halfiters) { + // 1/2 pel + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(down, tr + hstep, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) { + case 0: + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + } + + // no reason to check the same one again. + if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + hstep >>= 1; + while (--quarteriters) { + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(down, tr + hstep, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) { + case 0: + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + } + + // no reason to check the same one again. 
+ if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + + if (xd->allow_high_precision_mv) { + usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + } else { + usehp = 0; + } + + if (usehp) { + hstep >>= 1; + while (--eighthiters) { + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(down, tr + hstep, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) { + case 0: + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + } + + // no reason to check the same one again. + if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + } + bestmv->as_mv.row = br; + bestmv->as_mv.col = bc; + + vpx_free(comp_pred); + + if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + return INT_MAX; + + return besterr; +} +#endif // CONFIG_COMP_INTER_JOINT_SEARCH + #undef MVC #undef PRE #undef DIST @@ -2132,7 +2327,109 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, return INT_MAX; } +#if CONFIG_COMP_INTER_JOINT_SEARCH +/* This function is called when we do joint motion search in comp_inter_inter + * mode. 
+ */ +int vp9_refining_search_8p_c(MACROBLOCK *x, + int_mv *ref_mv, int error_per_bit, + int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], int_mv *center_mv, + const uint8_t *second_pred, int w, int h) { + const MACROBLOCKD* const xd = &x->e_mbd; + MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, + {-1, -1}, {1, -1}, {-1, 1}, {1, 1}}; + int i, j; + int this_row_offset, this_col_offset; + int what_stride = x->plane[0].src.stride; + int in_what_stride = xd->plane[0].pre[0].stride; + uint8_t *what = x->plane[0].src.buf; + uint8_t *best_address = xd->plane[0].pre[0].buf + + (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) + + ref_mv->as_mv.col; + uint8_t *check_here; + unsigned int thissad; + int_mv this_mv; + unsigned int bestsad = INT_MAX; + int_mv fcenter_mv; + + int *mvjsadcost = x->nmvjointsadcost; + int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; + + /* Compound pred buffer */ + uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t)); + + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; + fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + + /* Get compound pred by averaging two pred blocks. */ + comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride); + + bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) + + mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + + for (i = 0; i < search_range; i++) { + int best_site = -1; + + for (j = 0; j < 8; j++) { + this_row_offset = ref_mv->as_mv.row + neighbors[j].row; + this_col_offset = ref_mv->as_mv.col + neighbors[j].col; + + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + + best_address; + + /* Get compound block and use it to calculate SAD. 
*/ + comp_avg_pred(comp_pred, second_pred, w, h, check_here, + in_what_stride); + thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad); + + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, + mvsadcost, error_per_bit); + + if (thissad < bestsad) { + bestsad = thissad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + ref_mv->as_mv.row += neighbors[best_site].row; + ref_mv->as_mv.col += neighbors[best_site].col; + best_address += (neighbors[best_site].row) * in_what_stride + + neighbors[best_site].col; + } + } + + this_mv.as_mv.row = ref_mv->as_mv.row << 3; + this_mv.as_mv.col = ref_mv->as_mv.col << 3; + + if (bestsad < INT_MAX) { + int besterr; + comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride); + besterr = fn_ptr->vf(what, what_stride, comp_pred, w, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, + xd->allow_high_precision_mv); + vpx_free(comp_pred); + return besterr; + } else { + vpx_free(comp_pred); + return INT_MAX; + } +} +#endif // CONFIG_COMP_INTER_JOINT_SEARCH #ifdef ENTROPY_STATS void print_mode_context(VP9_COMMON *pc) { diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index e1ba7fd9d..cdbd29aa5 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -79,5 +79,21 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x, int *mvjcost, int *mvcost[2], int_mv *center_mv); +#if CONFIG_COMP_INTER_JOINT_SEARCH +int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, + int w, int h); +int vp9_refining_search_8p_c(MACROBLOCK *x, + int_mv *ref_mv, int error_per_bit, + int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int 
*mvjcost, int *mvcost[2], + int_mv *center_mv, const uint8_t *second_pred, + int w, int h); +#endif // CONFIG_COMP_INTER_JOINT_SEARCH #endif // VP9_ENCODER_VP9_MCOMP_H_ diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index e55f5551f..610d7330b 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -1527,10 +1527,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { for (i = 0; i < MAX_MODES; i++) cpi->rd_thresh_mult[i] = 128; -#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \ +#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \ cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \ cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \ @@ -1539,57 +1540,64 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16, - NULL, NULL, + vp9_sub_pixel_avg_variance32x16, NULL, NULL, NULL, NULL, NULL, vp9_sad32x16x4d) BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32, - NULL, NULL, + vp9_sub_pixel_avg_variance16x32, NULL, NULL, NULL, NULL, NULL, vp9_sad16x32x4d) BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32, - NULL, NULL, + vp9_sub_pixel_avg_variance64x32, NULL, NULL, NULL, NULL, NULL, vp9_sad64x32x4d) BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64, - NULL, NULL, + vp9_sub_pixel_avg_variance32x64, NULL, NULL, NULL, NULL, NULL, vp9_sad32x64x4d) BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32, - vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v, + vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h, + vp9_variance_halfpixvar32x32_v, vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8, 
vp9_sad32x32x4d) BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64, - vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v, + vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h, + vp9_variance_halfpixvar64x64_v, vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8, vp9_sad64x64x4d) BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16, - vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v, - vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8, - vp9_sad16x16x4d) + vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h, + vp9_variance_halfpixvar16x16_v, + vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8, + vp9_sad16x16x4d) BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8, - NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) + vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL, + vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16, - NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) + vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL, + vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8, - NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) + vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL, + vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL, - NULL, NULL, NULL, NULL, NULL, NULL) + NULL, NULL, NULL, NULL, NULL, NULL, NULL) BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL, - NULL, NULL, NULL, NULL, NULL, NULL) + NULL, NULL, NULL, NULL, NULL, NULL, NULL) BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4, - NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) + vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL, + vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) cpi->full_search_sad = 
vp9_full_search_sad; cpi->diamond_search_sad = vp9_diamond_search_sad; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 1b143f5e0..48356931a 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1069,9 +1069,7 @@ typedef struct { B_PREDICTION_MODE modes[4]; int_mv mvs[4], second_mvs[4]; int eobs[4]; - int mvthresh; - int *mdcounts; } BEST_SEG_INFO; static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { @@ -1322,7 +1320,6 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, int_mv *second_best_ref_mv, int64_t best_rd, - int *mdcounts, int *returntotrate, int *returnyrate, int *returndistortion, @@ -1339,7 +1336,6 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, bsi.second_ref_mv = second_best_ref_mv; bsi.mvp.as_int = best_ref_mv->as_int; bsi.mvthresh = mvthresh; - bsi.mdcounts = mdcounts; for (i = 0; i < 4; i++) bsi.modes[i] = ZERO4X4; @@ -1612,7 +1608,6 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], - int frame_mdcounts[4][4], struct buf_2d yv12_mb[4][MAX_MB_PLANE], struct scale_factors scale[MAX_REF_FRAMES]) { VP9_COMMON *cm = &cpi->common; @@ -1797,7 +1792,7 @@ static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) { static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, - int mdcounts[4], int64_t txfm_cache[], + int64_t txfm_cache[], int *rate2, int *distortion, int *skippable, int *compmode_cost, int *rate_y, int *distortion_y, @@ -1807,8 +1802,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, INTERPOLATIONFILTERTYPE *best_filter, int_mv frame_mv[MB_MODE_COUNT] [MAX_REF_FRAMES], - YV12_BUFFER_CONFIG *scaled_ref_frame, - int mi_row, int mi_col) { + YV12_BUFFER_CONFIG **scaled_ref_frame, + int mi_row, int mi_col, + int_mv single_newmv[MAX_REF_FRAMES]) { const int bw = 1 << 
mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize); VP9_COMMON *cm = &cpi->common; @@ -1838,6 +1834,152 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, ref_mv[1] = mbmi->ref_mvs[refs[1]][0]; if (is_comp_pred) { +#if CONFIG_COMP_INTER_JOINT_SEARCH + const int b_sz[BLOCK_SIZE_TYPES][2] = { + {4, 4}, + {8, 8}, + {8, 16}, + {16, 8}, + {16, 16}, + {16, 32}, + {32, 16}, + {32, 32}, + {32, 64}, + {64, 32}, + {64, 64} + }; + + int ite; + // Prediction buffer from second frame. + uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] * + b_sz[bsize][1] * sizeof(uint8_t)); + + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; + struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}}; + struct buf_2d scaled_first_yv12; + + if (scaled_ref_frame[0]) { + int i; + + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[0]; + + setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col, + NULL, NULL); + } + + if (scaled_ref_frame[1]) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) + backup_second_yv12[i] = xd->plane[i].pre[1]; + + setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col, + NULL, NULL); + } + xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0], + mi_row, mi_col); + xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1], + mi_row, mi_col); + + scaled_first_yv12 = xd->plane[0].pre[0]; + + // Initialize mv using single prediction mode result. + frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int; + frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int; + + // Iteration: joint search is done once for each ref frame. 
+ // Tried allowing search multiple times iteratively, and break out if + // it couldn't find better mv. But tests didn't show noticeable + // improvement. + for (ite = 0; ite < 2; ite++) { + struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0], + xd->plane[0].pre[1]}; + int bestsme = INT_MAX; + int sadpb = x->sadperbit16; + int_mv tmp_mv; + int search_range = 3; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + int id = ite % 2; + + // Get pred block from second frame. + vp9_build_inter_predictor(ref_yv12[!id].buf, + ref_yv12[!id].stride, + second_pred, b_sz[bsize][0], + &frame_mv[NEWMV][refs[!id]], + &xd->scale_factor[!id], + b_sz[bsize][0], b_sz[bsize][1], 0, + &xd->subpix); + + // Compound motion search on first ref frame. + if (id) + xd->plane[0].pre[0] = ref_yv12[id]; + vp9_clamp_mv_min_max(x, &ref_mv[id]); + + // Use mv result from single mode as mvp. + tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int; + + tmp_mv.as_mv.col >>= 3; + tmp_mv.as_mv.row >>= 3; + + // Small-range full-pixel motion search + bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb, + search_range, + &cpi->fn_ptr[block_size], + x->nmvjointcost, x->mvcost, + &ref_mv[id], second_pred, + b_sz[bsize][0], b_sz[bsize][1]); + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + + if (bestsme < INT_MAX) { + int dis; /* TODO: use dis in distortion calculation later. 
*/ + unsigned int sse; + + vp9_find_best_sub_pixel_comp(x, &tmp_mv, + &ref_mv[id], + x->errorperbit, + &cpi->fn_ptr[block_size], + x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, + b_sz[bsize][0], b_sz[bsize][1]); + } + + frame_mv[NEWMV][refs[id]].as_int = + xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int; + if (id) + xd->plane[0].pre[0] = scaled_first_yv12; + } + + // restore the predictor + if (scaled_ref_frame[0]) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + + if (scaled_ref_frame[1]) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = backup_second_yv12[i]; + } + + vpx_free(second_pred); +#endif // CONFIG_COMP_INTER_JOINT_SEARCH + if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV || frame_mv[NEWMV][refs[1]].as_int == INVALID_MV) return INT64_MAX; @@ -1862,7 +2004,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int tmp_row_min = x->mv_row_min; int tmp_row_max = x->mv_row_max; - if (scaled_ref_frame) { + if (scaled_ref_frame[0]) { int i; // Swap out the reference frame for a version that's been scaled to @@ -1871,7 +2013,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col, + setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col, NULL, NULL); } @@ -1914,6 +2056,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } frame_mv[NEWMV][refs[0]].as_int = xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int; + single_newmv[refs[0]].as_int = tmp_mv.as_int; // Add the new motion vector cost to our rolling cost variable *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0], @@ -1921,7 +2064,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, 96, xd->allow_high_precision_mv); // restore the predictor, if required - if (scaled_ref_frame) { + if (scaled_ref_frame[0]) { int i; 
for (i = 0; i < MAX_MB_PLANE; i++) @@ -2203,15 +2346,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, unsigned char segment_id = xd->mode_info_context->mbmi.segment_id; int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; - int frame_mdcounts[4][4]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + int_mv single_newmv[MAX_REF_FRAMES]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; int idx_list[4] = {0, cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx}; - int mdcounts[4]; int64_t best_rd = INT64_MAX; int64_t best_txfm_rd[NB_TXFM_MODES]; int64_t best_txfm_diff[NB_TXFM_MODES]; @@ -2251,6 +2393,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, xd->mode_info_context->mbmi.segment_id = segment_id; estimate_ref_frame_costs(cpi, segment_id, ref_costs); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); + vpx_memset(&single_newmv, 0, sizeof(single_newmv)); for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = INT64_MAX; @@ -2293,7 +2436,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->ref_frame_flags & flag_list[ref_frame]) { setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], - frame_mdcounts, yv12_mb, scale_factor); + yv12_mb, scale_factor); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; @@ -2420,8 +2563,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[i].pre[1] = yv12_mb[second_ref][i]; } - vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts)); - // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. 
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && @@ -2519,7 +2660,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &mbmi->ref_mvs[mbmi->ref_frame][0], - second_ref, INT64_MAX, mdcounts, + second_ref, INT64_MAX, &rate, &rate_y, &distortion, &skippable, (int)this_rd_thresh, seg_mvs); @@ -2558,7 +2699,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // switchable list (bilinear, 6-tap) is indicated at the frame level tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &mbmi->ref_mvs[mbmi->ref_frame][0], - second_ref, INT64_MAX, mdcounts, + second_ref, INT64_MAX, &rate, &rate_y, &distortion, &skippable, (int)this_rd_thresh, seg_mvs); @@ -2608,7 +2749,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred); mbmi->mode = this_mode; } else { - YV12_BUFFER_CONFIG *scaled_ref_frame = NULL; + YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL}; int fb; if (mbmi->ref_frame == LAST_FRAME) { @@ -2620,17 +2761,31 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb]) - scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; + scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; + + if (comp_pred) { + if (mbmi->second_ref_frame == LAST_FRAME) { + fb = cpi->lst_fb_idx; + } else if (mbmi->second_ref_frame == GOLDEN_FRAME) { + fb = cpi->gld_fb_idx; + } else { + fb = cpi->alt_fb_idx; + } + + if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb]) + scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; + } this_rd = handle_inter_mode(cpi, x, bsize, - mdcounts, txfm_cache, + txfm_cache, &rate2, &distortion2, &skippable, &compmode_cost, &rate_y, &distortion_y, &rate_uv, &distortion_uv, &mode_excluded, &disable_skip, mode_index, &tmp_best_filter, frame_mv, - scaled_ref_frame, mi_row, mi_col); + scaled_ref_frame, mi_row, mi_col, + 
single_newmv); if (this_rd == INT64_MAX) continue; } diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index 13dabbda4..306476b01 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -12,6 +12,7 @@ #define VP9_ENCODER_VP9_VARIANCE_H_ #include "vpx/vpx_integer.h" +// #include "./vpx_config.h" typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int source_stride, @@ -50,6 +51,15 @@ typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr, int Refstride, unsigned int *sse); +typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr, + int source_stride, + int xoffset, + int yoffset, + const uint8_t *ref_ptr, + int Refstride, + unsigned int *sse, + const uint8_t *second_pred); + typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, @@ -64,15 +74,33 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr, int ref_stride); typedef struct vp9_variance_vtable { - vp9_sad_fn_t sdf; - vp9_variance_fn_t vf; - vp9_subpixvariance_fn_t svf; - vp9_variance_fn_t svf_halfpix_h; - vp9_variance_fn_t svf_halfpix_v; - vp9_variance_fn_t svf_halfpix_hv; - vp9_sad_multi_fn_t sdx3f; - vp9_sad_multi1_fn_t sdx8f; - vp9_sad_multi_d_fn_t sdx4df; + vp9_sad_fn_t sdf; + vp9_variance_fn_t vf; + vp9_subpixvariance_fn_t svf; + vp9_subp_avg_variance_fn_t svaf; + vp9_variance_fn_t svf_halfpix_h; + vp9_variance_fn_t svf_halfpix_v; + vp9_variance_fn_t svf_halfpix_hv; + vp9_sad_multi_fn_t sdx3f; + vp9_sad_multi1_fn_t sdx8f; + vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; +// #if CONFIG_COMP_INTER_JOINT_SEARCH +static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight, + int height, uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < weight; j++) { + int tmp; + tmp = pred[j] + ref[j]; + comp_pred[j] = (tmp + 1) >> 1; + } + comp_pred += weight; + 
pred += weight; + ref += ref_stride; + } +} +// #endif // CONFIG_COMP_INTER_JOINT_SEARCH #endif // VP9_ENCODER_VP9_VARIANCE_H_ diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c index c2a600408..fa53abdec 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance_c.c @@ -13,6 +13,7 @@ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_subpelvar.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; @@ -58,6 +59,29 @@ unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint8_t temp2[68 * 64]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 33, 64, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter); + comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64); + return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -92,6 +116,29 @@ unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + 
unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint8_t temp2[68 * 64]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 65, 32, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter); + comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32); + return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -126,6 +173,29 @@ unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint8_t temp2[36 * 32]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 17, 32, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter); + comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32); + return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -160,6 +230,29 @@ unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, return 
vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering + uint8_t temp2[36 * 32]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 33, 16, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter); + comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16); + return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -317,6 +410,31 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint8_t temp2[20 * 16]; + const int16_t *hfilter, *vfilter; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer + uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + // First filter 1d Horizontal + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 5, 4, hfilter); + + // Now filter vertically + var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, vfilter); + comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4); 
+ return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse); +} unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_pixels_per_line, @@ -339,6 +457,29 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering + uint8_t temp2[20 * 16]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 9, 8, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter); + comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8); + return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, @@ -360,6 +501,30 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[17 * 16]; + uint8_t temp2[20 * 16]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, 
src_pixels_per_line, + 1, 17, 16, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter); + + comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16); + return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, @@ -381,6 +546,29 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint8_t temp2[68 * 64]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 65, 64, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter); + comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64); + return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, @@ -402,6 +590,29 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint8_t temp2[36 * 32]; + 
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 33, 32, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter); + comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32); + return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -543,6 +754,29 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[16 * 9]; // Temp data bufffer used in filtering + uint8_t temp2[20 * 16]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 9, 16, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter); + comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16); + return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, @@ -564,3 +798,25 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, + int 
src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[9 * 16]; // Temp data bufffer used in filtering + uint8_t temp2[20 * 16]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 17, 8, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter); + comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8); + return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse); +} |