23 files changed, 587 insertions, 417 deletions
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_neon.asm
index 4fe1a6ac6..8b4fe5dcc 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.asm
@@ -37,13 +37,14 @@
 |vp9_loop_filter_horizontal_edge_neon| PROC
     push        {lr}
 
-    ldr         r12, [sp,#8]               ; load count
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r12, [sp, #8]              ; load count
+    ldr         r2, [sp, #4]               ; load thresh
     add         r1, r1, r1                 ; double pitch
+
     cmp         r12, #0
     beq         end_vp9_lf_h_edge
 
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r2, [sp, #4]               ; load thresh
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
 
@@ -99,18 +100,18 @@ end_vp9_lf_h_edge
 |vp9_loop_filter_vertical_edge_neon| PROC
     push        {lr}
 
-    ldr         r12, [sp,#8]               ; load count
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    ldr         r12, [sp, #8]             ; load count
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #4]              ; load thresh
+    sub         r2, r0, #4                ; move s pointer down by 4 columns
     cmp         r12, #0
     beq         end_vp9_lf_v_edge
 
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r2, [sp, #4]               ; load thresh
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
 count_lf_v_loop
-    sub         r2, r0, #4                 ; move s pointer down by 4 columns
-
     vld1.u8     {d3}, [r2], r1             ; load s data
     vld1.u8     {d4}, [r2], r1
     vld1.u8     {d5}, [r2], r1
@@ -152,6 +153,7 @@ count_lf_v_loop
 
     add         r0, r0, r1, lsl #3         ; s += pitch * 8
     subs        r12, r12, #1
+    subne       r2, r0, #4                 ; move s pointer down by 4 columns
     bne         count_lf_v_loop
 
 end_vp9_lf_v_edge
@@ -163,6 +165,7 @@ end_vp9_lf_v_edge
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
 ;
+; Inputs:
 ; r0-r3, r12 PRESERVE
 ; d0    blimit
 ; d1    limit
@@ -175,39 +178,48 @@ end_vp9_lf_v_edge
 ; d16   q1
 ; d17   q2
 ; d18   q3
+;
+; Outputs:
+; d4    op1
+; d5    op0
+; d6    oq0
+; d7    oq1
 |vp9_loop_filter_neon| PROC
     ; filter_mask
-    vabd.u8     d19, d3, d4                 ; abs(p3 - p2)
-    vabd.u8     d20, d4, d5                 ; abs(p2 - p1)
-    vabd.u8     d21, d5, d6                 ; abs(p1 - p0)
-    vabd.u8     d22, d16, d7                ; abs(q1 - q0)
-    vabd.u8     d3, d17, d16                ; abs(q2 - q1)
-    vabd.u8     d4, d18, d17                ; abs(q3 - q2)
+    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
+    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
+    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
 
     ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20
-    vmax.u8     d20, d21, d22
-    vmax.u8     d3, d3, d4
-    vmax.u8     d23, d19, d20
+    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
 
     vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
 
+    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
+
+    vmov.u8     d18, #0x80
+
+    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
+
     ; hevmask
     vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
     vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     d23, d23, d3
-
-    vmov.u8     d18, #0x80
+    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
 
     vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
     vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
 
-    ; abs () > limit
-    vcge.u8     d23, d1, d23
+    veor        d7, d7, d18                 ; qs0
+
+    vcge.u8     d23, d1, d23                ; (m1 <= limit)*-1
 
     ; filter() function
     ; convert to signed
-    veor        d7, d7, d18                 ; qs0
+
     vshr.u8     d28, d28, #1                ; a = a / 2
     veor        d6, d6, d18                 ; ps0
 
@@ -244,19 +256,20 @@ end_vp9_lf_v_edge
     vshr.s8     d28, d28, #3                ; filter2 >>= 3
     vshr.s8     d27, d27, #3                ; filter1 >>= 3
 
-
     vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
     vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
 
-    ; outer tap adjustments: ++filter >> 1
-    vrshr.s8    d27, d27, #1
+    ; outer tap adjustments
+    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
+
+    veor        d6, d26, d18                ; *oq0 = u^0x80
+
     vbic        d27, d27, d22               ; filter &= ~hev
 
     vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
     vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
 
     veor        d5, d19, d18                ; *op0 = u^0x80
-    veor        d6, d26, d18                ; *oq0 = u^0x80
     veor        d4, d21, d18                ; *op1 = u^0x80
     veor        d7, d20, d18                ; *oq1 = u^0x80
 
@@ -277,13 +290,14 @@ end_vp9_lf_v_edge
 |vp9_mbloop_filter_horizontal_edge_neon| PROC
     push        {r4-r5, lr}
 
-    ldr         r12, [sp,#16]              ; load count
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r12, [sp, #16]             ; load count
+    ldr         r2, [sp, #12]              ; load thresh
     add         r1, r1, r1                 ; double pitch
+
     cmp         r12, #0
     beq         end_vp9_mblf_h_edge
 
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r2, [sp, #12]              ; load thresh
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
 
@@ -305,12 +319,12 @@ count_mblf_h_loop
 
     bl          vp9_mbloop_filter_neon
 
-    vst1.u8     {d2}, [r2@64], r1          ; store op2
-    vst1.u8     {d3}, [r3@64], r1          ; store op1
-    vst1.u8     {d4}, [r2@64], r1          ; store op0
-    vst1.u8     {d5}, [r3@64], r1          ; store oq0
-    vst1.u8     {d6}, [r2@64], r1          ; store oq1
-    vst1.u8     {d7}, [r3@64], r1          ; store oq2
+    vst1.u8     {d0}, [r2@64], r1          ; store op2
+    vst1.u8     {d1}, [r3@64], r1          ; store op1
+    vst1.u8     {d2}, [r2@64], r1          ; store op0
+    vst1.u8     {d3}, [r3@64], r1          ; store oq0
+    vst1.u8     {d4}, [r2@64], r1          ; store oq1
+    vst1.u8     {d5}, [r3@64], r1          ; store oq2
 
     add         r0, r0, #8
     subs        r12, r12, #1
@@ -337,18 +351,18 @@ end_vp9_mblf_h_edge
 |vp9_mbloop_filter_vertical_edge_neon| PROC
     push        {r4-r5, lr}
 
-    ldr         r12, [sp,#16]              ; load count
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    ldr         r12, [sp, #16]            ; load count
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #12]             ; load thresh
+    sub         r2, r0, #4                ; move s pointer down by 4 columns
     cmp         r12, #0
     beq         end_vp9_mblf_v_edge
 
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r2, [sp, #12]              ; load thresh
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
 count_mblf_v_loop
-    sub         r2, r0, #4                 ; move s pointer down by 4 columns
-
     vld1.u8     {d3}, [r2], r1             ; load s data
     vld1.u8     {d4}, [r2], r1
     vld1.u8     {d5}, [r2], r1
@@ -380,27 +394,28 @@ count_mblf_v_loop
     bl          vp9_mbloop_filter_neon
 
     ;store op2, op1, op0, oq0
-    vst4.8      {d2[0], d3[0], d4[0], d5[0]}, [r2], r1
-    vst4.8      {d2[1], d3[1], d4[1], d5[1]}, [r2], r1
-    vst4.8      {d2[2], d3[2], d4[2], d5[2]}, [r2], r1
-    vst4.8      {d2[3], d3[3], d4[3], d5[3]}, [r2], r1
-    vst4.8      {d2[4], d3[4], d4[4], d5[4]}, [r2], r1
-    vst4.8      {d2[5], d3[5], d4[5], d5[5]}, [r2], r1
-    vst4.8      {d2[6], d3[6], d4[6], d5[6]}, [r2], r1
-    vst4.8      {d2[7], d3[7], d4[7], d5[7]}, [r2]
+    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
+    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
+    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
+    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
+    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
+    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
+    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
+    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]
 
     ;store oq1, oq2
-    vst2.8      {d6[0], d7[0]}, [r3], r1
-    vst2.8      {d6[1], d7[1]}, [r3], r1
-    vst2.8      {d6[2], d7[2]}, [r3], r1
-    vst2.8      {d6[3], d7[3]}, [r3], r1
-    vst2.8      {d6[4], d7[4]}, [r3], r1
-    vst2.8      {d6[5], d7[5]}, [r3], r1
-    vst2.8      {d6[6], d7[6]}, [r3], r1
-    vst2.8      {d6[7], d7[7]}, [r3]
+    vst2.8      {d4[0], d5[0]}, [r3], r1
+    vst2.8      {d4[1], d5[1]}, [r3], r1
+    vst2.8      {d4[2], d5[2]}, [r3], r1
+    vst2.8      {d4[3], d5[3]}, [r3], r1
+    vst2.8      {d4[4], d5[4]}, [r3], r1
+    vst2.8      {d4[5], d5[5]}, [r3], r1
+    vst2.8      {d4[6], d5[6]}, [r3], r1
+    vst2.8      {d4[7], d5[7]}, [r3]
 
     add         r0, r0, r1, lsl #3         ; s += pitch * 8
     subs        r12, r12, #1
+    subne       r2, r0, #4                 ; move s pointer down by 4 columns
     bne         count_mblf_v_loop
 
 end_vp9_mblf_v_edge
@@ -412,6 +427,7 @@ end_vp9_mblf_v_edge
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
 ;
+; Inputs:
 ; r0-r3, r12 PRESERVE
 ; d0    blimit
 ; d1    limit
@@ -424,22 +440,38 @@ end_vp9_mblf_v_edge
 ; d16   q1
 ; d17   q2
 ; d18   q3
+;
+; Outputs:
+; d0    op2
+; d1    op1
+; d2    op0
+; d3    oq0
+; d4    oq1
+; d5    oq2
 |vp9_mbloop_filter_neon| PROC
     ; filter_mask
-    vabd.u8     d19, d3, d4                ; abs(p3 - p2)
-    vabd.u8     d20, d4, d5                ; abs(p2 - p1)
-    vabd.u8     d21, d5, d6                ; abs(p1 - p0)
-    vabd.u8     d22, d16, d7               ; abs(q1 - q0)
-    vabd.u8     d23, d17, d16              ; abs(q2 - q1)
-    vabd.u8     d24, d18, d17              ; abs(q3 - q2)
+    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
+    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
+    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)
 
     ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
-    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
-    vmax.u8     d23, d23, d24              ; max(abs(q2 - q1), abs(q3 - q2))
+    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)
+
+    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)
+
+    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)
+
+    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)
+
     vmax.u8     d19, d19, d20
 
-    vabd.u8     d24, d6, d7                ; abs(p0 - q0)
+    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
+    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
+    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)
 
     vmax.u8     d19, d19, d23
 
@@ -449,30 +481,35 @@ end_vp9_mblf_v_edge
     ; abs () > limit
     vcge.u8     d19, d1, d19
 
-    ; flatmask4
-    vabd.u8     d25, d6, d4                ; abs(p0 - p2)
-    vabd.u8     d26, d7, d17               ; abs(q0 - q2)
-    vabd.u8     d27, d3, d6                ; abs(p3 - p0)
-    vabd.u8     d28, d18, d7               ; abs(q3 - q0)
-
     ; only compare the largest value to thresh
-    vmax.u8     d25, d25, d26              ; max(abs(p0 - p2), abs(q0 - q2))
-    vmax.u8     d26, d27, d28              ; max(abs(p3 - p0), abs(q3 - q0))
-    vmax.u8     d25, d25, d26
-    vmax.u8     d20, d20, d25
+    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
+    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)
 
     vshr.u8     d23, d23, #1               ; a = a / 2
+
+    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)
+
     vqadd.u8    d24, d24, d23              ; a = b + a
 
+    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)
+
     vmov.u8     d23, #1
     vcge.u8     d24, d0, d24               ; a > blimit
 
+    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
+
     vcge.u8     d20, d23, d20              ; flat
 
     vand        d19, d19, d24              ; mask
 
+    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1
+
     vand        d20, d20, d19              ; flat & mask
 
+    vmov.u8     d22, #0x80
+
+    vorr        d23, d21, d23              ; hev
+
     ; This instruction will truncate the "flat & mask" masks down to 4 bits
     ; each to fit into one 32 bit arm register. The values are stored in
     ; q10.64[0].
@@ -480,35 +517,30 @@ end_vp9_mblf_v_edge
     vmov.u32    r4, d30[0]                 ; flat & mask 4bits
 
     adds        r5, r4, #1                 ; Check for all 1's
+
+    ; If (flat & mask) is all 1's for every vector, then we only need to
+    ; execute the power branch for all vectors.
     beq         power_branch_only
 
     cmp         r4, #0                     ; Check for 0, set flag for later
 
-    ; hevmask
-    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     d22, d22, d2               ; (abs(q1 - q0) > thresh)*-1
-    vorr        d21, d21, d22              ; hev
-
-    vmov.u8     d22, #0x80
-
     ; mbfilter() function
-
     ; filter() function
     ; convert to signed
-    veor        d23, d7, d22               ; qs0
+    veor        d21, d7, d22               ; qs0
     veor        d24, d6, d22               ; ps0
     veor        d25, d5, d22               ; ps1
     veor        d26, d16, d22              ; qs1
 
     vmov.u8     d27, #3
 
-    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
+    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)
 
     vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
 
     vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
 
-    vand        d29, d29, d21              ; filter &= hev
+    vand        d29, d29, d23              ; filter &= hev
 
     vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
 
@@ -525,80 +557,96 @@ end_vp9_mblf_v_edge
     vshr.s8     d29, d29, #3               ; filter1 >>= 3
 
     vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
-    vqsub.s8    d23, d23, d29              ; oq0 = clamp(qs0 - filter1)
+    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)
 
     ; outer tap adjustments: ++filter1 >> 1
     vrshr.s8    d29, d29, #1
-    vbic        d29, d29, d21              ; filter &= ~hev
+    vbic        d29, d29, d23              ; filter &= ~hev
 
     vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
     vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
 
+    ; If (flat & mask) is 0 for every vector, then we only need to execute
+    ; the filter branch for all vectors.
     beq         filter_branch_only
 
+    ; If mask and flat are mixed then we must perform both branches and
+    ; combine the data.
     veor        d24, d24, d22              ; *f_op0 = u^0x80
-    veor        d23, d23, d22              ; *f_oq0 = u^0x80
+    veor        d21, d21, d22              ; *f_oq0 = u^0x80
     veor        d25, d25, d22              ; *f_op1 = u^0x80
     veor        d26, d26, d22              ; *f_oq1 = u^0x80
 
-    ; mbfilter flat && mask branch
-    ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
-    ; and using vibt on the q's?
-    vmov.u8     d21, #2
-    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
-    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
-    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
-    vaddw.u8    q14, d5                    ; op2 += p1
+    ; At this point we have already executed the filter branch. The filter
+    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
+    ; branch and combine the data.
+    vmov.u8     d23, #2
+    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
+    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
+    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2
+
+    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)
+
+    vaddw.u8    q14, d5                    ; r_op2 += p1
+
+    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
+
     vqrshrn.u16 d30, q14, #3               ; r_op2
 
-    vsubw.u8    q14, d3                    ; op1 = op2 - p3
-    vsubw.u8    q14, d4                    ; op1 -= p2
-    vaddw.u8    q14, d5                    ; op1 += p1
-    vaddw.u8    q14, d16                   ; op1 += q1
+    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
+    vsubw.u8    q14, d4                    ; r_op1 -= p2
+    vaddw.u8    q14, d5                    ; r_op1 += p1
+    vaddw.u8    q14, d16                   ; r_op1 += q1
+
+    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
+
     vqrshrn.u16 d31, q14, #3               ; r_op1
 
-    vsubw.u8    q14, d3                    ; op0 = op1 - p3
-    vsubw.u8    q14, d5                    ; op0 -= p1
-    vaddw.u8    q14, d6                    ; op0 += p0
-    vaddw.u8    q14, d17                   ; op0 += q2
-    vqrshrn.u16 d21, q14, #3               ; r_op0
+    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
+    vsubw.u8    q14, d5                    ; r_op0 -= p1
+    vaddw.u8    q14, d6                    ; r_op0 += p0
+    vaddw.u8    q14, d17                   ; r_op0 += q2
+
+    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)
+
+    vqrshrn.u16 d23, q14, #3               ; r_op0
+
+    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
+    vsubw.u8    q14, d6                    ; r_oq0 -= p0
+    vaddw.u8    q14, d7                    ; r_oq0 += q0
+
+    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)
 
-    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
-    vsubw.u8    q14, d6                    ; oq0 -= p0
-    vaddw.u8    q14, d7                    ; oq0 += q0
     vaddw.u8    q14, d18                   ; oq0 += q3
+
+    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)
+
     vqrshrn.u16 d22, q14, #3               ; r_oq0
 
-    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
-    vsubw.u8    q14, d7                    ; oq1 -= q0
-    vaddw.u8    q14, d16                   ; oq1 += q1
-    vaddw.u8    q14, d18                   ; oq1 += q3
-    vqrshrn.u16 d0, q14, #3                ; r_oq1
+    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
+    vsubw.u8    q14, d7                    ; r_oq1 -= q0
+    vaddw.u8    q14, d16                   ; r_oq1 += q1
 
-    vsubw.u8    q14, d5                    ; oq2 = oq0 - p1
-    vsubw.u8    q14, d16                   ; oq2 -= q1
-    vaddw.u8    q14, d17                   ; oq2 += q2
-    vaddw.u8    q14, d18                   ; oq2 += q3
-    vqrshrn.u16 d1, q14, #3                ; r_oq2
+    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)
+
+    vaddw.u8    q14, d18                   ; r_oq1 += q3
 
-    ; Filter does not set op2 or oq2, so use p2 and q2.
-    vbit        d2, d30, d20               ; op2 |= r_op2 & (flat & mask)
-    vbif        d2, d4, d20                ; op2 |= op2 & ~(flat & mask)
+    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
 
-    vbit        d3, d31, d20               ; op1 |= r_op1 & (flat & mask)
-    vbif        d3, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
+    vqrshrn.u16 d6, q14, #3                ; r_oq1
 
-    vbit        d4, d21, d20               ; op0 |= r_op0 & (flat & mask)
-    vbif        d4, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
+    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
+    vsubw.u8    q14, d16                   ; r_oq2 -= q1
+    vaddw.u8    q14, d17                   ; r_oq2 += q2
+    vaddw.u8    q14, d18                   ; r_oq2 += q3
 
-    vbit        d5, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
-    vbif        d5, d23, d20               ; oq0 |= f_oq0 & ~(flat & mask)
+    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)
 
-    vbit        d6, d0, d20                ; oq1 |= r_oq1 & (flat & mask)
-    vbif        d6, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
+    vqrshrn.u16 d7, q14, #3                ; r_oq2
 
-    vbit        d7, d1, d20                ; oq2 |= r_oq2 & (flat & mask)
-    vbif        d7, d17, d20               ; oq2 |= oq2 & ~(flat & mask)
+    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
+    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
+    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)
 
     bx          lr
 
@@ -609,53 +657,49 @@ power_branch_only
     vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
     vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
     vaddw.u8    q14, d5                    ; op2 += p1
-    vqrshrn.u16 d2, q14, #3                ; op2
+    vqrshrn.u16 d0, q14, #3                ; op2
 
     vsubw.u8    q14, d3                    ; op1 = op2 - p3
     vsubw.u8    q14, d4                    ; op1 -= p2
     vaddw.u8    q14, d5                    ; op1 += p1
     vaddw.u8    q14, d16                   ; op1 += q1
-    vqrshrn.u16 d31, q14, #3               ; op1
+    vqrshrn.u16 d1, q14, #3                ; op1
 
     vsubw.u8    q14, d3                    ; op0 = op1 - p3
     vsubw.u8    q14, d5                    ; op0 -= p1
     vaddw.u8    q14, d6                    ; op0 += p0
     vaddw.u8    q14, d17                   ; op0 += q2
-    vqrshrn.u16 d21, q14, #3               ; op0
+    vqrshrn.u16 d2, q14, #3                ; op0
 
     vsubw.u8    q14, d3                    ; oq0 = op0 - p3
     vsubw.u8    q14, d6                    ; oq0 -= p0
     vaddw.u8    q14, d7                    ; oq0 += q0
     vaddw.u8    q14, d18                   ; oq0 += q3
-    vqrshrn.u16 d22, q14, #3               ; oq0
+    vqrshrn.u16 d3, q14, #3                ; oq0
 
     vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
     vsubw.u8    q14, d7                    ; oq1 -= q0
     vaddw.u8    q14, d16                   ; oq1 += q1
     vaddw.u8    q14, d18                   ; oq1 += q3
-    vqrshrn.u16 d6, q14, #3                ; oq1
+    vqrshrn.u16 d4, q14, #3                ; oq1
 
-    vsubw.u8    q14, d5                    ; oq2 = oq0 - p1
+    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
     vsubw.u8    q14, d16                   ; oq2 -= q1
     vaddw.u8    q14, d17                   ; oq2 += q2
     vaddw.u8    q14, d18                   ; oq2 += q3
-    vqrshrn.u16 d7, q14, #3                ; oq2
-
-    vswp        d3, d31
-    vswp        d4, d21
-    vswp        d5, d22
+    vqrshrn.u16 d5, q14, #3                ; oq2
 
     bx          lr
 
 filter_branch_only
     ; TODO(fgalligan): See if we can rearange registers so we do not need to
     ; do the 2 vswp.
-    vswp        d2, d4                      ; op2
-    vswp        d7, d17                     ; oq2
-    veor        d4, d24, d22                ; *op0 = u^0x80
-    veor        d5, d23, d22                ; *oq0 = u^0x80
-    veor        d3, d25, d22                ; *op1 = u^0x80
-    veor        d6, d26, d22                ; *oq1 = u^0x80
+    vswp        d0, d4                      ; op2
+    vswp        d5, d17                     ; oq2
+    veor        d2, d24, d22                ; *op0 = u^0x80
+    veor        d3, d21, d22                ; *oq0 = u^0x80
+    veor        d1, d25, d22                ; *op1 = u^0x80
+    veor        d4, d26, d22                ; *oq1 = u^0x80
 
     bx          lr
 
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 0a4f921c2..554a31730 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -53,7 +53,6 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) {
   for (i = 0; i < NUM_YV12_BUFFERS; i++)
     vp9_free_frame_buffer(&oci->yv12_fb[i]);
 
-  vp9_free_frame_buffer(&oci->temp_scale_frame);
   vp9_free_frame_buffer(&oci->post_proc_buffer);
 
   vpx_free(oci->mip);
@@ -121,10 +120,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
     oci->fb_idx_ref_cnt[i] = 1;
   }
 
-  if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, ss_x, ss_y,
-                             VP9BORDERINPIXELS) < 0)
-    goto fail;
-
   if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
                              VP9BORDERINPIXELS) < 0)
     goto fail;
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index d5b51e89d..dee44ec63 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -17,11 +17,54 @@ const int b_width_log2_lookup[BLOCK_SIZE_TYPES] =
   {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
 const int b_height_log2_lookup[BLOCK_SIZE_TYPES] =
   {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
+const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
+  {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
+const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] =
+  {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
 // Log 2 conversion lookup tables for modeinfo width and height
 const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] =
   {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
+const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
+  {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
 const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] =
   {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
+const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] =
+  {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
+
+const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
+  {  // 4X4
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID
+  }, {  // 8X8
+  // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID
+  }, {  // 16X16
+  // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID
+  }, {  // 32X32
+  // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
+    PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID
+  }, {  // 64X64
+  // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
+    PARTITION_NONE
+  }
+};
 
 const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = {
   {     // PARTITION_NONE
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index 52c314897..8b0f8a500 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -17,6 +17,14 @@ extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES];
 extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES];
 extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES];
 extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES];
+extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES];
+extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES];
+extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES];
+extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES];
+extern const PARTITION_TYPE
+  partition_lookup[][BLOCK_SIZE_TYPES];
+
+
 extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES];
 extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES];
 extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES];
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 855c5e3de..86f0d0bfd 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -35,7 +35,7 @@ typedef enum BLOCK_SIZE_TYPE {
   BLOCK_SIZE_SB32X64, BLOCK_32X64 = BLOCK_SIZE_SB32X64,
   BLOCK_SIZE_SB64X32, BLOCK_64X32 = BLOCK_SIZE_SB64X32,
   BLOCK_SIZE_SB64X64, BLOCK_64X64 = BLOCK_SIZE_SB64X64,
-  BLOCK_SIZE_TYPES, BLOCK_MAX_SB_SEGMENTS = BLOCK_SIZE_TYPES,
+  BLOCK_SIZE_TYPES, BLOCK_MAX_SB_SEGMENTS = BLOCK_SIZE_TYPES
 } BLOCK_SIZE_TYPE;
 
 typedef enum PARTITION_TYPE {
@@ -43,7 +43,7 @@ typedef enum PARTITION_TYPE {
   PARTITION_HORZ,
   PARTITION_VERT,
   PARTITION_SPLIT,
-  PARTITION_TYPES
+  PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES
 } PARTITION_TYPE;
 
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index d8be8765a..5498b1717 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -87,7 +87,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
     lf->last_sharpness_level = lf->sharpness_level;
   }
 
-  for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
+  for (seg = 0; seg < MAX_SEGMENTS; seg++) {
     int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;
 
     // Set the baseline filter values for each segment
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index fddf2ce82..e59cc6485 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -31,7 +31,7 @@ typedef struct {
                   lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
   DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
                   hev_thr[4][SIMD_WIDTH]);
-  uint8_t lvl[MAX_MB_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
+  uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
   uint8_t mode_lf_lut[MB_MODE_COUNT];
 } loop_filter_info_n;
 
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index fe8122b46..152046f6f 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -22,7 +22,7 @@ extern "C"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_ppflags.h"
 
-#define MAX_MB_SEGMENTS 8
+#define MAX_SEGMENTS 8
 
   typedef int *VP9_PTR;
 
@@ -200,9 +200,9 @@ extern "C"
 
   int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
                      unsigned int rows, unsigned int cols,
-                     int delta_q[MAX_MB_SEGMENTS],
-                     int delta_lf[MAX_MB_SEGMENTS],
-                     unsigned int threshold[MAX_MB_SEGMENTS]);
+                     int delta_q[MAX_SEGMENTS],
+                     int delta_lf[MAX_SEGMENTS],
+                     unsigned int threshold[MAX_SEGMENTS]);
 
   int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
                          unsigned int rows, unsigned int cols);
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 2efdf8fa3..8b76ac711 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -130,10 +130,7 @@ typedef struct VP9Common {
   struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME];
   int new_fb_idx;
 
-
   YV12_BUFFER_CONFIG post_proc_buffer;
-  YV12_BUFFER_CONFIG temp_scale_frame;
-
 
   FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
   FRAME_TYPE frame_type;
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index ea2b0f418..71fca4cb9 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -445,6 +445,6 @@ int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
       segment_id = MIN(segment_id,
                        segment_ids[mi_offset + y * cm->mi_cols + x]);
 
-  assert(segment_id >= 0 && segment_id < MAX_MB_SEGMENTS);
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
   return segment_id;
 }
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index f072a518d..f22239b92 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -16,8 +16,8 @@
 #define SEGMENT_DELTADATA   0
 #define SEGMENT_ABSDATA     1
 
-#define MAX_MB_SEGMENTS     8
-#define MB_SEG_TREE_PROBS   (MAX_MB_SEGMENTS-1)
+#define MAX_SEGMENTS     8
+#define SEG_TREE_PROBS   (MAX_SEGMENTS-1)
 
 #define PREDICTION_PROBS 3
 
@@ -27,7 +27,7 @@ typedef enum {
   SEG_LVL_ALT_LF = 1,              // Use alternate loop filter value...
   SEG_LVL_REF_FRAME = 2,           // Optional Segment reference frame
   SEG_LVL_SKIP = 3,                // Optional Segment (0,0) + skip mode
-  SEG_LVL_MAX = 4                  // Number of MB level features supported
+  SEG_LVL_MAX = 4                  // Number of features supported
 } SEG_LVL_FEATURES;
 
 
@@ -38,11 +38,11 @@ struct segmentation {
   uint8_t abs_delta;
   uint8_t temporal_update;
 
-  vp9_prob tree_probs[MB_SEG_TREE_PROBS];
+  vp9_prob tree_probs[SEG_TREE_PROBS];
   vp9_prob pred_probs[PREDICTION_PROBS];
 
-  int16_t feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
-  unsigned int feature_mask[MAX_MB_SEGMENTS];
+  int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
+  unsigned int feature_mask[MAX_SEGMENTS];
 };
 
 int vp9_segfeature_active(const struct segmentation *seg,
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 6660f5b8e..0fdba805d 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -86,7 +86,7 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
   const int ymis = MIN(cm->mi_rows - mi_row, bh);
   int x, y;
 
-  assert(segment_id >= 0 && segment_id < MAX_MB_SEGMENTS);
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
 
   for (y = 0; y < ymis; y++)
     for (x = 0; x < xmis; x++)
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 61c14b8dc..6f7908ffc 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -400,7 +400,7 @@ static void setup_segmentation(struct segmentation *seg,
   // Segmentation map update
   seg->update_map = vp9_rb_read_bit(rb);
   if (seg->update_map) {
-    for (i = 0; i < MB_SEG_TREE_PROBS; i++)
+    for (i = 0; i < SEG_TREE_PROBS; i++)
       seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8)
                                                : MAX_PROB;
 
@@ -422,7 +422,7 @@ static void setup_segmentation(struct segmentation *seg,
 
     vp9_clearall_segfeatures(seg);
 
-    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+    for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
         int data = 0;
         const int feature_enabled = vp9_rb_read_bit(rb);
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 07cb2b83e..2fede1580 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -459,10 +459,10 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
       write_intra_mode(bc, mode, pc->fc.y_mode_prob[MIN(3, bsl)]);
     } else {
       int idx, idy;
-      int bw = 1 << b_width_log2(mi->sb_type);
-      int bh = 1 << b_height_log2(mi->sb_type);
-      for (idy = 0; idy < 2; idy += bh)
-        for (idx = 0; idx < 2; idx += bw) {
+      int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mi->sb_type];
+      int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mi->sb_type];
+      for (idy = 0; idy < 2; idy += num_4x4_blocks_high)
+        for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
           const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode;
           write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]);
         }
@@ -498,11 +498,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
       int j;
       MB_PREDICTION_MODE blockmode;
       int_mv blockmv;
-      int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl;
-      int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl;
+      int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mi->sb_type];
+      int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mi->sb_type];
       int idx, idy;
-      for (idy = 0; idy < 2; idy += bh) {
-        for (idx = 0; idx < 2; idx += bw) {
+      for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+        for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
           j = idy * 2 + idx;
           blockmode = cpi->mb.partition_info->bmi[j].mode;
           blockmv = m->bmi[j].as_mv[0];
@@ -563,10 +563,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
     write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]);
   } else {
     int idx, idy;
-    int bw = 1 << b_width_log2(m->mbmi.sb_type);
-    int bh = 1 << b_height_log2(m->mbmi.sb_type);
-    for (idy = 0; idy < 2; idy += bh) {
-      for (idx = 0; idx < 2; idx += bw) {
+    int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[m->mbmi.sb_type];
+    int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type];
+    for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+      for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
         int i = idy * 2 + idx;
         const MB_PREDICTION_MODE A = above_block_mode(m, i, mis);
         const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
@@ -619,7 +619,6 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   const int mis = cm->mode_info_stride;
-  int bwl, bhl;
   int bsl = b_width_log2(bsize);
   int bs = (1 << bsl) / 4;  // mode_info step for subsize
   int n;
@@ -629,20 +628,7 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  bwl = b_width_log2(m->mbmi.sb_type);
-  bhl = b_height_log2(m->mbmi.sb_type);
-
-  // parse the partition type
-  if ((bwl == bsl) && (bhl == bsl))
-    partition = PARTITION_NONE;
-  else if ((bwl == bsl) && (bhl < bsl))
-    partition = PARTITION_HORZ;
-  else if ((bwl < bsl) && (bhl == bsl))
-    partition = PARTITION_VERT;
-  else if ((bwl < bsl) && (bhl < bsl))
-    partition = PARTITION_SPLIT;
-  else
-    assert(0);
+  partition = partition_lookup[bsl][m->mbmi.sb_type];
 
   if (bsize < BLOCK_SIZE_SB8X8)
     if (xd->ab_index > 0)
@@ -1011,7 +997,7 @@ static void encode_segmentation(VP9_COMP *cpi,
     // Select the coding strategy (temporal or spatial)
     vp9_choose_segmap_coding_method(cpi);
     // Write out probabilities used to decode unpredicted  macro-block segments
-    for (i = 0; i < MB_SEG_TREE_PROBS; i++) {
+    for (i = 0; i < SEG_TREE_PROBS; i++) {
       const int prob = seg->tree_probs[i];
       const int update = prob != MAX_PROB;
       vp9_wb_write_bit(wb, update);
@@ -1037,7 +1023,7 @@ static void encode_segmentation(VP9_COMP *cpi,
   if (seg->update_data) {
     vp9_wb_write_bit(wb, seg->abs_delta);
 
-    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+    for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
         const int active = vp9_segfeature_active(seg, i, j);
         vp9_wb_write_bit(wb, active);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index d6882d585..4b49b17a2 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -96,6 +96,7 @@ struct macroblock {
   signed int act_zbin_adj;
 
   int mv_best_ref_index[MAX_REF_FRAMES];
+  unsigned int max_mv_context[MAX_REF_FRAMES];
 
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 2e7cb291d..502308766 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -323,7 +323,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
 
   int mb_mode_index = ctx->best_mode_index;
   const int mis = cpi->common.mode_info_stride;
-  const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize);
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
 
   assert(mi->mbmi.mode < MB_MODE_COUNT);
   assert(mb_mode_index < MAX_MODES);
@@ -333,10 +334,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
 
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
-  for (y = 0; y < bh; y++) {
-    for (x_idx = 0; x_idx < bw; x_idx++) {
-      if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx
-          && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) {
+  for (y = 0; y < mi_height; y++) {
+    for (x_idx = 0; x_idx < mi_width; x_idx++) {
+      if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > x_idx
+          && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y) {
         MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
         *mi_addr = *mi;
       }
@@ -412,10 +413,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
 
     if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) {
       int i, j;
-      for (j = 0; j < bh; ++j)
-        for (i = 0; i < bw; ++i)
-          if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i
-              && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j)
+      for (j = 0; j < mi_height; ++j)
+        for (i = 0; i < mi_width; ++i)
+          if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > i
+              && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > j)
             xd->mode_info_context[mis * j + i].mbmi = *mbmi;
     }
 
@@ -459,7 +460,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
   MB_MODE_INFO *mbmi;
   const int dst_fb_idx = cm->new_fb_idx;
   const int idx_str = xd->mode_info_stride * mi_row + mi_col;
-  const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
   const int mb_row = mi_row >> 1;
   const int mb_col = mi_col >> 1;
   const int idx_map = mb_row * cm->mb_cols + mb_col;
@@ -496,13 +498,13 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
   x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
   x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
   x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE
-      + (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND));
+      + (VP9BORDERINPIXELS - MI_SIZE * mi_height - VP9_INTERP_EXTEND));
   x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE
-      + (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND));
+      + (VP9BORDERINPIXELS - MI_SIZE * mi_width - VP9_INTERP_EXTEND));
 
   // Set up distance of MB to edge of frame in 1/8th pel units
-  assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1)));
-  set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
+  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+  set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width);
 
   /* set up source buffers */
   vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
@@ -676,23 +678,27 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
   MACROBLOCK * const x = &cpi->mb;
   MACROBLOCKD * const xd = &x->e_mbd;
   int p;
-  int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  int bhl = b_height_log2(bsize), bh = 1 << bhl;
-  int mwl = mi_width_log2(bsize), mw = 1 << mwl;
-  int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+  int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int mi_height = num_8x8_blocks_high_lookup[bsize];
   for (p = 0; p < MAX_MB_PLANE; p++) {
     vpx_memcpy(
         cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
-        a + bw * p, sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+        a + num_4x4_blocks_wide * p,
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+        xd->plane[p].subsampling_x);
     vpx_memcpy(
         cm->left_context[p]
-            + ((mi_row & MI_MASK)* 2 >> xd->plane[p].subsampling_y),l + bh * p,
-            sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
-          }
+            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+        l + num_4x4_blocks_high * p,
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+        xd->plane[p].subsampling_y);
+  }
   vpx_memcpy(cm->above_seg_context + mi_col, sa,
-             sizeof(PARTITION_CONTEXT) * mw);
+             sizeof(PARTITION_CONTEXT) * mi_width);
   vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
-             sizeof(PARTITION_CONTEXT) * mh);
+             sizeof(PARTITION_CONTEXT) * mi_height);
 }
 static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
@@ -703,27 +709,30 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
   MACROBLOCK * const x = &cpi->mb;
   MACROBLOCKD * const xd = &x->e_mbd;
   int p;
-  int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  int bhl = b_height_log2(bsize), bh = 1 << bhl;
-  int mwl = mi_width_log2(bsize), mw = 1 << mwl;
-  int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+  int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int mi_height = num_8x8_blocks_high_lookup[bsize];
 
   // buffer the above/left context information of the block in search.
   for (p = 0; p < MAX_MB_PLANE; ++p) {
     vpx_memcpy(
-        a + bw * p,
+        a + num_4x4_blocks_wide * p,
         cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
-        sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+        xd->plane[p].subsampling_x);
     vpx_memcpy(
-        l + bh * p,
+        l + num_4x4_blocks_high * p,
         cm->left_context[p]
-            + ((mi_row & MI_MASK)* 2 >> xd->plane[p].subsampling_y),sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
-          }
+            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+        xd->plane[p].subsampling_y);
+  }
   vpx_memcpy(sa, cm->above_seg_context + mi_col,
-             sizeof(PARTITION_CONTEXT) * mw);
+             sizeof(PARTITION_CONTEXT) * mi_width);
   vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
-  sizeof(PARTITION_CONTEXT) * mh)
-             ;}
+             sizeof(PARTITION_CONTEXT) * mi_height);
+}
 
 static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
                      int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) {
@@ -759,8 +768,10 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
   MACROBLOCKD * const xd = &x->e_mbd;
   BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
   const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
-  int bwl, bhl;
   int UNINITIALIZED_IS_SAFE(pl);
+  PARTITION_TYPE partition;
+  BLOCK_SIZE_TYPE subsize;
+  int i;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -771,44 +782,46 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
     pl = partition_plane_context(xd, bsize);
     c1 = *(get_sb_partitioning(x, bsize));
   }
+  partition = partition_lookup[bsl][c1];
 
-  bwl = b_width_log2(c1), bhl = b_height_log2(c1);
-
-  if (bsl == bwl && bsl == bhl) {
-    if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
-      cpi->partition_count[pl][PARTITION_NONE]++;
-    encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
-  } else if (bsl == bhl && bsl > bwl) {
-    if (output_enabled)
-      cpi->partition_count[pl][PARTITION_VERT]++;
-    encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
-    encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
-  } else if (bsl == bwl && bsl > bhl) {
-    if (output_enabled)
-      cpi->partition_count[pl][PARTITION_HORZ]++;
-    encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
-    encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
-  } else {
-    BLOCK_SIZE_TYPE subsize;
-    int i;
-
-    assert(bwl < bsl && bhl < bsl);
-    subsize = get_subsize(bsize, PARTITION_SPLIT);
+  switch (partition) {
+    case PARTITION_NONE:
+      if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
+        cpi->partition_count[pl][PARTITION_NONE]++;
+      encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
+      break;
+    case PARTITION_VERT:
+      if (output_enabled)
+        cpi->partition_count[pl][PARTITION_VERT]++;
+      encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
+      encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
+      break;
+    case PARTITION_HORZ:
+      if (output_enabled)
+        cpi->partition_count[pl][PARTITION_HORZ]++;
+      encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
+      encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
+      break;
+    case PARTITION_SPLIT:
+      subsize = get_subsize(bsize, PARTITION_SPLIT);
 
-    if (output_enabled)
-      cpi->partition_count[pl][PARTITION_SPLIT]++;
+      if (output_enabled)
+        cpi->partition_count[pl][PARTITION_SPLIT]++;
 
-    for (i = 0; i < 4; i++) {
-      const int x_idx = i & 1, y_idx = i >> 1;
+      for (i = 0; i < 4; i++) {
+        const int x_idx = i & 1, y_idx = i >> 1;
 
-      *(get_sb_index(xd, subsize)) = i;
-      encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
-                output_enabled, subsize);
-    }
+        *(get_sb_index(xd, subsize)) = i;
+        encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
+                  output_enabled, subsize);
+      }
+      break;
+    default:
+      assert(0);
+      break;
   }
 
-  if (bsize >= BLOCK_SIZE_SB8X8
-      && (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
+  if (partition != PARTITION_SPLIT || bsize == BLOCK_SIZE_SB8X8) {
     set_partition_seg_context(cm, xd, mi_row, mi_col);
     update_partition_context(xd, c1, bsize);
   }
@@ -1159,13 +1172,11 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
   MACROBLOCK * const x = &cpi->mb;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   const int mis = cm->mode_info_stride;
-  int bwl = b_width_log2(m->mbmi.sb_type);
-  int bhl = b_height_log2(m->mbmi.sb_type);
   int bsl = b_width_log2(bsize);
-  int bs = (1 << bsl);
-  int bh = (1 << bhl);
-  int ms = bs / 2;
-  int mh = bh / 2;
+  int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int ms = num_4x4_blocks_wide / 2;
+  int mh = num_4x4_blocks_high / 2;
   int bss = (1 << bsl) / 4;
   int i, pl;
   PARTITION_TYPE partition = PARTITION_NONE;
@@ -1187,17 +1198,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  // parse the partition type
-  if ((bwl == bsl) && (bhl == bsl))
-    partition = PARTITION_NONE;
-  else if ((bwl == bsl) && (bhl < bsl))
-    partition = PARTITION_HORZ;
-  else if ((bwl < bsl) && (bhl == bsl))
-    partition = PARTITION_VERT;
-  else if ((bwl < bsl) && (bhl < bsl))
-    partition = PARTITION_SPLIT;
-  else
-    assert(0);
+  partition = partition_lookup[bsl][bs_type];
 
   subsize = get_subsize(bsize, partition);
 
@@ -1340,8 +1341,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
 
     // Split partition.
     for (i = 0; i < 4; i++) {
-      int x_idx = (i & 1) * (bs >> 2);
-      int y_idx = (i >> 1) * (bs >> 2);
+      int x_idx = (i & 1) * (num_4x4_blocks_wide >> 2);
+      int y_idx = (i >> 1) * (num_4x4_blocks_wide >> 2);
       int rt = 0;
       int64_t dt = 0;
       ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
@@ -2468,10 +2469,12 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
     ++cpi->y_mode_count[MIN(bsl, 3)][m];
   } else {
     int idx, idy;
-    int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type);
-    int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type);
-    for (idy = 0; idy < 2; idy += bh) {
-      for (idx = 0; idx < 2; idx += bw) {
+    int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[
+      xd->mode_info_context->mbmi.sb_type];
+    int num_4x4_blocks_high = num_4x4_blocks_high_lookup[
+      xd->mode_info_context->mbmi.sb_type];
+    for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+      for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
         int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode;
         ++cpi->y_mode_count[0][m];
       }
@@ -2509,8 +2512,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
   MB_MODE_INFO *mbmi = &mi->mbmi;
   unsigned int segment_id = mbmi->segment_id;
   const int mis = cm->mode_info_stride;
-  const int bwl = mi_width_log2(bsize);
-  const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
   x->rd_search = 0;
   x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
                     xd->q_index < QIDX_SKIP_THRESH);
@@ -2635,8 +2638,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
         sz = TX_4X4;
       }
 
-      for (y = 0; y < bh; y++) {
-        for (x = 0; x < bw; x++) {
+      for (y = 0; y < mi_height; y++) {
+        for (x = 0; x < mi_width; x++) {
           if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
             mi[mis * y + x].mbmi.txfm_size = sz;
           }
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 6a918926d..710417948 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -441,7 +441,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
   TX_TYPE tx_type;
   const int16_t *scan, *iscan;
   uint16_t *eob = &pd->eobs[block];
-  const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl;
+  const int bwl = plane_block_width_log2by4(bsize, pd), bw = 1 << bwl;
   const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
   int xoff, yoff;
   int16_t *src_diff;
@@ -533,6 +533,8 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 
   if (x->skip_encode)
     return;
+  if (pd->eobs[block] == 0)
+    return;
 
   switch (ss_txfrm_size / 2) {
     case TX_32X32:
@@ -657,7 +659,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, p->zbin_extra, eob, scan, iscan);
-      if (!x->skip_encode)
+      if (!x->skip_encode && *eob)
         vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
       break;
     case TX_16X16:
@@ -682,7 +684,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
                      p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
-      if (!x->skip_encode) {
+      if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)
           vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
         else
@@ -711,7 +713,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
       vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                      p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
-      if (!x->skip_encode) {
+      if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)
           vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
         else
@@ -743,7 +745,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
       vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
                      p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
-      if (!x->skip_encode) {
+      if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 500f57442..f0c34b373 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -486,11 +486,11 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
   if (mv_joint_horizontal(j))
     encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
 
-  // If auto_mv_step_size is enabled and it is an arf/non shown frame
-  // then keep track of the largest motion vector component used.
-  if (cpi->sf.auto_mv_step_size && !cpi->common.show_frame) {
-    cpi->max_mv_magnitude = MAX((MAX(abs(mv->row), abs(mv->col)) >> 3),
-                                cpi->max_mv_magnitude);
+  // If auto_mv_step_size is enabled and cpi->dummy_packing is not set, keep
+  // track of the largest motion vector component used.
+  if (!cpi->dummy_packing && cpi->sf.auto_mv_step_size) {
+    unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
+    cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
   }
 }
 
@@ -513,14 +513,14 @@ void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
   MODE_INFO *mi = x->e_mbd.mode_info_context;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   MV diff;
-  const int bw = 1 << b_width_log2(mbmi->sb_type);
-  const int bh = 1 << b_height_log2(mbmi->sb_type);
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   int idx, idy;
 
   if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
     PARTITION_INFO *pi = x->partition_info;
-    for (idy = 0; idy < 2; idy += bh) {
-      for (idx = 0; idx < 2; idx += bw) {
+    for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+      for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
         const int i = idy * 2 + idx;
         if (pi->bmi[i].mode == NEWMV) {
           diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 5b7bed463..0be98913e 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -19,11 +19,13 @@
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_common.h"
 
+// #define NEW_DIAMOND_SEARCH
+
 void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
   int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
-                                 ((ref_mv->as_mv.col & 7) ? 1 : 0);
+                ((ref_mv->as_mv.col & 7) ? 1 : 0);
   int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
-                                 ((ref_mv->as_mv.row & 7) ? 1 : 0);
+                ((ref_mv->as_mv.row & 7) ? 1 : 0);
   int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
   int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
 
@@ -1511,12 +1513,13 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
       this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
       this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
 
-      if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
-
-      {
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
         check_here = ss[i].offset + best_address;
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+                              bestsad);
 
         if (thissad < bestsad) {
           this_mv.as_mv.row = this_row_offset;
@@ -1539,6 +1542,34 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
       best_mv->as_mv.col += ss[best_site].mv.col;
       best_address += ss[best_site].offset;
       last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+      while (1) {
+        this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row;
+        this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col;
+        if ((this_col_offset > x->mv_col_min) &&
+            (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) &&
+            (this_row_offset < x->mv_row_max)) {
+          check_here = ss[best_site].offset + best_address;
+          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+                                bestsad);
+          if (thissad < bestsad) {
+            this_mv.as_mv.row = this_row_offset;
+            this_mv.as_mv.col = this_col_offset;
+            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                      mvjsadcost, mvsadcost, sad_per_bit);
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_mv->as_mv.row += ss[best_site].mv.row;
+              best_mv->as_mv.col += ss[best_site].mv.col;
+              best_address += ss[best_site].offset;
+              continue;
+            }
+          }
+        }
+        break;
+      };
+#endif
     } else if (best_address == in_what)
       (*num00)++;
   }
@@ -1680,12 +1711,39 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
         i++;
       }
     }
-
     if (best_site != last_site) {
       best_mv->as_mv.row += ss[best_site].mv.row;
       best_mv->as_mv.col += ss[best_site].mv.col;
       best_address += ss[best_site].offset;
       last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+      while (1) {
+        this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row;
+        this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col;
+        if ((this_col_offset > x->mv_col_min) &&
+            (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) &&
+            (this_row_offset < x->mv_row_max)) {
+          check_here = ss[best_site].offset + best_address;
+          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+                                bestsad);
+          if (thissad < bestsad) {
+            this_mv.as_mv.row = this_row_offset;
+            this_mv.as_mv.col = this_col_offset;
+            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                      mvjsadcost, mvsadcost, sad_per_bit);
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_mv->as_mv.row += ss[best_site].mv.row;
+              best_mv->as_mv.col += ss[best_site].mv.col;
+              best_address += ss[best_site].offset;
+              continue;
+            }
+          }
+        }
+        break;
+      };
+#endif
     } else if (best_address == in_what)
       (*num00)++;
   }
@@ -1706,6 +1764,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
+
 int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
                            int_mv *mvp_full, int step_param,
                            int sadpb, int further_steps,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index fe276fa6b..7b50e076e 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -706,12 +706,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
     cpi->mode_chosen_counts[i] = 0;
   }
 
-  // Initialize cpi->max_mv_magnitude if appropriate.
-  if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only ||
-    (cpi->common.show_frame == 0)) {
-    cpi->max_mv_magnitude = 0;
-  }
-
   // best quality defaults
   sf->RD = 1;
   sf->search_method = NSTEP;
@@ -773,7 +767,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
 #else
       sf->static_segmentation = 0;
 #endif
-      sf->auto_mv_step_size = 1;
       sf->use_avoid_tested_higherror = 1;
       sf->adaptive_rd_thresh = 1;
       sf->last_chroma_intra_mode = TM_PRED;
@@ -798,6 +791,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->last_chroma_intra_mode = H_PRED;
         sf->use_rd_breakout = 1;
         sf->skip_encode_sb = 1;
+        sf->auto_mv_step_size = 1;
       }
       if (speed == 2) {
         sf->adjust_thresholds_by_speed = 1;
@@ -824,6 +818,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->using_small_partition_info = 1;
         sf->disable_splitmv =
             (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
+        sf->auto_mv_step_size = 1;
       }
       if (speed == 3) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -840,6 +835,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->use_rd_breakout = 1;
         sf->skip_encode_sb = 1;
         sf->disable_splitmv = 1;
+        sf->auto_mv_step_size = 1;
       }
       if (speed == 4) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -856,6 +852,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
                                      FLAG_SKIP_COMP_REFMISMATCH;
         sf->use_rd_breakout = 1;
         sf->optimize_coefficients = 0;
+        sf->auto_mv_step_size = 1;
         // sf->reduce_first_step_size = 1;
         // sf->reference_masking = 1;
 
@@ -1222,7 +1219,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   {
     int i;
 
-    for (i = 0; i < MAX_MB_SEGMENTS; i++)
+    for (i = 0; i < MAX_SEGMENTS; i++)
       cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
   }
 
@@ -2515,6 +2512,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   int undershoot_seen = 0;
 
   SPEED_FEATURES *sf = &cpi->sf;
+  unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
 #if RESET_FOREACH_FILTER
   int q_low0;
   int q_high0;
@@ -2587,6 +2585,24 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   // Set default state for segment based loop filter update flags
   xd->lf.mode_ref_delta_update = 0;
 
+  // Initialize cpi->mv_step_param to default based on max resolution
+  cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
+  // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
+  if (sf->auto_mv_step_size) {
+    if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) {
+      // initialize max_mv_magnitude for use in the first INTER frame
+      // after a key/intra-only frame
+      cpi->max_mv_magnitude = max_mv_def;
+    } else {
+      if (cm->show_frame)
+        // allow mv_steps to correspond to twice the max mv magnitude found
+        // in the previous frame, capped by the default max_mv_magnitude based
+        // on resolution
+        cpi->mv_step_param = vp9_init_search_range(
+            cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+      cpi->max_mv_magnitude = 0;
+    }
+  }
 
   // Set various flags etc to special state if it is a key frame
   if (cm->frame_type == KEY_FRAME) {
@@ -3444,15 +3460,24 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   cm->last_width = cm->width;
   cm->last_height = cm->height;
 
-  // Don't increment frame counters if this was an altref buffer
-  // update not a real frame
+  // reset to normal state now that we are done.
   cm->last_show_frame = cm->show_frame;
   if (cm->show_frame) {
+    // current mip will be the prev_mip for the next frame
+    MODE_INFO *temp = cm->prev_mip;
+    cm->prev_mip = cm->mip;
+    cm->mip = temp;
+
+    // update the upper left visible macroblock ptrs
+    cm->mi = cm->mip + cm->mode_info_stride + 1;
+
+    // Don't increment frame counters if this was an altref buffer
+    // update not a real frame
     ++cm->current_video_frame;
     ++cpi->frames_since_key;
   }
-
-  // reset to normal state now that we are done.
+  // restore prev_mi
+  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
 
 #if 0
   {
@@ -3470,17 +3495,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   vp9_write_yuv_rec_frame(cm);
 #endif
 
-  if (cm->show_frame) {
-    vpx_memcpy(cm->prev_mip, cm->mip,
-               cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE) *
-               sizeof(MODE_INFO));
-  } else {
-    vpx_memset(cm->prev_mip, 0,
-               cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE) *
-               sizeof(MODE_INFO));
-  }
-  // restore prev_mi
-  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
 }
 
 static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
@@ -3973,11 +3987,11 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
 }
 
 int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
-                   unsigned int cols, int delta_q[MAX_MB_SEGMENTS],
-                   int delta_lf[MAX_MB_SEGMENTS],
-                   unsigned int threshold[MAX_MB_SEGMENTS]) {
+                   unsigned int cols, int delta_q[MAX_SEGMENTS],
+                   int delta_lf[MAX_SEGMENTS],
+                   unsigned int threshold[MAX_SEGMENTS]) {
   VP9_COMP *cpi = (VP9_COMP *) comp;
-  signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
+  signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS];
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   int i;
 
@@ -3996,14 +4010,14 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
   vp9_enable_segmentation((VP9_PTR)cpi);
 
   // Set up the quan, LF and breakout threshold segment data
-  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+  for (i = 0; i < MAX_SEGMENTS; i++) {
     feature_data[SEG_LVL_ALT_Q][i] = delta_q[i];
     feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i];
     cpi->segment_encode_breakout[i] = threshold[i];
   }
 
   // Enable the loop and quant changes in the feature mask
-  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+  for (i = 0; i < MAX_SEGMENTS; i++) {
     if (delta_q[i])
       vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q);
     else
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 2c65fecd1..0798927bd 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -504,6 +504,7 @@ typedef struct VP9_COMP {
   int error_bins[1024];
 
   unsigned int max_mv_magnitude;
+  int mv_step_param;
 
   // Data used for real time conferencing mode to help determine if it would be good to update the gf
   int inter_zz_count;
@@ -513,7 +514,7 @@ typedef struct VP9_COMP {
   unsigned char *segmentation_map;
 
   // segment threashold for encode breakout
-  int  segment_encode_breakout[MAX_MB_SEGMENTS];
+  int  segment_encode_breakout[MAX_SEGMENTS];
 
   unsigned char *active_map;
   unsigned int active_map_enabled;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index d52091c70..9c6f9f8db 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -463,10 +463,8 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
   BLOCK_SIZE_TYPE bs = BLOCK_SIZE_AB4X4;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-  const int bwl = plane_block_width_log2by4(bsize, pd);
-  const int bhl = plane_block_height_log2by4(bsize, pd);
-  const int bw = 4 << bwl;
-  const int bh = 4 << bhl;
+  const int width = plane_block_width(bsize, pd);
+  const int height = plane_block_height(bsize, pd);
   int rate_sum = 0;
   int64_t dist_sum = 0;
 
@@ -485,10 +483,9 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
   } else {
     assert(0);
   }
-  assert(bs <= get_block_size(bwl, bhl));
   *out_skip = 1;
-  for (j = 0; j < bh; j+=t) {
-    for (k = 0; k < bw; k+=t) {
+  for (j = 0; j < height; j += t) {
+    for (k = 0; k < width; k += t) {
       int rate;
       int64_t dist;
       unsigned int sse;
@@ -711,8 +708,8 @@ static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
                         BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD * const xd = &x->e_mbd;
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bwl = plane_block_width_log2by4(bsize, &xd->plane[plane]);
+  const int bhl = plane_block_height_log2by4(bsize, &xd->plane[plane]);
   const int bw = 1 << bwl, bh = 1 << bhl;
   struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
     0, 0, 0, INT64_MAX, 0 };
@@ -802,8 +799,8 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
                                      BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
-  const int bwl = b_width_log2(bsize) - xd->plane[0].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[0].subsampling_y;
+  const int bwl = plane_block_width_log2by4(bsize, pd);
+  const int bhl = plane_block_height_log2by4(bsize, pd);
   const int bw = 1 << bwl, bh = 1 << bhl;
   struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
                                     0, 0, 0, ref_best_rd, 0 };
@@ -1185,8 +1182,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   ENTROPY_CONTEXT tl[2], templ[2];
   TX_TYPE tx_type = DCT_DCT;
   TX_TYPE best_tx_type = DCT_DCT;
-  int bw = 1 << b_width_log2(bsize);
-  int bh = 1 << b_height_log2(bsize);
+  int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy, block;
   DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]);
 
@@ -1212,8 +1209,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
     vpx_memcpy(tempa, ta, sizeof(ta));
     vpx_memcpy(templ, tl, sizeof(tl));
 
-    for (idy = 0; idy < bh; ++idy) {
-      for (idx = 0; idx < bw; ++idx) {
+    for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+      for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
         int64_t ssz;
 
         block = ib + idy * 2 + idx;
@@ -1270,8 +1267,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       best_tx_type = tx_type;
       vpx_memcpy(a, tempa, sizeof(tempa));
       vpx_memcpy(l, templ, sizeof(templ));
-      for (idy = 0; idy < bh; ++idy) {
-        for (idx = 0; idx < bw; ++idx) {
+      for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+        for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
           block = ib + idy * 2 + idx;
           vpx_memcpy(best_dqcoeff[idy * 2 + idx],
                      BLOCK_OFFSET(pd->dqcoeff, block, 16),
@@ -1284,8 +1281,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   if (x->skip_encode)
     return best_rd;
 
-  for (idy = 0; idy < bh; ++idy) {
-    for (idx = 0; idx < bw; ++idx) {
+  for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+    for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
       block = ib + idy * 2 + idx;
       xd->mode_info_context->bmi[block].as_mode = *best_mode;
       src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
@@ -1317,8 +1314,8 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   int i, j;
   MACROBLOCKD *const xd = &mb->e_mbd;
   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
-  int bw = 1 << b_width_log2(bsize);
-  int bh = 1 << b_height_log2(bsize);
+  int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
   int cost = 0;
   int64_t distortion = 0;
@@ -1333,8 +1330,8 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
 
   bmode_costs = mb->mbmode_cost;
 
-  for (idy = 0; idy < 2; idy += bh) {
-    for (idx = 0; idx < 2; idx += bw) {
+  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
       const int mis = xd->mode_info_stride;
       MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
       int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
@@ -1357,9 +1354,9 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
       tot_rate_y += ry;
 
       mic->bmi[i].as_mode = best_mode;
-      for (j = 1; j < bh; ++j)
+      for (j = 1; j < num_4x4_blocks_high; ++j)
         mic->bmi[i + j * 2].as_mode = best_mode;
-      for (j = 1; j < bw; ++j)
+      for (j = 1; j < num_4x4_blocks_wide; ++j)
         mic->bmi[i + j].as_mode = best_mode;
 
       if (total_rd >= best_rd)
@@ -1599,8 +1596,8 @@ static int labels2mode(MACROBLOCK *x, int i,
   MB_MODE_INFO * mbmi = &mic->mbmi;
   int cost = 0, thismvcost = 0;
   int idx, idy;
-  int bw = 1 << b_width_log2(mbmi->sb_type);
-  int bh = 1 << b_height_log2(mbmi->sb_type);
+  int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
 
   /* We have to be careful retrieving previously-encoded motion vectors.
    Ones from this macroblock have to be pulled from the BLOCKD array
@@ -1650,8 +1647,8 @@ static int labels2mode(MACROBLOCK *x, int i,
     mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
 
   x->partition_info->bmi[i].mode = m;
-  for (idy = 0; idy < bh; ++idy)
-    for (idx = 0; idx < bw; ++idx)
+  for (idy = 0; idy < num_4x4_blocks_high; ++idy)
+    for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
                  &mic->bmi[i], sizeof(mic->bmi[i]));
 
@@ -1671,10 +1668,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
-  const int bwl = plane_block_width_log2by4(bsize, &xd->plane[0]);
-  const int bhl = plane_block_height_log2by4(bsize, &xd->plane[0]);
-  const int bw = 4 << bwl;
-  const int bh = 4 << bhl;
+  const int width = plane_block_width(bsize, &xd->plane[0]);
+  const int height = plane_block_height(bsize, &xd->plane[0]);
   int idx, idy;
   const int src_stride = x->plane[0].src.stride;
   uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
@@ -1698,7 +1693,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                             xd->plane[0].dst.stride,
                             &xd->mode_info_context->bmi[i].as_mv[0],
                             &xd->scale_factor[0],
-                            bw, bh, 0 /* no avg */, &xd->subpix,
+                            width, height, 0, &xd->subpix,
                             MV_PRECISION_Q3);
 
   if (xd->mode_info_context->mbmi.ref_frame[1] > 0) {
@@ -1709,17 +1704,18 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
     vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
                               dst, xd->plane[0].dst.stride,
                               &xd->mode_info_context->bmi[i].as_mv[1],
-                              &xd->scale_factor[1], bw, bh, 1,
+                              &xd->scale_factor[1],
+                              width, height, 1,
                               &xd->subpix, MV_PRECISION_Q3);
   }
 
-  vp9_subtract_block(bh, bw, src_diff, 8,
+  vp9_subtract_block(height, width, src_diff, 8,
                      src, src_stride,
                      dst, xd->plane[0].dst.stride);
 
   k = i;
-  for (idy = 0; idy < bh / 4; ++idy) {
-    for (idx = 0; idx < bw / 4; ++idx) {
+  for (idy = 0; idy < height / 4; ++idy) {
+    for (idx = 0; idx < width / 4; ++idx) {
       int64_t ssz, rd, rd1, rd2;
 
       k += (idy * 2 + idx);
@@ -1825,8 +1821,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   int label_mv_thresh;
   int segmentyrate = 0;
   BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
-  int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   vp9_variance_fn_ptr_t *v_fn_ptr;
   ENTROPY_CONTEXT t_above[2], t_left[2];
   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
@@ -1836,7 +1832,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
   vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
 
-  v_fn_ptr = &cpi->fn_ptr[get_block_size(bwl, bhl)];
+  v_fn_ptr = &cpi->fn_ptr[bsize];
 
   // 64 makes this threshold really big effectively
   // making it so that we very rarely check mvs on
@@ -1845,8 +1841,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   label_mv_thresh = 1 * bsi->mvthresh / label_count;
 
   // Segmentation method overheads
-  for (idy = 0; idy < 2; idy += bh) {
-    for (idx = 0; idx < 2; idx += bw) {
+  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
       // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
       int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
@@ -1940,9 +1936,24 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
               if (i == 2)
                 bsi->mvp.as_int =
                 x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
-              step_param = 2;
             }
           }
+          if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
+            // Take the average of the step_params based on the last frame's
+            // max mv magnitude and the best ref mvs of the current block for
+            // the given reference.
+            if (i == 0)
+              step_param = (vp9_init_search_range(
+                  cpi, x->max_mv_context[mbmi->ref_frame[0]]) +
+                  cpi->mv_step_param) >> 1;
+            else
+              step_param = (vp9_init_search_range(
+                  cpi, MAX(abs(bsi->mvp.as_mv.row),
+                           abs(bsi->mvp.as_mv.col)) >> 3) +
+                  cpi->mv_step_param) >> 1;
+          } else {
+            step_param = cpi->mv_step_param;
+          }
 
           further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
 
@@ -2023,19 +2034,19 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                         x->mvcost, cpi);
 
         bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
-        if (bw > 1)
+        if (num_4x4_blocks_wide > 1)
           bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
               mode_mv[this_mode].as_int;
-        if (bh > 1)
+        if (num_4x4_blocks_high > 1)
           bsi->rdstat[i + 2][mode_idx].mvs[0].as_int =
               mode_mv[this_mode].as_int;
         if (mbmi->ref_frame[1] > 0) {
           bsi->rdstat[i][mode_idx].mvs[1].as_int =
               second_mode_mv[this_mode].as_int;
-          if (bw > 1)
+          if (num_4x4_blocks_wide > 1)
             bsi->rdstat[i + 1][mode_idx].mvs[1].as_int =
                 second_mode_mv[this_mode].as_int;
-          if (bh > 1)
+          if (num_4x4_blocks_high > 1)
             bsi->rdstat[i + 2][mode_idx].mvs[1].as_int =
                 second_mode_mv[this_mode].as_int;
         }
@@ -2136,11 +2147,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         return;
       }
 
-      for (j = 1; j < bh; ++j)
+      for (j = 1; j < num_4x4_blocks_high; ++j)
         vpx_memcpy(&x->partition_info->bmi[i + j * 2],
                    &x->partition_info->bmi[i],
                    sizeof(x->partition_info->bmi[i]));
-      for (j = 1; j < bw; ++j)
+      for (j = 1; j < num_4x4_blocks_wide; ++j)
         vpx_memcpy(&x->partition_info->bmi[i + j],
                    &x->partition_info->bmi[i],
                    sizeof(x->partition_info->bmi[i]));
@@ -2227,6 +2238,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
   int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
+  unsigned int max_mv = 0;
 
   uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
@@ -2236,6 +2248,8 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
   for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) {
     this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int;
 
+    max_mv = MAX(max_mv,
+                 MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
     // The list is at an end if we see 0 for a second time.
     if (!this_mv.as_int && zero_seen)
       break;
@@ -2259,6 +2273,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
 
   // Note the index of the mv that worked best in the reference list.
   x->mv_best_ref_index[ref_frame] = best_index;
+  x->max_mv_context[ref_frame] = max_mv;
 }
 
 static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
@@ -2505,12 +2520,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     // Work out the size of the first step in the mv step search.
     // 0 here is maximum length first step. 1 is MAX >> 1 etc.
     if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
-      step_param = vp9_init_search_range(cpi, cpi->max_mv_magnitude);
+      // Take the average of the step_params based on the last frame's
+      // max mv magnitude and that based on the best ref mvs of the current
+      // block for the given reference.
+      step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
+                    cpi->mv_step_param) >> 1;
     } else {
-      step_param = vp9_init_search_range(
-                     cpi, MIN(cpi->common.width, cpi->common.height));
+      step_param = cpi->mv_step_param;
     }
-
     // mvp_full.as_int = ref_mv[0].as_int;
     mvp_full.as_int =
         mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int;
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 8d5b3860c..ef84cc5c0 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -219,11 +219,11 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   int i, tile_col, mi_row, mi_col;
 
   int temporal_predictor_count[PREDICTION_PROBS][2];
-  int no_pred_segcounts[MAX_MB_SEGMENTS];
-  int t_unpred_seg_counts[MAX_MB_SEGMENTS];
+  int no_pred_segcounts[MAX_SEGMENTS];
+  int t_unpred_seg_counts[MAX_SEGMENTS];
 
-  vp9_prob no_pred_tree[MB_SEG_TREE_PROBS];
-  vp9_prob t_pred_tree[MB_SEG_TREE_PROBS];
+  vp9_prob no_pred_tree[SEG_TREE_PROBS];
+  vp9_prob t_pred_tree[SEG_TREE_PROBS];
   vp9_prob t_nopred_prob[PREDICTION_PROBS];
 
   const int mis = cm->mode_info_stride;