57 files changed, 2121 insertions, 1521 deletions
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
index 2f8dda07c..dd569d348 100644
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
@@ -11,6 +11,9 @@
 #include <stddef.h>
 #include <arm_neon.h>
 
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
 void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
@@ -22,7 +25,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h);
 
-static inline int32x4_t MULTIPLY_BY_Q0(
+static INLINE int32x4_t MULTIPLY_BY_Q0(
         int16x4_t dsrc0,
         int16x4_t dsrc1,
         int16x4_t dsrc2,
diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.c b/vp9/common/arm/neon/vp9_convolve8_neon.c
index c8704aa9c..5c555c458 100644
--- a/vp9/common/arm/neon/vp9_convolve8_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve8_neon.c
@@ -11,6 +11,9 @@
 #include <stddef.h>
 #include <arm_neon.h>
 
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
 void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
@@ -22,7 +25,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h);
 
-static inline int32x4_t MULTIPLY_BY_Q0(
+static INLINE int32x4_t MULTIPLY_BY_Q0(
         int16x4_t dsrc0,
         int16x4_t dsrc1,
         int16x4_t dsrc2,
diff --git a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
deleted file mode 100644
index 60a0d98c5..000000000
--- a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_dc_only_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
-;                            uint8_t *dst_ptr, int pitch, int stride)
-;
-; r0  int input_dc
-; r1  uint8_t *pred_ptr
-; r2  uint8_t *dst_ptr
-; r3  int pitch
-; sp  int stride
-
-|vp9_dc_only_idct_add_neon| PROC
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    mul              r0, r0, r12               ; input_dc * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; ROUND_POWER_OF_TWO(out, 4)
-    add              r0, r0, #8                ; + (1 <<((4) - 1))
-    asr              r0, r0, #4                ; >> 4
-
-    vdup.16         q0, r0;                   ; duplicate a1
-    ldr              r12, [sp]                 ; load stride
-
-    vld1.32         {d2[0]}, [r1], r3
-    vld1.32         {d2[1]}, [r1], r3
-    vld1.32         {d4[0]}, [r1], r3
-    vld1.32         {d4[1]}, [r1]
-
-    vaddw.u8        q1, q0, d2                ; a1 + pred_ptr[c]
-    vaddw.u8        q2, q0, d4
-
-    vqmovun.s16     d2, q1                    ; clip_pixel
-    vqmovun.s16     d4, q2
-
-    vst1.32         {d2[0]}, [r2], r12
-    vst1.32         {d2[1]}, [r2], r12
-    vst1.32         {d4[0]}, [r2], r12
-    vst1.32         {d4[1]}, [r2]
-
-    bx               lr
-    ENDP             ; |vp9_dc_only_idct_add_neon|
-
-    END
diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
index 68d7cccc0..5fa3f5c01 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vpx_config.h"
+
 static int16_t cospi_2_64 = 16305;
 static int16_t cospi_4_64 = 16069;
 static int16_t cospi_6_64 = 15679;
@@ -26,7 +28,7 @@ static int16_t cospi_26_64 = 4756;
 static int16_t cospi_28_64 = 3196;
 static int16_t cospi_30_64 = 1606;
 
-static inline void TRANSPOSE8X8(
+static INLINE void TRANSPOSE8X8(
         int16x8_t *q8s16,
         int16x8_t *q9s16,
         int16x8_t *q10s16,
diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
index 1bfee22b2..d0e4b4f40 100644
--- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
+++ b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
@@ -11,7 +11,9 @@
 #include <arm_neon.h>
 #include "vp9/common/vp9_idct.h"
 
-static inline void LD_16x8(
+#include "./vpx_config.h"
+
+static INLINE void LD_16x8(
         uint8_t *d,
         int d_stride,
         uint8x16_t *q8u8,
@@ -40,7 +42,7 @@ static inline void LD_16x8(
     return;
 }
 
-static inline void ADD_DIFF_16x8(
+static INLINE void ADD_DIFF_16x8(
         uint8x16_t qdiffu8,
         uint8x16_t *q8u8,
         uint8x16_t *q9u8,
@@ -61,7 +63,7 @@ static inline void ADD_DIFF_16x8(
     return;
 }
 
-static inline void SUB_DIFF_16x8(
+static INLINE void SUB_DIFF_16x8(
         uint8x16_t qdiffu8,
         uint8x16_t *q8u8,
         uint8x16_t *q9u8,
@@ -82,7 +84,7 @@ static inline void SUB_DIFF_16x8(
     return;
 }
 
-static inline void ST_16x8(
+static INLINE void ST_16x8(
         uint8_t *d,
         int d_stride,
         uint8x16_t *q8u8,
diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
index 53f721b44..309bdf8d7 100644
--- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
+++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vpx_config.h"
+
 static int16_t cospi_1_64 = 16364;
 static int16_t cospi_2_64 = 16305;
 static int16_t cospi_3_64 = 16207;
@@ -57,7 +59,7 @@ static int16_t cospi_31_64 = 804;
 #define  STORE_COMBINE_CENTER_RESULTS(r10, r9) \
        __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
                                       q6s16, q7s16, q8s16, q9s16);
-static inline void __STORE_COMBINE_CENTER_RESULTS(
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(
         uint8_t *p1,
         uint8_t *p2,
         int stride,
@@ -105,7 +107,7 @@ static inline void __STORE_COMBINE_CENTER_RESULTS(
 #define  STORE_COMBINE_EXTREME_RESULTS(r7, r6); \
        __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
                                       q4s16, q5s16, q6s16, q7s16);
-static inline void __STORE_COMBINE_EXTREME_RESULTS(
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
         uint8_t *p1,
         uint8_t *p2,
         int stride,
@@ -152,7 +154,7 @@ static inline void __STORE_COMBINE_EXTREME_RESULTS(
 
 #define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
         DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
-static inline void DO_BUTTERFLY(
+static INLINE void DO_BUTTERFLY(
         int16x8_t q14s16,
         int16x8_t q13s16,
         int16_t first_const,
@@ -194,7 +196,7 @@ static inline void DO_BUTTERFLY(
     return;
 }
 
-static inline void idct32_transpose_pair(
+static INLINE void idct32_transpose_pair(
         int16_t *input,
         int16_t *t_buf) {
     int16_t *in;
@@ -288,7 +290,7 @@ static inline void idct32_transpose_pair(
     return;
 }
 
-static inline void idct32_bands_end_1st_pass(
+static INLINE void idct32_bands_end_1st_pass(
         int16_t *out,
         int16x8_t q2s16,
         int16x8_t q3s16,
@@ -383,7 +385,7 @@ static inline void idct32_bands_end_1st_pass(
     return;
 }
 
-static inline void idct32_bands_end_2nd_pass(
+static INLINE void idct32_bands_end_2nd_pass(
         int16_t *out,
         uint8_t *dest,
         int stride,
diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
index 50587f6bc..2b3c1ce60 100644
--- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
+++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vpx_config.h"
+
 static int16_t cospi_4_64 = 16069;
 static int16_t cospi_8_64 = 15137;
 static int16_t cospi_12_64 = 13623;
@@ -18,7 +20,7 @@ static int16_t cospi_20_64 = 9102;
 static int16_t cospi_24_64 = 6270;
 static int16_t cospi_28_64 = 3196;
 
-static inline void TRANSPOSE8X8(
+static INLINE void TRANSPOSE8X8(
         int16x8_t *q8s16,
         int16x8_t *q9s16,
         int16x8_t *q10s16,
@@ -87,7 +89,7 @@ static inline void TRANSPOSE8X8(
     return;
 }
 
-static inline void IDCT8x8_1D(
+static INLINE void IDCT8x8_1D(
         int16x8_t *q8s16,
         int16x8_t *q9s16,
         int16x8_t *q10s16,
diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
deleted file mode 100644
index 2f326e24c..000000000
--- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_iht4x4_16_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
-    ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
-    ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
-    ; into d16-d19 registers. This macro will touch q10- q15 registers and use
-    ; them as buffer during calculation.
-    MACRO
-    IDCT4x4_1D
-    ; stage 1
-    vadd.s16    d23, d16, d18   ; (input[0] + input[2])
-    vsub.s16    d24, d16, d18   ; (input[0] - input[2])
-
-    vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64
-    vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64
-    vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64
-    vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q10, #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16    q8,  q13, q14
-    vsub.s16    q9,  q13, q14
-    vswp        d18, d19
-    MEND
-
-    ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
-    ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
-    ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
-    ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
-    ; q14,q15 registers and use them as buffer during calculation.
-    MACRO
-    IADST4x4_1D
-    vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0
-    vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0
-    vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1
-    vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2
-    vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2
-    vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit
-    vaddw.s16   q15, q15, d19   ; x0 + x3
-    vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3
-    vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2
-    vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3
-
-    vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5
-    vadd.s32    q10, q10, q8
-    vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6
-    vdup.32     q8, r0          ; duplicate sinpi_3_9
-    vsub.s32    q11, q11, q9
-    vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7
-
-    vadd.s32    q13, q10, q12   ; s0 = x0 + x3
-    vadd.s32    q10, q10, q11   ; x0 + x1
-    vadd.s32    q14, q11, q12   ; s1 = x1 + x3
-    vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d16, q13, #14
-    vqrshrn.s32 d17, q14, #14
-    vqrshrn.s32 d18, q15, #14
-    vqrshrn.s32 d19, q10, #14
-    MEND
-
-    ; Generate cosine constants in d6 - d8 for the IDCT
-    MACRO
-    GENERATE_COSINE_CONSTANTS
-    ; cospi_8_64 = 15137 = 0x3b21
-    mov         r0, #0x3b00
-    add         r0, #0x21
-    ; cospi_16_64 = 11585 = 0x2d41
-    mov         r3, #0x2d00
-    add         r3, #0x41
-    ; cospi_24_64 = 6270 = 0x187e
-    mov         r12, #0x1800
-    add         r12, #0x7e
-
-    ; generate constant vectors
-    vdup.16     d0, r0          ; duplicate cospi_8_64
-    vdup.16     d1, r3          ; duplicate cospi_16_64
-    vdup.16     d2, r12         ; duplicate cospi_24_64
-    MEND
-
-    ; Generate sine constants in d1 - d4 for the IADST.
-    MACRO
-    GENERATE_SINE_CONSTANTS
-    ; sinpi_1_9 = 5283 = 0x14A3
-    mov         r0, #0x1400
-    add         r0, #0xa3
-    ; sinpi_2_9 = 9929 = 0x26C9
-    mov         r3, #0x2600
-    add         r3, #0xc9
-    ; sinpi_4_9 = 15212 = 0x3B6C
-    mov         r12, #0x3b00
-    add         r12, #0x6c
-
-    ; generate constant vectors
-    vdup.16     d3, r0          ; duplicate sinpi_1_9
-
-    ; sinpi_3_9 = 13377 = 0x3441
-    mov         r0, #0x3400
-    add         r0, #0x41
-
-    vdup.16     d4, r3          ; duplicate sinpi_2_9
-    vdup.16     d5, r12         ; duplicate sinpi_4_9
-    vdup.16     q3, r0          ; duplicate sinpi_3_9
-    MEND
-
-    ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
-    MACRO
-    TRANSPOSE4X4
-    vtrn.16     d16, d17
-    vtrn.16     d18, d19
-    vtrn.32     q8, q9
-    MEND
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
-;                               int dest_stride, int tx_type)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride
-; r3  int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht4x4_16_add_neon| PROC
-
-    ; load the inputs into d16-d19
-    vld1.s16    {q8,q9}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE4X4
-
-    ; decide the type of transform
-    cmp         r3, #2
-    beq         idct_iadst
-    cmp         r3, #3
-    beq         iadst_iadst
-
-iadst_idct
-    ; generate constants
-    GENERATE_COSINE_CONSTANTS
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IDCT4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IADST4x4_1D
-
-    b end_vp9_iht4x4_16_add_neon
-
-idct_iadst
-    ; generate constants
-    GENERATE_COSINE_CONSTANTS
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IADST4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IDCT4x4_1D
-
-    b end_vp9_iht4x4_16_add_neon
-
-iadst_iadst
-    ; generate constants
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IADST4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IADST4x4_1D
-
-end_vp9_iht4x4_16_add_neon
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
-    vrshr.s16   q8, q8, #4
-    vrshr.s16   q9, q9, #4
-
-    vld1.32     {d26[0]}, [r1], r2
-    vld1.32     {d26[1]}, [r1], r2
-    vld1.32     {d27[0]}, [r1], r2
-    vld1.32     {d27[1]}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
-    vaddw.u8    q8, q8, d26
-    vaddw.u8    q9, q9, d27
-
-    ; clip_pixel
-    vqmovun.s16 d26, q8
-    vqmovun.s16 d27, q9
-
-    ; do the stores in reverse order with negative post-increment, by changing
-    ; the sign of the stride
-    rsb         r2, r2, #0
-    vst1.32     {d27[1]}, [r1], r2
-    vst1.32     {d27[0]}, [r1], r2
-    vst1.32     {d26[1]}, [r1], r2
-    vst1.32     {d26[0]}, [r1]  ; no post-increment
-    bx          lr
-    ENDP  ; |vp9_iht4x4_16_add_neon|
-
-    END
diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
new file mode 100644
index 000000000..1761fada2
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -0,0 +1,248 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t sinpi_1_9 = 0x14a3;
+static int16_t sinpi_2_9 = 0x26c9;
+static int16_t sinpi_3_9 = 0x3441;
+static int16_t sinpi_4_9 = 0x3b6c;
+static int16_t cospi_8_64 = 0x3b21;
+static int16_t cospi_16_64 = 0x2d41;
+static int16_t cospi_24_64 = 0x187e;
+
+static INLINE void TRANSPOSE4X4(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16) {
+    int32x4_t q8s32, q9s32;
+    int16x4x2_t d0x2s16, d1x2s16;
+    int32x4x2_t q0x2s32;
+
+    d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+    d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+
+    q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+    q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+    q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+    *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+    *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+    return;
+}
+
+static INLINE void GENERATE_COSINE_CONSTANTS(
+        int16x4_t *d0s16,
+        int16x4_t *d1s16,
+        int16x4_t *d2s16) {
+    *d0s16 = vdup_n_s16(cospi_8_64);
+    *d1s16 = vdup_n_s16(cospi_16_64);
+    *d2s16 = vdup_n_s16(cospi_24_64);
+    return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(
+        int16x4_t *d3s16,
+        int16x4_t *d4s16,
+        int16x4_t *d5s16,
+        int16x8_t *q3s16) {
+    *d3s16 = vdup_n_s16(sinpi_1_9);
+    *d4s16 = vdup_n_s16(sinpi_2_9);
+    *q3s16 = vdupq_n_s16(sinpi_3_9);
+    *d5s16 = vdup_n_s16(sinpi_4_9);
+    return;
+}
+
+static INLINE void IDCT4x4_1D(
+        int16x4_t *d0s16,
+        int16x4_t *d1s16,
+        int16x4_t *d2s16,
+        int16x8_t *q8s16,
+        int16x8_t *q9s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+    int16x4_t d26s16, d27s16, d28s16, d29s16;
+    int32x4_t q10s32, q13s32, q14s32, q15s32;
+    int16x8_t q13s16, q14s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+
+    q15s32 = vmull_s16(d17s16, *d2s16);
+    q10s32 = vmull_s16(d17s16, *d0s16);
+    q13s32 = vmull_s16(d23s16, *d1s16);
+    q14s32 = vmull_s16(d24s16, *d1s16);
+    q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
+    q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
+
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q10s32, 14);
+
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+    *q8s16 = vaddq_s16(q13s16, q14s16);
+    *q9s16 = vsubq_s16(q13s16, q14s16);
+    *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+                          vget_low_s16(*q9s16));  // vswp
+    return;
+}
+
+static INLINE void IADST4x4_1D(
+        int16x4_t *d3s16,
+        int16x4_t *d4s16,
+        int16x4_t *d5s16,
+        int16x8_t *q3s16,
+        int16x8_t *q8s16,
+        int16x8_t *q9s16) {
+    int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+    int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+    d6s16 = vget_low_s16(*q3s16);
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+
+    q10s32 = vmull_s16(*d3s16, d16s16);
+    q11s32 = vmull_s16(*d4s16, d16s16);
+    q12s32 = vmull_s16(d6s16, d17s16);
+    q13s32 = vmull_s16(*d5s16, d18s16);
+    q14s32 = vmull_s16(*d3s16, d18s16);
+    q15s32 = vmovl_s16(d16s16);
+    q15s32 = vaddw_s16(q15s32, d19s16);
+    q8s32  = vmull_s16(*d4s16, d19s16);
+    q15s32 = vsubw_s16(q15s32, d18s16);
+    q9s32  = vmull_s16(*d5s16, d19s16);
+
+    q10s32 = vaddq_s32(q10s32, q13s32);
+    q10s32 = vaddq_s32(q10s32, q8s32);
+    q11s32 = vsubq_s32(q11s32, q14s32);
+    q8s32  = vdupq_n_s32(sinpi_3_9);
+    q11s32 = vsubq_s32(q11s32, q9s32);
+    q15s32 = vmulq_s32(q15s32, q8s32);
+
+    q13s32 = vaddq_s32(q10s32, q12s32);
+    q10s32 = vaddq_s32(q10s32, q11s32);
+    q14s32 = vaddq_s32(q11s32, q12s32);
+    q10s32 = vsubq_s32(q10s32, q12s32);
+
+    d16s16 = vqrshrn_n_s32(q13s32, 14);
+    d17s16 = vqrshrn_n_s32(q14s32, 14);
+    d18s16 = vqrshrn_n_s32(q15s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+    *q8s16 = vcombine_s16(d16s16, d17s16);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+    return;
+}
+
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    uint8x8_t d26u8, d27u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+    uint32x2_t d26u32, d27u32;
+    int16x8_t q3s16, q8s16, q9s16;
+    uint16x8_t q8u16, q9u16;
+
+    d26u32 = d27u32 = vdup_n_u32(0);
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+
+    TRANSPOSE4X4(&q8s16, &q9s16);
+
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C
+        vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+        return;
+        break;
+      case 1:  // iadst_idct
+        // generate constants
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      case 2:  // idct_iadst
+        // generate constantsyy
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+        break;
+      case 3:  // iadst_iadst
+        // generate constants
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      default:  // iadst_idct
+        assert(0);
+        break;
+    }
+
+    q8s16 = vrshrq_n_s16(q8s16, 4);
+    q9s16 = vrshrq_n_s16(q9s16, 4);
+
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+    dest += dest_stride;
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+    d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+    return;
+}
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
deleted file mode 100644
index b41f5661b..000000000
--- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ /dev/null
@@ -1,698 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_iht8x8_64_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Generate IADST constants in r0 - r12 for the IADST.
-    MACRO
-    GENERATE_IADST_CONSTANTS
-    ; generate  cospi_2_64  = 16305
-    mov             r0, #0x3f00
-    add             r0, #0xb1
-
-    ; generate cospi_30_64 = 1606
-    mov             r1, #0x600
-    add             r1, #0x46
-
-    ; generate cospi_10_64 = 14449
-    mov             r2, #0x3800
-    add             r2, #0x71
-
-    ; generate cospi_22_64 = 7723
-    mov             r3, #0x1e00
-    add             r3, #0x2b
-
-    ; generate cospi_18_64 = 10394
-    mov             r4, #0x2800
-    add             r4, #0x9a
-
-    ; generate cospi_14_64 = 12665
-    mov             r5, #0x3100
-    add             r5, #0x79
-
-    ; generate cospi_26_64 = 4756
-    mov             r6, #0x1200
-    add             r6, #0x94
-
-    ; generate cospi_6_64  = 15679
-    mov             r7, #0x3d00
-    add             r7, #0x3f
-
-    ; generate cospi_8_64  = 15137
-    mov             r8, #0x3b00
-    add             r8, #0x21
-
-    ; generate cospi_24_64 = 6270
-    mov             r9, #0x1800
-    add             r9, #0x7e
-
-    ; generate 0
-    mov             r10, #0
-
-    ; generate  cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
-    MEND
-
-    ; Generate IDCT constants in r3 - r9 for the IDCT.
-    MACRO
-    GENERATE_IDCT_CONSTANTS
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-    MEND
-
-    ; Transpose a 8x8 16bits data matrix. Datas are loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are
-    ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
-    ; will be stored back into q8-q15 registers. This macro will touch q0-q7
-    ; registers and use them as buffer during calculation.
-    MACRO
-    IDCT8x8_1D
-    ; stage 1
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r4                    ; duplicate cospi_4_64
-    vdup.16         d2, r5                    ; duplicate cospi_12_64
-    vdup.16         d3, r6                    ; duplicate cospi_20_64
-
-    ; input[1] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q5, d26, d2
-    vmull.s16       q6, d27, d2
-
-    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q5, d22, d3
-    vmlsl.s16       q6, d23, d3
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; input[1] * cospi_4_64
-    vmull.s16       q2, d18, d1
-    vmull.s16       q3, d19, d1
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q13, d27, d3
-
-    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vmlal.s16       q2, d30, d0
-    vmlal.s16       q3, d31, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q13, d23, d2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d14, q2, #14              ; >> 14
-    vqrshrn.s32     d15, q3, #14              ; >> 14
-
-    ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q13, #14             ; >> 14
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q13, d16, d0
-    vmull.s16       q15, d17, d0
-
-    ; (input[0] + input[2]) * cospi_16_64
-    vmlal.s16       q2,  d24, d0
-    vmlal.s16       q3, d25, d0
-
-    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16       q13, d24, d0
-    vmlsl.s16       q15, d25, d0
-
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d18, q2, #14              ; >> 14
-    vqrshrn.s32     d19, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d22, q13, #14             ; >> 14
-    vqrshrn.s32     d23, q15, #14             ; >> 14
-
-    ; input[1] * cospi_24_64
-    vmull.s16       q2, d20, d0
-    vmull.s16       q3, d21, d0
-
-    ; input[1] * cospi_8_64
-    vmull.s16       q8, d20, d1
-    vmull.s16       q12, d21, d1
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q2, d28, d1
-    vmlsl.s16       q3, d29, d1
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q8, d28, d0
-    vmlal.s16       q12, d29, d0
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d26, q2, #14              ; >> 14
-    vqrshrn.s32     d27, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d30, q8, #14              ; >> 14
-    vqrshrn.s32     d31, q12, #14             ; >> 14
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14             ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-    MEND
-
-    ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which
-    ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
-    ; output will be stored back into q8-q15 registers. This macro will touch
-    ; q0 - q7 registers and use them as buffer during calculation.
-    MACRO
-    IADST8X8_1D
-    vdup.16         d14, r0                   ; duplicate cospi_2_64
-    vdup.16         d15, r1                   ; duplicate cospi_30_64
-
-    ; cospi_2_64  * x0
-    vmull.s16       q1, d30, d14
-    vmull.s16       q2, d31, d14
-
-    ; cospi_30_64 * x0
-    vmull.s16       q3, d30, d15
-    vmull.s16       q4, d31, d15
-
-    vdup.16         d30, r4                   ; duplicate cospi_18_64
-    vdup.16         d31, r5                   ; duplicate cospi_14_64
-
-    ; s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-    vmlal.s16       q1, d16, d15
-    vmlal.s16       q2, d17, d15
-
-    ; s1 = cospi_30_64 * x0 - cospi_2_64  * x1
-    vmlsl.s16       q3, d16, d14
-    vmlsl.s16       q4, d17, d14
-
-    ; cospi_18_64 * x4
-    vmull.s16       q5, d22, d30
-    vmull.s16       q6, d23, d30
-
-    ; cospi_14_64 * x4
-    vmull.s16       q7, d22, d31
-    vmull.s16       q8, d23, d31
-
-    ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-    vmlal.s16       q5, d24, d31
-    vmlal.s16       q6, d25, d31
-
-    ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
-    vmlsl.s16       q7, d24, d30
-    vmlsl.s16       q8, d25, d30
-
-    ; (s0 + s4)
-    vadd.s32        q11, q1, q5
-    vadd.s32        q12, q2, q6
-
-    vdup.16         d0, r2                   ; duplicate cospi_10_64
-    vdup.16         d1, r3                   ; duplicate cospi_22_64
-
-    ; (s0 - s4)
-    vsub.s32        q1, q1, q5
-    vsub.s32        q2, q2, q6
-
-    ; x0 = dct_const_round_shift(s0 + s4);
-    vqrshrn.s32     d22, q11, #14             ; >> 14
-    vqrshrn.s32     d23, q12, #14             ; >> 14
-
-    ; (s1 + s5)
-    vadd.s32        q12, q3, q7
-    vadd.s32        q15, q4, q8
-
-    ; (s1 - s5)
-    vsub.s32        q3, q3, q7
-    vsub.s32        q4, q4, q8
-
-    ; x4 = dct_const_round_shift(s0 - s4);
-    vqrshrn.s32     d2, q1, #14               ; >> 14
-    vqrshrn.s32     d3, q2, #14               ; >> 14
-
-    ; x1 = dct_const_round_shift(s1 + s5);
-    vqrshrn.s32     d24, q12, #14             ; >> 14
-    vqrshrn.s32     d25, q15, #14             ; >> 14
-
-    ; x5 = dct_const_round_shift(s1 - s5);
-    vqrshrn.s32     d6, q3, #14               ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; cospi_10_64 * x2
-    vmull.s16       q4, d26, d0
-    vmull.s16       q5, d27, d0
-
-    ; cospi_22_64 * x2
-    vmull.s16       q2, d26, d1
-    vmull.s16       q6, d27, d1
-
-    vdup.16         d30, r6                   ; duplicate cospi_26_64
-    vdup.16         d31, r7                   ; duplicate cospi_6_64
-
-    ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-    vmlal.s16       q4, d20, d1
-    vmlal.s16       q5, d21, d1
-
-    ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-    vmlsl.s16       q2, d20, d0
-    vmlsl.s16       q6, d21, d0
-
-    ; cospi_26_64 * x6
-    vmull.s16       q0, d18, d30
-    vmull.s16       q13, d19, d30
-
-    ; s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-    vmlal.s16       q0, d28, d31
-    vmlal.s16       q13, d29, d31
-
-    ; cospi_6_64  * x6
-    vmull.s16       q10, d18, d31
-    vmull.s16       q9, d19, d31
-
-    ; s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-    vmlsl.s16       q10, d28, d30
-    vmlsl.s16       q9, d29, d30
-
-    ; (s3 + s7)
-    vadd.s32        q14, q2, q10
-    vadd.s32        q15, q6, q9
-
-    ; (s3 - s7)
-    vsub.s32        q2, q2, q10
-    vsub.s32        q6, q6, q9
-
-    ; x3 = dct_const_round_shift(s3 + s7);
-    vqrshrn.s32     d28, q14, #14             ; >> 14
-    vqrshrn.s32     d29, q15, #14             ; >> 14
-
-    ; x7 = dct_const_round_shift(s3 - s7);
-    vqrshrn.s32     d4, q2, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; (s2 + s6)
-    vadd.s32        q9, q4, q0
-    vadd.s32        q10, q5, q13
-
-    ; (s2 - s6)
-    vsub.s32        q4, q4, q0
-    vsub.s32        q5, q5, q13
-
-    vdup.16         d30, r8                   ; duplicate cospi_8_64
-    vdup.16         d31, r9                   ; duplicate cospi_24_64
-
-    ; x2 = dct_const_round_shift(s2 + s6);
-    vqrshrn.s32     d18, q9, #14              ; >> 14
-    vqrshrn.s32     d19, q10, #14             ; >> 14
-
-    ; x6 = dct_const_round_shift(s2 - s6);
-    vqrshrn.s32     d8, q4, #14               ; >> 14
-    vqrshrn.s32     d9, q5, #14               ; >> 14
-
-    ; cospi_8_64  * x4
-    vmull.s16       q5, d2, d30
-    vmull.s16       q6, d3, d30
-
-    ; cospi_24_64 * x4
-    vmull.s16       q7, d2, d31
-    vmull.s16       q0, d3, d31
-
-    ; s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
-    vmlal.s16       q5, d6, d31
-    vmlal.s16       q6, d7, d31
-
-    ; s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
-    vmlsl.s16       q7, d6, d30
-    vmlsl.s16       q0, d7, d30
-
-    ; cospi_8_64  * x7
-    vmull.s16       q1, d4, d30
-    vmull.s16       q3, d5, d30
-
-    ; cospi_24_64 * x7
-    vmull.s16       q10, d4, d31
-    vmull.s16       q2, d5, d31
-
-    ; s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
-    vmlsl.s16       q1, d8, d31
-    vmlsl.s16       q3, d9, d31
-
-    ; s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
-    vmlal.s16       q10, d8, d30
-    vmlal.s16       q2, d9, d30
-
-    vadd.s16        q8, q11, q9               ; x0 = s0 + s2;
-
-    vsub.s16        q11, q11, q9              ; x2 = s0 - s2;
-
-    vadd.s16        q4, q12, q14              ; x1 = s1 + s3;
-
-    vsub.s16        q12, q12, q14             ; x3 = s1 - s3;
-
-    ; (s4 + s6)
-    vadd.s32        q14, q5, q1
-    vadd.s32        q15, q6, q3
-
-    ; (s4 - s6)
-    vsub.s32        q5, q5, q1
-    vsub.s32        q6, q6, q3
-
-    ; x4 = dct_const_round_shift(s4 + s6);
-    vqrshrn.s32     d18, q14, #14             ; >> 14
-    vqrshrn.s32     d19, q15, #14             ; >> 14
-
-    ; x6 = dct_const_round_shift(s4 - s6);
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; (s5 + s7)
-    vadd.s32        q1, q7, q10
-    vadd.s32        q3, q0, q2
-
-    ; (s5 - s7))
-    vsub.s32        q7, q7, q10
-    vsub.s32        q0, q0, q2
-
-    ; x5 = dct_const_round_shift(s5 + s7);
-    vqrshrn.s32     d28, q1, #14               ; >> 14
-    vqrshrn.s32     d29, q3, #14               ; >> 14
-
-    ; x7 = dct_const_round_shift(s5 - s7);
-    vqrshrn.s32     d14, q7, #14              ; >> 14
-    vqrshrn.s32     d15, q0, #14              ; >> 14
-
-    vdup.16         d30, r12                  ; duplicate cospi_16_64
-
-    ; cospi_16_64 * x2
-    vmull.s16       q2, d22, d30
-    vmull.s16       q3, d23, d30
-
-    ; cospi_6_64  * x6
-    vmull.s16       q13, d22, d30
-    vmull.s16       q1, d23, d30
-
-    ; cospi_16_64 * x2 + cospi_16_64  * x3;
-    vmlal.s16       q2, d24, d30
-    vmlal.s16       q3, d25, d30
-
-    ; cospi_16_64 * x2 - cospi_16_64  * x3;
-    vmlsl.s16       q13, d24, d30
-    vmlsl.s16       q1, d25, d30
-
-    ; x2 = dct_const_round_shift(s2);
-    vqrshrn.s32     d4, q2, #14               ; >> 14
-    vqrshrn.s32     d5, q3, #14               ; >> 14
-
-    ;x3 = dct_const_round_shift(s3);
-    vqrshrn.s32     d24, q13, #14             ; >> 14
-    vqrshrn.s32     d25, q1, #14              ; >> 14
-
-    ; cospi_16_64 * x6
-    vmull.s16       q13, d10, d30
-    vmull.s16       q1, d11, d30
-
-    ; cospi_6_64  * x6
-    vmull.s16       q11, d10, d30
-    vmull.s16       q0, d11, d30
-
-    ; cospi_16_64 * x6 + cospi_16_64  * x7;
-    vmlal.s16       q13, d14, d30
-    vmlal.s16       q1, d15, d30
-
-    ; cospi_16_64 * x6 - cospi_16_64  * x7;
-    vmlsl.s16       q11, d14, d30
-    vmlsl.s16       q0, d15, d30
-
-    ; x6 = dct_const_round_shift(s6);
-    vqrshrn.s32     d20, q13, #14             ; >> 14
-    vqrshrn.s32     d21, q1, #14              ; >> 14
-
-    ;x7 = dct_const_round_shift(s7);
-    vqrshrn.s32     d12, q11, #14             ; >> 14
-    vqrshrn.s32     d13, q0, #14              ; >> 14
-
-    vdup.16         q5, r10                   ; duplicate 0
-
-    vsub.s16        q9, q5, q9                ; output[1] = -x4;
-    vsub.s16        q11, q5, q2               ; output[3] = -x2;
-    vsub.s16        q13, q5, q6               ; output[5] = -x7;
-    vsub.s16        q15, q5, q4               ; output[7] = -x1;
-    MEND
-
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
-;                               int dest_stride, int tx_type)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride
-; r3  int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht8x8_64_add_neon| PROC
-
-    ; load the inputs into d16-d19
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    push            {r0-r10}
-    vpush           {d8-d15}
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; decide the type of transform
-    cmp         r3, #2
-    beq         idct_iadst
-    cmp         r3, #3
-    beq         iadst_iadst
-
-iadst_idct
-    ; generate IDCT constants
-    GENERATE_IDCT_CONSTANTS
-
-    ; first transform rows
-    IDCT8x8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; then transform columns
-    IADST8X8_1D
-
-    b end_vp9_iht8x8_64_add_neon
-
-idct_iadst
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; first transform rows
-    IADST8X8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; generate IDCT constants
-    GENERATE_IDCT_CONSTANTS
-
-    ; then transform columns
-    IDCT8x8_1D
-
-    b end_vp9_iht8x8_64_add_neon
-
-iadst_iadst
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; first transform rows
-    IADST8X8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; then transform columns
-    IADST8X8_1D
-
-end_vp9_iht8x8_64_add_neon
-    vpop           {d8-d15}
-    pop            {r0-r10}
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-    bx          lr
-    ENDP  ; |vp9_iht8x8_64_add_neon|
-
-    END
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
new file mode 100644
index 000000000..04b342c3d
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -0,0 +1,624 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
+static INLINE void TRANSPOSE8X8(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
+static INLINE void IDCT8x8_1D(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    q2s32 = vmull_s16(d18s16, d0s16);
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d26s16, d2s16);
+    q6s32 = vmull_s16(d27s16, d2s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+    d8s16  = vqrshrn_n_s32(q2s32, 14);
+    d9s16  = vqrshrn_n_s32(q3s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q2s32 = vmull_s16(d18s16, d1s16);
+    q3s32 = vmull_s16(d19s16, d1s16);
+    q9s32 = vmull_s16(d26s16, d3s16);
+    q13s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+    d14s16 = vqrshrn_n_s32(q2s32, 14);
+    d15s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q13s32, 14);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    d0s16 = vdup_n_s16(cospi_16_64);
+
+    q2s32 = vmull_s16(d16s16, d0s16);
+    q3s32 = vmull_s16(d17s16, d0s16);
+    q13s32 = vmull_s16(d16s16, d0s16);
+    q15s32 = vmull_s16(d17s16, d0s16);
+
+    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+    d0s16 = vdup_n_s16(cospi_24_64);
+    d1s16 = vdup_n_s16(cospi_8_64);
+
+    d18s16 = vqrshrn_n_s32(q2s32, 14);
+    d19s16 = vqrshrn_n_s32(q3s32, 14);
+    d22s16 = vqrshrn_n_s32(q13s32, 14);
+    d23s16 = vqrshrn_n_s32(q15s32, 14);
+    *q9s16  = vcombine_s16(d18s16, d19s16);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q2s32 = vmull_s16(d20s16, d0s16);
+    q3s32 = vmull_s16(d21s16, d0s16);
+    q8s32 = vmull_s16(d20s16, d1s16);
+    q12s32 = vmull_s16(d21s16, d1s16);
+
+    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+    d26s16 = vqrshrn_n_s32(q2s32, 14);
+    d27s16 = vqrshrn_n_s32(q3s32, 14);
+    d30s16 = vqrshrn_n_s32(q8s32, 14);
+    d31s16 = vqrshrn_n_s32(q12s32, 14);
+    *q13s16 = vcombine_s16(d26s16, d27s16);
+    *q15s16 = vcombine_s16(d30s16, d31s16);
+
+    q0s16 = vaddq_s16(*q9s16, *q15s16);
+    q1s16 = vaddq_s16(*q11s16, *q13s16);
+    q2s16 = vsubq_s16(*q11s16, *q13s16);
+    q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+    *q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16   = vaddq_s16(q4s16, q5s16);
+    *q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16   = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q9s32  = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+
+    q9s32  = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    *q8s16  = vaddq_s16(q0s16, q7s16);
+    *q9s16  = vaddq_s16(q1s16, q6s16);
+    *q10s16 = vaddq_s16(q2s16, q5s16);
+    *q11s16 = vaddq_s16(q3s16, q4s16);
+    *q12s16 = vsubq_s16(q3s16, q4s16);
+    *q13s16 = vsubq_s16(q2s16, q5s16);
+    *q14s16 = vsubq_s16(q1s16, q6s16);
+    *q15s16 = vsubq_s16(q0s16, q7s16);
+    return;
+}
+
+static INLINE void IADST8X8_1D(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q2s16, q4s16, q5s16, q6s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    d14s16 = vdup_n_s16(cospi_2_64);
+    d15s16 = vdup_n_s16(cospi_30_64);
+
+    q1s32 = vmull_s16(d30s16, d14s16);
+    q2s32 = vmull_s16(d31s16, d14s16);
+    q3s32 = vmull_s16(d30s16, d15s16);
+    q4s32 = vmull_s16(d31s16, d15s16);
+
+    d30s16 = vdup_n_s16(cospi_18_64);
+    d31s16 = vdup_n_s16(cospi_14_64);
+
+    q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+    q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+    q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+    q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+    q5s32 = vmull_s16(d22s16, d30s16);
+    q6s32 = vmull_s16(d23s16, d30s16);
+    q7s32 = vmull_s16(d22s16, d31s16);
+    q8s32 = vmull_s16(d23s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+    q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+    q11s32 = vaddq_s32(q1s32, q5s32);
+    q12s32 = vaddq_s32(q2s32, q6s32);
+    q1s32 = vsubq_s32(q1s32, q5s32);
+    q2s32 = vsubq_s32(q2s32, q6s32);
+
+    d22s16 = vqrshrn_n_s32(q11s32, 14);
+    d23s16 = vqrshrn_n_s32(q12s32, 14);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q12s32 = vaddq_s32(q3s32, q7s32);
+    q15s32 = vaddq_s32(q4s32, q8s32);
+    q3s32 = vsubq_s32(q3s32, q7s32);
+    q4s32 = vsubq_s32(q4s32, q8s32);
+
+    d2s16  = vqrshrn_n_s32(q1s32, 14);
+    d3s16  = vqrshrn_n_s32(q2s32, 14);
+    d24s16 = vqrshrn_n_s32(q12s32, 14);
+    d25s16 = vqrshrn_n_s32(q15s32, 14);
+    d6s16  = vqrshrn_n_s32(q3s32, 14);
+    d7s16  = vqrshrn_n_s32(q4s32, 14);
+    *q12s16 = vcombine_s16(d24s16, d25s16);
+
+    d0s16 = vdup_n_s16(cospi_10_64);
+    d1s16 = vdup_n_s16(cospi_22_64);
+    q4s32 = vmull_s16(d26s16, d0s16);
+    q5s32 = vmull_s16(d27s16, d0s16);
+    q2s32 = vmull_s16(d26s16, d1s16);
+    q6s32 = vmull_s16(d27s16, d1s16);
+
+    d30s16 = vdup_n_s16(cospi_26_64);
+    d31s16 = vdup_n_s16(cospi_6_64);
+
+    q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+    q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+    q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+    q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+    q0s32 = vmull_s16(d18s16, d30s16);
+    q13s32 = vmull_s16(d19s16, d30s16);
+
+    q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+    q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+    q10s32 = vmull_s16(d18s16, d31s16);
+    q9s32 = vmull_s16(d19s16, d31s16);
+
+    q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+    q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+    q14s32 = vaddq_s32(q2s32, q10s32);
+    q15s32 = vaddq_s32(q6s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q6s32 = vsubq_s32(q6s32, q9s32);
+
+    d28s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);
+
+    q9s32 = vaddq_s32(q4s32, q0s32);
+    q10s32 = vaddq_s32(q5s32, q13s32);
+    q4s32 = vsubq_s32(q4s32, q0s32);
+    q5s32 = vsubq_s32(q5s32, q13s32);
+
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    d18s16 = vqrshrn_n_s32(q9s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+    d8s16 = vqrshrn_n_s32(q4s32, 14);
+    d9s16 = vqrshrn_n_s32(q5s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q5s32 = vmull_s16(d2s16, d30s16);
+    q6s32 = vmull_s16(d3s16, d30s16);
+    q7s32 = vmull_s16(d2s16, d31s16);
+    q0s32 = vmull_s16(d3s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+    q1s32 = vmull_s16(d4s16, d30s16);
+    q3s32 = vmull_s16(d5s16, d30s16);
+    q10s32 = vmull_s16(d4s16, d31s16);
+    q2s32 = vmull_s16(d5s16, d31s16);
+
+    q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+    q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+    q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+    q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+    *q8s16 = vaddq_s16(*q11s16, *q9s16);
+    *q11s16 = vsubq_s16(*q11s16, *q9s16);
+    q4s16 = vaddq_s16(*q12s16, *q14s16);
+    *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+    q14s32 = vaddq_s32(q5s32, q1s32);
+    q15s32 = vaddq_s32(q6s32, q3s32);
+    q5s32 = vsubq_s32(q5s32, q1s32);
+    q6s32 = vsubq_s32(q6s32, q3s32);
+
+    d18s16 = vqrshrn_n_s32(q14s32, 14);
+    d19s16 = vqrshrn_n_s32(q15s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q1s32 = vaddq_s32(q7s32, q10s32);
+    q3s32 = vaddq_s32(q0s32, q2s32);
+    q7s32 = vsubq_s32(q7s32, q10s32);
+    q0s32 = vsubq_s32(q0s32, q2s32);
+
+    d28s16 = vqrshrn_n_s32(q1s32, 14);
+    d29s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q7s32, 14);
+    d15s16 = vqrshrn_n_s32(q0s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);
+
+    d30s16 = vdup_n_s16(cospi_16_64);
+
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    q2s32 = vmull_s16(d22s16, d30s16);
+    q3s32 = vmull_s16(d23s16, d30s16);
+    q13s32 = vmull_s16(d22s16, d30s16);
+    q1s32 = vmull_s16(d23s16, d30s16);
+
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+    q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q3s32, 14);
+    d24s16 = vqrshrn_n_s32(q13s32, 14);
+    d25s16 = vqrshrn_n_s32(q1s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    *q12s16 = vcombine_s16(d24s16, d25s16);
+
+    q13s32 = vmull_s16(d10s16, d30s16);
+    q1s32 = vmull_s16(d11s16, d30s16);
+    q11s32 = vmull_s16(d10s16, d30s16);
+    q0s32 = vmull_s16(d11s16, d30s16);
+
+    q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+    q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+    q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+    d20s16 = vqrshrn_n_s32(q13s32, 14);
+    d21s16 = vqrshrn_n_s32(q1s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q0s32, 14);
+    *q10s16 = vcombine_s16(d20s16, d21s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    q5s16 = vdupq_n_s16(0);
+
+    *q9s16  = vsubq_s16(q5s16, *q9s16);
+    *q11s16 = vsubq_s16(q5s16, q2s16);
+    *q13s16 = vsubq_s16(q5s16, q6s16);
+    *q15s16 = vsubq_s16(q5s16, q4s16);
+    return;
+}
+
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    int i;
+    uint8_t *d1, *d2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    q8s16  = vld1q_s16(input);
+    q9s16  = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 8 * 2);
+    q11s16 = vld1q_s16(input + 8 * 3);
+    q12s16 = vld1q_s16(input + 8 * 4);
+    q13s16 = vld1q_s16(input + 8 * 5);
+    q14s16 = vld1q_s16(input + 8 * 6);
+    q15s16 = vld1q_s16(input + 8 * 7);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C
+        vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+        return;
+        break;
+      case 1:  // iadst_idct
+        // generate IDCT constants
+        // GENERATE_IDCT_CONSTANTS
+
+        // first transform rows
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // generate IADST constants
+        // GENERATE_IADST_CONSTANTS
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 2:  // idct_iadst
+        // generate IADST constants
+        // GENERATE_IADST_CONSTANTS
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // generate IDCT constants
+        // GENERATE_IDCT_CONSTANTS
+
+        // then transform columns
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 3:  // iadst_iadst
+        // generate IADST constants
+        // GENERATE_IADST_CONSTANTS
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      default:  // iadst_idct
+        assert(0);
+        break;
+    }
+
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    for (d1 = d2 = dest, i = 0; i < 2; i++) {
+        if (i != 0) {
+            q8s16 = q12s16;
+            q9s16 = q13s16;
+            q10s16 = q14s16;
+            q11s16 = q15s16;
+        }
+
+        d0u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d1u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d2u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d3u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+
+        q8u16  = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                          vreinterpret_u8_u64(d0u64));
+        q9u16  = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                          vreinterpret_u8_u64(d1u64));
+        q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                          vreinterpret_u8_u64(d2u64));
+        q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                          vreinterpret_u8_u64(d3u64));
+
+        d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+        d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+        d2 += dest_stride;
+    }
+    return;
+}
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 97fe02805..09f470e97 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -8,9 +8,178 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <arm_neon.h>
+
 #include "./vp9_rtcd.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
+static INLINE void vp9_loop_filter_neon_16(
+        uint8x16_t qblimit,  // blimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r) {   // q1
+    uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q2s16, q11s16;
+    uint16x8_t q4u16;
+    int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+    int8x8_t d2s8, d3s8;
+
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q3 = vabdq_u8(q9, q8);
+    q4 = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q3 = vmaxq_u8(q3, q4);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q9 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q3);
+
+    q2u8 = vabdq_u8(q5, q8);
+    q9 = vqaddq_u8(q9, q9);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q10 = vdupq_n_u8(0x80);
+    q8 = veorq_u8(q8, q10);
+    q7 = veorq_u8(q7, q10);
+    q6 = veorq_u8(q6, q10);
+    q5 = veorq_u8(q5, q10);
+
+    q2u8 = vshrq_n_u8(q2u8, 1);
+    q9 = vqaddq_u8(q9, q2u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q9 = vcgeq_u8(qblimit, q9);
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                    vreinterpretq_s8_u8(q8));
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+
+    q4u16 = vdupq_n_u16(3);
+    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+    q15u8 = vandq_u8(q15u8, q9);
+
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+    q4 = vdupq_n_u8(3);
+    q9 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2s8 = vqmovn_s16(q2s16);
+    d3s8 = vqmovn_s16(q11s16);
+    q1s8 = vcombine_s8(d2s8, d3s8);
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+
+    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q1s8 = vshrq_n_s8(q1s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+    q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+    q1s8 = vrshrq_n_s8(q1s8, 1);
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8),  q10);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+    return;
+}
+
+#if !HAVE_NEON_ASM
+void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
+                                    const uint8_t *blimit0,
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+    uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+    uint8x16_t qblimit, qlimit, qthresh;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+
+    dblimit0 = vld1_u8(blimit0);
+    dlimit0 = vld1_u8(limit0);
+    dthresh0 = vld1_u8(thresh0);
+    dblimit1 = vld1_u8(blimit1);
+    dlimit1 = vld1_u8(limit1);
+    dthresh1 = vld1_u8(thresh1);
+    qblimit = vcombine_u8(dblimit0, dblimit1);
+    qlimit = vcombine_u8(dlimit0, dlimit1);
+    qthresh = vcombine_u8(dthresh0, dthresh1);
+
+    s -= (p << 2);
+
+    q3u8 = vld1q_u8(s);
+    s += p;
+    q4u8 = vld1q_u8(s);
+    s += p;
+    q5u8 = vld1q_u8(s);
+    s += p;
+    q6u8 = vld1q_u8(s);
+    s += p;
+    q7u8 = vld1q_u8(s);
+    s += p;
+    q8u8 = vld1q_u8(s);
+    s += p;
+    q9u8 = vld1q_u8(s);
+    s += p;
+    q10u8 = vld1q_u8(s);
+
+    vp9_loop_filter_neon_16(qblimit, qlimit, qthresh,
+                            q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+                            &q5u8, &q6u8, &q7u8, &q8u8);
+
+    s -= (p * 5);
+    vst1q_u8(s, q5u8);
+    s += p;
+    vst1q_u8(s, q6u8);
+    s += p;
+    vst1q_u8(s, q7u8);
+    s += p;
+    vst1q_u8(s, q8u8);
+    return;
+}
+#endif  // !HAVE_NEON_ASM
+
 void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c
index f54d7a94b..079d26677 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c
@@ -10,7 +10,9 @@
 
 #include <arm_neon.h>
 
-static inline void vp9_loop_filter_neon(
+#include "./vpx_config.h"
+
+static INLINE void vp9_loop_filter_neon(
         uint8x8_t dblimit,    // flimit
         uint8x8_t dlimit,     // limit
         uint8x8_t dthresh,    // thresh
@@ -271,7 +273,7 @@ void vp9_lpf_vertical_4_neon(
     return;
 }
 
-static inline void vp9_mbloop_filter_neon(
+static INLINE void vp9_mbloop_filter_neon(
         uint8x8_t dblimit,   // mblimit
         uint8x8_t dlimit,    // limit
         uint8x8_t dthresh,   // thresh
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
new file mode 100644
index 000000000..d0beaa720
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -0,0 +1,473 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_v_predictor_4x4_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int i;
+    uint32x2_t d0u32 = vdup_n_u32(0);
+    (void)left;
+
+    d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+    for (i = 0; i < 4; i++, dst += y_stride)
+        vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+    return;
+}
+
+void vp9_v_predictor_8x8_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int i;
+    uint8x8_t d0u8 = vdup_n_u8(0);
+    (void)left;
+
+    d0u8 = vld1_u8(above);
+    for (i = 0; i < 8; i++, dst += y_stride)
+        vst1_u8(dst, d0u8);
+    return;
+}
+
+void vp9_v_predictor_16x16_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int i;
+    uint8x16_t q0u8 = vdupq_n_u8(0);
+    (void)left;
+
+    q0u8 = vld1q_u8(above);
+    for (i = 0; i < 16; i++, dst += y_stride)
+        vst1q_u8(dst, q0u8);
+    return;
+}
+
+void vp9_v_predictor_32x32_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int i;
+    uint8x16_t q0u8 = vdupq_n_u8(0);
+    uint8x16_t q1u8 = vdupq_n_u8(0);
+    (void)left;
+
+    q0u8 = vld1q_u8(above);
+    q1u8 = vld1q_u8(above + 16);
+    for (i = 0; i < 32; i++, dst += y_stride) {
+        vst1q_u8(dst, q0u8);
+        vst1q_u8(dst + 16, q1u8);
+    }
+    return;
+}
+
+void vp9_h_predictor_4x4_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    uint8x8_t d0u8 = vdup_n_u8(0);
+    uint32x2_t d1u32 = vdup_n_u32(0);
+    (void)above;
+
+    d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+    return;
+}
+
+void vp9_h_predictor_8x8_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    uint8x8_t d0u8 = vdup_n_u8(0);
+    uint64x1_t d1u64 = vdup_n_u64(0);
+    (void)above;
+
+    d1u64 = vld1_u64((const uint64_t *)left);
+
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+    vst1_u8(dst, d0u8);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+    vst1_u8(dst, d0u8);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+    vst1_u8(dst, d0u8);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+    vst1_u8(dst, d0u8);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+    vst1_u8(dst, d0u8);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+    vst1_u8(dst, d0u8);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+    vst1_u8(dst, d0u8);
+    dst += y_stride;
+    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+    vst1_u8(dst, d0u8);
+    return;
+}
+
+void vp9_h_predictor_16x16_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int j;
+    uint8x8_t d2u8 = vdup_n_u8(0);
+    uint8x16_t q0u8 = vdupq_n_u8(0);
+    uint8x16_t q1u8 = vdupq_n_u8(0);
+    (void)above;
+
+    q1u8 = vld1q_u8(left);
+    d2u8 = vget_low_u8(q1u8);
+    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+        q0u8 = vdupq_lane_u8(d2u8, 0);
+        vst1q_u8(dst, q0u8);
+        dst += y_stride;
+        q0u8 = vdupq_lane_u8(d2u8, 1);
+        vst1q_u8(dst, q0u8);
+        dst += y_stride;
+        q0u8 = vdupq_lane_u8(d2u8, 2);
+        vst1q_u8(dst, q0u8);
+        dst += y_stride;
+        q0u8 = vdupq_lane_u8(d2u8, 3);
+        vst1q_u8(dst, q0u8);
+        dst += y_stride;
+        q0u8 = vdupq_lane_u8(d2u8, 4);
+        vst1q_u8(dst, q0u8);
+        dst += y_stride;
+        q0u8 = vdupq_lane_u8(d2u8, 5);
+        vst1q_u8(dst, q0u8);
+        dst += y_stride;
+        q0u8 = vdupq_lane_u8(d2u8, 6);
+        vst1q_u8(dst, q0u8);
+        dst += y_stride;
+        q0u8 = vdupq_lane_u8(d2u8, 7);
+        vst1q_u8(dst, q0u8);
+        dst += y_stride;
+    }
+    return;
+}
+
+void vp9_h_predictor_32x32_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int j, k;
+    uint8x8_t d2u8 = vdup_n_u8(0);
+    uint8x16_t q0u8 = vdupq_n_u8(0);
+    uint8x16_t q1u8 = vdupq_n_u8(0);
+    (void)above;
+
+    for (k = 0; k < 2; k++, left += 16) {
+        q1u8 = vld1q_u8(left);
+        d2u8 = vget_low_u8(q1u8);
+        for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+            q0u8 = vdupq_lane_u8(d2u8, 0);
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q0u8);
+            dst += y_stride;
+            q0u8 = vdupq_lane_u8(d2u8, 1);
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q0u8);
+            dst += y_stride;
+            q0u8 = vdupq_lane_u8(d2u8, 2);
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q0u8);
+            dst += y_stride;
+            q0u8 = vdupq_lane_u8(d2u8, 3);
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q0u8);
+            dst += y_stride;
+            q0u8 = vdupq_lane_u8(d2u8, 4);
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q0u8);
+            dst += y_stride;
+            q0u8 = vdupq_lane_u8(d2u8, 5);
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q0u8);
+            dst += y_stride;
+            q0u8 = vdupq_lane_u8(d2u8, 6);
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q0u8);
+            dst += y_stride;
+            q0u8 = vdupq_lane_u8(d2u8, 7);
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q0u8);
+            dst += y_stride;
+        }
+    }
+    return;
+}
+
+void vp9_tm_predictor_4x4_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int i;
+    uint16x8_t q1u16, q3u16;
+    int16x8_t q1s16;
+    uint8x8_t d0u8 = vdup_n_u8(0);
+    uint32x2_t d2u32 = vdup_n_u32(0);
+
+    d0u8 = vdup_n_u8(above[-1]);
+    d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+    q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+    for (i = 0; i < 4; i++, dst += y_stride) {
+        q1u16 = vdupq_n_u16((uint16_t)left[i]);
+        q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+                          vreinterpretq_s16_u16(q3u16));
+        d0u8 = vqmovun_s16(q1s16);
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+    }
+    return;
+}
+
+void vp9_tm_predictor_8x8_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int j;
+    uint16x8_t q0u16, q3u16, q10u16;
+    int16x8_t q0s16;
+    uint16x4_t d20u16;
+    uint8x8_t d0u8, d2u8, d30u8;
+
+    d0u8 = vdup_n_u8(above[-1]);
+    d30u8 = vld1_u8(left);
+    d2u8 = vld1_u8(above);
+    q10u16 = vmovl_u8(d30u8);
+    q3u16 = vsubl_u8(d2u8, d0u8);
+    d20u16 = vget_low_u16(q10u16);
+    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+        q0u16 = vdupq_lane_u16(d20u16, 0);
+        q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                          vreinterpretq_s16_u16(q0u16));
+        d0u8 = vqmovun_s16(q0s16);
+        vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+        dst += y_stride;
+        q0u16 = vdupq_lane_u16(d20u16, 1);
+        q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                          vreinterpretq_s16_u16(q0u16));
+        d0u8 = vqmovun_s16(q0s16);
+        vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+        dst += y_stride;
+        q0u16 = vdupq_lane_u16(d20u16, 2);
+        q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                          vreinterpretq_s16_u16(q0u16));
+        d0u8 = vqmovun_s16(q0s16);
+        vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+        dst += y_stride;
+        q0u16 = vdupq_lane_u16(d20u16, 3);
+        q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                          vreinterpretq_s16_u16(q0u16));
+        d0u8 = vqmovun_s16(q0s16);
+        vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+        dst += y_stride;
+    }
+    return;
+}
+
+void vp9_tm_predictor_16x16_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int j, k;
+    uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+    uint8x16_t q0u8, q1u8;
+    int16x8_t q0s16, q1s16, q8s16, q11s16;
+    uint16x4_t d20u16;
+    uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+
+    q0u8 = vdupq_n_u8(above[-1]);
+    q1u8 = vld1q_u8(above);
+    q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+    q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+    for (k = 0; k < 2; k++, left += 8) {
+        d18u8 = vld1_u8(left);
+        q10u16 = vmovl_u8(d18u8);
+        d20u16 = vget_low_u16(q10u16);
+        for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+            q0u16 = vdupq_lane_u16(d20u16, 0);
+            q8u16 = vdupq_lane_u16(d20u16, 1);
+            q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                              vreinterpretq_s16_u16(q2u16));
+            q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                              vreinterpretq_s16_u16(q3u16));
+            q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                              vreinterpretq_s16_u16(q2u16));
+            q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                              vreinterpretq_s16_u16(q3u16));
+            d2u8 = vqmovun_s16(q1s16);
+            d3u8 = vqmovun_s16(q0s16);
+            d22u8 = vqmovun_s16(q11s16);
+            d23u8 = vqmovun_s16(q8s16);
+            vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+            vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+            dst += y_stride;
+            vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+            vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+            dst += y_stride;
+
+            q0u16 = vdupq_lane_u16(d20u16, 2);
+            q8u16 = vdupq_lane_u16(d20u16, 3);
+            q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                              vreinterpretq_s16_u16(q2u16));
+            q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                              vreinterpretq_s16_u16(q3u16));
+            q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                              vreinterpretq_s16_u16(q2u16));
+            q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                              vreinterpretq_s16_u16(q3u16));
+            d2u8 = vqmovun_s16(q1s16);
+            d3u8 = vqmovun_s16(q0s16);
+            d22u8 = vqmovun_s16(q11s16);
+            d23u8 = vqmovun_s16(q8s16);
+            vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+            vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+            dst += y_stride;
+            vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+            vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+            dst += y_stride;
+        }
+    }
+    return;
+}
+
+void vp9_tm_predictor_32x32_neon(
+        uint8_t *dst,
+        ptrdiff_t y_stride,
+        const uint8_t *above,
+        const uint8_t *left) {
+    int j, k;
+    uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+    uint8x16_t q0u8, q1u8, q2u8;
+    int16x8_t q12s16, q13s16, q14s16, q15s16;
+    uint16x4_t d6u16;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+
+    q0u8 = vdupq_n_u8(above[-1]);
+    q1u8 = vld1q_u8(above);
+    q2u8 = vld1q_u8(above + 16);
+    q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+    q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+    q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+    q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+    for (k = 0; k < 4; k++, left += 8) {
+        d26u8 = vld1_u8(left);
+        q3u16 = vmovl_u8(d26u8);
+        d6u16 = vget_low_u16(q3u16);
+        for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+            q0u16 = vdupq_lane_u16(d6u16, 0);
+            q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q8u16));
+            q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q9u16));
+            q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q10u16));
+            q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q11u16));
+            d0u8 = vqmovun_s16(q12s16);
+            d1u8 = vqmovun_s16(q13s16);
+            d2u8 = vqmovun_s16(q14s16);
+            d3u8 = vqmovun_s16(q15s16);
+            q0u8 = vcombine_u8(d0u8, d1u8);
+            q1u8 = vcombine_u8(d2u8, d3u8);
+            vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+            vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+            dst += y_stride;
+
+            q0u16 = vdupq_lane_u16(d6u16, 1);
+            q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q8u16));
+            q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q9u16));
+            q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q10u16));
+            q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q11u16));
+            d0u8 = vqmovun_s16(q12s16);
+            d1u8 = vqmovun_s16(q13s16);
+            d2u8 = vqmovun_s16(q14s16);
+            d3u8 = vqmovun_s16(q15s16);
+            q0u8 = vcombine_u8(d0u8, d1u8);
+            q1u8 = vcombine_u8(d2u8, d3u8);
+            vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+            vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+            dst += y_stride;
+
+            q0u16 = vdupq_lane_u16(d6u16, 2);
+            q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q8u16));
+            q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q9u16));
+            q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q10u16));
+            q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q11u16));
+            d0u8 = vqmovun_s16(q12s16);
+            d1u8 = vqmovun_s16(q13s16);
+            d2u8 = vqmovun_s16(q14s16);
+            d3u8 = vqmovun_s16(q15s16);
+            q0u8 = vcombine_u8(d0u8, d1u8);
+            q1u8 = vcombine_u8(d2u8, d3u8);
+            vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+            vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+            dst += y_stride;
+
+            q0u16 = vdupq_lane_u16(d6u16, 3);
+            q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q8u16));
+            q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q9u16));
+            q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q10u16));
+            q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                               vreinterpretq_s16_u16(q11u16));
+            d0u8 = vqmovun_s16(q12s16);
+            d1u8 = vqmovun_s16(q13s16);
+            d2u8 = vqmovun_s16(q14s16);
+            d3u8 = vqmovun_s16(q15s16);
+            q0u8 = vcombine_u8(d0u8, d1u8);
+            q1u8 = vcombine_u8(d2u8, d3u8);
+            vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+            vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+            dst += y_stride;
+        }
+    }
+    return;
+}
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
index dc9856fa8..dc9856fa8 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index cb299f9f7..2f75af575 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -44,8 +44,10 @@ void vp9_free_ref_frame_buffers(VP9_COMMON *cm) {
     vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
   }
 
+#if CONFIG_VP9_POSTPROC
   vp9_free_frame_buffer(&cm->post_proc_buffer);
   vp9_free_frame_buffer(&cm->post_proc_buffer_int);
+#endif
 }
 
 void vp9_free_context_buffers(VP9_COMMON *cm) {
@@ -110,7 +112,8 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
 #endif
-                               VP9_ENC_BORDER_IN_PIXELS) < 0)
+                               VP9_ENC_BORDER_IN_PIXELS,
+                               cm->byte_alignment) < 0)
       goto fail;
     if (cm->frame_bufs[i].mvs == NULL) {
       cm->frame_bufs[i].mvs =
@@ -126,12 +129,13 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
 
   init_frame_bufs(cm);
 
-#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
+#if CONFIG_VP9_POSTPROC
   if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
 #if CONFIG_VP9_HIGHBITDEPTH
                              cm->use_highbitdepth,
 #endif
-                             VP9_ENC_BORDER_IN_PIXELS) < 0)
+                             VP9_ENC_BORDER_IN_PIXELS,
+                             cm->byte_alignment) < 0)
     goto fail;
 #endif
 
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 7d7209c56..e7fb19fd1 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -212,6 +212,12 @@ typedef struct macroblockd {
   /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
 
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT left_seg_context[8];
+
   /* mc buffer */
   DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
 
@@ -221,17 +227,11 @@ typedef struct macroblockd {
   DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
 #endif
 
-  int lossless;
+  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
 
+  int lossless;
   int corrupted;
-
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
-
-  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
-  PARTITION_CONTEXT *above_seg_context;
-  PARTITION_CONTEXT left_seg_context[8];
 } MACROBLOCKD;
 
 static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index d7610ed28..4557e19bf 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -453,6 +453,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
   vp9_default_coef_probs(cm);
   vp9_init_mode_probs(cm->fc);
   vp9_init_mv_probs(cm);
+  cm->fc->initialized = 1;
 
   if (cm->frame_type == KEY_FRAME ||
       cm->error_resilient_mode || cm->reset_frame_context == 3) {
@@ -469,8 +470,6 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
     vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) *
                                     sizeof(*cm->prev_mip));
 
-  vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
-
   vp9_zero(cm->ref_frame_sign_bias);
 
   cm->frame_context_idx = 0;
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 6831d3f87..6db10806d 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -50,6 +50,7 @@ typedef struct frame_contexts {
   struct tx_probs tx_probs;
   vp9_prob skip_probs[SKIP_CONTEXTS];
   nmv_context nmvc;
+  int initialized;
 } FRAME_CONTEXT;
 
 typedef struct {
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 43a4fe5b9..58b2da75f 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -968,7 +968,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
       break;
   }
   // The largest loopfilter we have is 16x16 so we use the 16x16 mask
-  // for 32x32 transforms also also.
+  // for 32x32 transforms also.
   lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
   lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
   lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index f1bdc1b06..92650e954 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -210,17 +210,85 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
     return;
   }
   // No MFQE on blocks smaller than 16x16
-  if (partition == PARTITION_SPLIT && bs == BLOCK_16X16) {
+  if (bs == BLOCK_16X16) {
     partition = PARTITION_NONE;
   }
+  if (bs == BLOCK_64X64) {
+    mi_offset = 4;
+    y_offset = 32;
+    uv_offset = 16;
+  } else {
+    mi_offset = 2;
+    y_offset = 16;
+    uv_offset = 8;
+  }
   switch (partition) {
+    BLOCK_SIZE mfqe_bs, bs_tmp;
     case PARTITION_HORZ:
+      if (bs == BLOCK_64X64) {
+        mfqe_bs = BLOCK_64X32;
+        bs_tmp = BLOCK_32X32;
+      } else {
+        mfqe_bs = BLOCK_32X16;
+        bs_tmp = BLOCK_16X16;
+      }
+      if (mfqe_decision(mi, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+                   vd + uv_offset, yd_stride, uvd_stride);
+      }
+      if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+                   v + uv_offset * uv_stride, y_stride, uv_stride,
+                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+                   u + uv_offset * uv_stride + uv_offset,
+                   v + uv_offset * uv_stride + uv_offset, y_stride,
+                   uv_stride, yd + y_offset * yd_stride + y_offset,
+                   ud + uv_offset * uvd_stride + uv_offset,
+                   vd + uv_offset * uvd_stride + uv_offset,
+                   yd_stride, uvd_stride);
+      }
+      break;
     case PARTITION_VERT:
-      // If current block size is not square.
-      // Copy the block from current frame(i.e., no mfqe is done).
-      // TODO(jackychen): Rectangle blocks should also be taken into account.
-      copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
-                 yd_stride, uvd_stride, bs);
+      if (bs == BLOCK_64X64) {
+        mfqe_bs = BLOCK_32X64;
+        bs_tmp = BLOCK_32X32;
+      } else {
+        mfqe_bs = BLOCK_16X32;
+        bs_tmp = BLOCK_16X16;
+      }
+      if (mfqe_decision(mi, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+                   v + uv_offset * uv_stride, y_stride, uv_stride,
+                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+      }
+      if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+                   vd + uv_offset, yd_stride, uvd_stride);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+                   u + uv_offset * uv_stride + uv_offset,
+                   v + uv_offset * uv_stride + uv_offset, y_stride,
+                   uv_stride, yd + y_offset * yd_stride + y_offset,
+                   ud + uv_offset * uvd_stride + uv_offset,
+                   vd + uv_offset * uvd_stride + uv_offset,
+                   yd_stride, uvd_stride);
+      }
       break;
     case PARTITION_NONE:
       if (mfqe_decision(mi, cur_bs)) {
@@ -234,15 +302,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
       }
       break;
     case PARTITION_SPLIT:
-      if (bs == BLOCK_64X64) {
-        mi_offset = 4;
-        y_offset = 32;
-        uv_offset = 16;
-      } else {
-        mi_offset = 2;
-        y_offset = 16;
-        uv_offset = 8;
-      }
       // Recursion on four square partitions, e.g. if bs is 64X64,
       // then look into four 32X32 blocks in it.
       mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 55a1f86c7..ad91c10dd 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -112,8 +112,10 @@ typedef struct VP9Common {
 
   int new_fb_idx;
 
+#if CONFIG_VP9_POSTPROC
   YV12_BUFFER_CONFIG post_proc_buffer;
   YV12_BUFFER_CONFIG post_proc_buffer_int;
+#endif
 
   FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
   FRAME_TYPE frame_type;
@@ -182,7 +184,6 @@ typedef struct VP9Common {
   struct segmentation seg;
 
   // Context probabilities for reference frame prediction
-  int allow_comp_inter_inter;
   MV_REFERENCE_FRAME comp_fixed_ref;
   MV_REFERENCE_FRAME comp_var_ref[2];
   REFERENCE_MODE reference_mode;
@@ -207,6 +208,7 @@ typedef struct VP9Common {
   int frame_parallel_decoding_mode;
 
   int log2_tile_cols, log2_tile_rows;
+  int byte_alignment;
 
   // Private data associated with the frame buffer callbacks.
   void *cb_priv;
@@ -263,7 +265,7 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    xd->plane[i].dqcoeff = xd->dqcoeff[i];
+    xd->plane[i].dqcoeff = xd->dqcoeff;
     xd->above_context[i] = cm->above_context +
         i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
   }
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index e1a389132..7eac70be2 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -671,7 +671,8 @@ int vp9_post_proc_frame(struct VP9Common *cm,
 #if CONFIG_VP9_HIGHBITDEPTH
                                  cm->use_highbitdepth,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-                                 VP9_ENC_BORDER_IN_PIXELS) < 0) {
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cm->byte_alignment) < 0) {
         vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate MFQE framebuffer");
       }
@@ -683,19 +684,18 @@ int vp9_post_proc_frame(struct VP9Common *cm,
     }
   }
 
-#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
   if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
 #endif
-                               VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
+                               VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) < 0)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate post-processing buffer");
-#endif
 
   if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
-      cm->postproc_state.last_frame_valid &&
+      cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
       cm->postproc_state.last_base_qindex <= last_q_thresh &&
       cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
     vp9_mfqe(cm);
@@ -749,4 +749,4 @@ int vp9_post_proc_frame(struct VP9Common *cm,
   swap_mi_and_prev_mi(cm);
   return 0;
 }
-#endif
+#endif  // CONFIG_VP9_POSTPROC
diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c
index dc15a84ff..c777bc81f 100644
--- a/vp9/common/vp9_rtcd.c
+++ b/vp9/common/vp9_rtcd.c
@@ -16,5 +16,7 @@ void vpx_scale_rtcd(void);
 
 void vp9_rtcd() {
     vpx_scale_rtcd();
+    // TODO(JBB): Remove this once, by insuring that both the encoder and
+    // decoder setup functions are protected by once();
     once(setup_rtcd_internal);
 }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 575990bb5..d2ab875e9 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -66,8 +66,7 @@ add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, con
 specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon;
+specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_4x4/;
@@ -79,12 +78,10 @@ add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon;
+specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc";
-$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon;
+specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
@@ -108,8 +105,7 @@ add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, con
 specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon;
+specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_8x8/;
@@ -121,12 +117,10 @@ add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon;
+specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc";
-$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon;
+specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
@@ -150,8 +144,7 @@ add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, c
 specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon;
+specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_16x16/;
@@ -163,12 +156,10 @@ add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride,
 specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon;
+specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon;
+specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
@@ -192,8 +183,7 @@ add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, c
 specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc";
-$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon;
+specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_32x32/;
@@ -205,12 +195,10 @@ add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride,
 specialize qw/vp9_d153_predictor_32x32/;
 
 add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon;
+specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64";
-$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon;
+specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64";
 
 add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc";
@@ -261,8 +249,7 @@ add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *
 specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/;
 
 add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/;
-$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon;
+specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/;
 
 #
 # post proc
@@ -457,12 +444,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/;
 
     add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
-    $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+    specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
 
     add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
-    $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+    specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
 
     add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
@@ -1140,37 +1125,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
   specialize qw/vp9_block_error/;
 
-  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp/;
 
-  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp_32x32/;
 
-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_b/;
 
-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_b_32x32/;
 
-  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_fdct8x8_quant/;
 } else {
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
   specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
 
-  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
 
-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64";
 
-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
 
-  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_fdct8x8_quant sse2 ssse3/;
 }
 
@@ -1865,16 +1850,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
   specialize qw/vp9_highbd_subtract_block/;
 
-  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp/;
 
-  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp_32x32/;
 
-  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_b sse2/;
 
-  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
 
   #
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
index c4efa6565..71dbb402d 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -312,9 +312,11 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+  __m128i addFilterReg64, filtersReg, minReg;
   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
   unsigned int i;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -333,27 +335,26 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
   // duplicate only the forth 16 bits in the filter
   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
 
+  // load the first 7 rows of 8 bytes
+  srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+  srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
+  srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
+  srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
+  srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
+  srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
+  srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
+
   for (i = 0; i < output_height; i++) {
-    // load the first 8 bytes
-    srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
-    // load the next 8 bytes in stride of src_pitch
-    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
-    srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
-    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+    // load the last 8 bytes
+    srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
 
     // merge the result together
-    srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-    srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
-    // load the next 8 bytes in stride of src_pitch
-    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
-    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
-    srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
-    srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
 
     // merge the result together
-    srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
-    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
 
     // multiply 2 adjacent elements with the filter and add the result
     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
@@ -377,6 +378,15 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
 
     src_ptr+=src_pitch;
 
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
     // save only 8 bytes convolve result
     _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
 
@@ -390,9 +400,11 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                           unsigned int out_pitch,
                                           unsigned int output_height,
                                           int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+  __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
   unsigned int i;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -411,19 +423,24 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
   // duplicate only the forth 16 bits in the filter
   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
 
+  // load the first 7 rows of 16 bytes
+  srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
+  srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
+
   for (i = 0; i < output_height; i++) {
-    // load the first 16 bytes
-    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
-    // load the next 16 bytes in stride of src_pitch
-    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
-    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
-    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+    // load the last 16 bytes
+    srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
 
     // merge the result together
-    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
-    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+    srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
 
     // multiply 2 adjacent elements with the filter and add the result
     srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
@@ -435,25 +452,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
 
-    // load the next 16 bytes in stride of two/three src_pitch
-    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
-    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
     // merge the result together
-    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+    srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
 
     // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
     srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
 
-    // load the next 16 bytes in stride of four/five src_pitch
-    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
-    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
     // merge the result together
-    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
 
     // multiply 2 adjacent elements with the filter and add the result
     srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
@@ -461,13 +470,13 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
 
     // add and saturate the results together
     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+                                 _mm_min_epi16(srcRegFilt3, srcRegFilt7));
     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                  _mm_min_epi16(srcRegFilt6, srcRegFilt8));
 
     // add and saturate the results together
     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+                                 _mm_max_epi16(srcRegFilt3, srcRegFilt7));
     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                  _mm_max_epi16(srcRegFilt6, srcRegFilt8));
     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
@@ -484,6 +493,15 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
 
     src_ptr+=src_pitch;
 
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
     // save 16 bytes convolve result
     _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
 
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 58df87d0c..9677173db 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -719,6 +719,7 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
           cm->use_highbitdepth,
 #endif
           VP9_DEC_BORDER_IN_PIXELS,
+          cm->byte_alignment,
           &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
           cm->cb_priv)) {
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -793,6 +794,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
           cm->use_highbitdepth,
 #endif
           VP9_DEC_BORDER_IN_PIXELS,
+          cm->byte_alignment,
           &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
           cm->cb_priv)) {
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -1556,6 +1558,10 @@ void vp9_decode_frame(VP9Decoder *pbi,
   vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
 
   *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+  if (!cm->fc->initialized)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Uninitialized entropy context.");
+
   vp9_zero(cm->counts);
 
   xd->corrupted = 0;
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 2daf86200..1406b4034 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -15,6 +15,7 @@
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_scale/vpx_scale.h"
 
@@ -33,8 +34,8 @@
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/decoder/vp9_dthread.h"
 
-static void initialize_dec() {
-  static int init_done = 0;
+static void initialize_dec(void) {
+  static volatile int init_done = 0;
 
   if (!init_done) {
     vp9_rtcd();
@@ -85,7 +86,7 @@ VP9Decoder *vp9_decoder_create() {
                   sizeof(*cm->frame_contexts)));
 
   pbi->need_resync = 1;
-  initialize_dec();
+  once(initialize_dec);
 
   // Initialize the references to not point to any frame buffers.
   vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vp9/decoder/vp9_read_bit_buffer.c
index 3eef72844..c3b38a9c7 100644
--- a/vp9/decoder/vp9_read_bit_buffer.c
+++ b/vp9/decoder/vp9_read_bit_buffer.c
@@ -10,20 +10,20 @@
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 
 size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
-  return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
+  return (rb->bit_offset + 7) >> 3;
 }
 
 int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
   const size_t off = rb->bit_offset;
-  const size_t p = off / CHAR_BIT;
-  const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
-  if (rb->bit_buffer + p >= rb->bit_buffer_end) {
-    rb->error_handler(rb->error_handler_data);
-    return 0;
-  } else {
-    const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
+  const size_t p = off >> 3;
+  const int q = 7 - (int)(off & 0x7);
+  if (rb->bit_buffer + p < rb->bit_buffer_end) {
+    const int bit = (rb->bit_buffer[p] >> q) & 1;
     rb->bit_offset = off + 1;
     return bit;
+  } else {
+    rb->error_handler(rb->error_handler_data);
+    return 0;
   }
 }
 
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 8c13d0da6..9cf1e5e2c 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -26,13 +26,12 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
                           const int16_t *round_ptr, const int16_t *quant_ptr,
                           const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
                           int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          int zbin_oq_value, uint16_t *eob_ptr,
+                          uint16_t *eob_ptr,
                           const int16_t *scan, const int16_t *iscan) {
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)scan;
 
   if (!skip_block) {
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 019d5b90b..752429c8f 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1181,7 +1181,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
       vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
                                 counts->intra_inter[i]);
 
-    if (cm->allow_comp_inter_inter) {
+    if (cpi->allow_comp_inter_inter) {
       const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
       const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
 
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2ffc7ea67..68174a6cc 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -40,8 +40,6 @@ struct macroblock_plane {
   int16_t *round;
 
   int64_t quant_thred[2];
-  // Zbin Over Quant value
-  int16_t zbin_extra;
 };
 
 /* The [2] dimension is for whether we skip the EOB node (i.e. if previous
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 020a95196..506f6de84 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -339,7 +339,7 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr,
-                         int zbin_oq_value, uint16_t *eob_ptr,
+                         uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
   int eob = -1;
 
@@ -416,7 +416,6 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
   // quantization process is completed.
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)iscan;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 4deeed217..56ec6b335 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -425,6 +425,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
 #endif
                        int border) {
   int i, fail;
+  const int legacy_byte_alignment = 0;
   assert(denoiser != NULL);
 
   for (i = 0; i < MAX_REF_FRAMES; ++i) {
@@ -433,7 +434,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
 #if CONFIG_VP9_HIGHBITDEPTH
                                   use_highbitdepth,
 #endif
-                                  border);
+                                  border, legacy_byte_alignment);
     if (fail) {
       vp9_denoiser_free(denoiser);
       return 1;
@@ -448,7 +449,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
 #if CONFIG_VP9_HIGHBITDEPTH
                                 use_highbitdepth,
 #endif
-                                border);
+                                border, legacy_byte_alignment);
   if (fail) {
     vp9_denoiser_free(denoiser);
     return 1;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 756393f31..4c948237d 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -423,8 +423,9 @@ static int set_vt_partitioning(VP9_COMP *cpi,
 
   if (cm->frame_type == KEY_FRAME) {
     bsize_ref = BLOCK_8X8;
-    // Choose lower thresholds for key frame variance to favor split.
-    threshold_bsize_ref = threshold >> 1;
+    // Choose lower thresholds for key frame variance to favor split, but keep
+    // threshold for splitting to 4x4 block still fairly high for now.
+    threshold_bsize_ref = threshold << 2;
     threshold_low = threshold >> 2;
   }
 
@@ -592,7 +593,16 @@ static void choose_partitioning(VP9_COMP *cpi,
             unsigned int sse = 0;
             int sum = 0;
             if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+              int s_avg;
+              if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+                s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+              } else {
+                s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+              }
+#else
               int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+#endif
               // For key frame, reference is set to 128.
               sum = s_avg - 128;
               sse = sum * sum;
@@ -646,8 +656,6 @@ static void choose_partitioning(VP9_COMP *cpi,
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
-              // TODO(marpan): Allow for setting 4x4 partition on key frame.
-              /*
               if (cm->frame_type == KEY_FRAME) {
                 if (!set_vt_partitioning(cpi, xd,
                                          &vt.split[i].split[j].split[k],
@@ -660,12 +668,11 @@ static void choose_partitioning(VP9_COMP *cpi,
                                    BLOCK_4X4);
                 }
               } else {
-              */
                 set_block_size(cpi, xd,
                                (mi_row + y32_idx + y16_idx + y8_idx),
                                (mi_col + x32_idx + x16_idx + x8_idx),
                                BLOCK_8X8);
-              // }
+               }
             }
           }
         }
@@ -3612,7 +3619,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   if (xd->lossless) {
     x->optimize = 0;
     cm->lf.filter_level = 0;
-    cpi->zbin_mode_boost_enabled = 0;
   }
 
   vp9_frame_init_quantizer(cpi);
@@ -3716,9 +3722,9 @@ void vp9_encode_frame(VP9_COMP *cpi) {
              cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
         (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
              cm->ref_frame_sign_bias[LAST_FRAME])) {
-      cm->allow_comp_inter_inter = 0;
+      cpi->allow_comp_inter_inter = 0;
     } else {
-      cm->allow_comp_inter_inter = 1;
+      cpi->allow_comp_inter_inter = 1;
       cm->comp_fixed_ref = ALTREF_FRAME;
       cm->comp_var_ref[0] = LAST_FRAME;
       cm->comp_var_ref[1] = GOLDEN_FRAME;
@@ -3742,7 +3748,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     const int is_alt_ref = frame_type == ALTREF_FRAME;
 
     /* prediction (compound, single or hybrid) mode selection */
-    if (is_alt_ref || !cm->allow_comp_inter_inter)
+    if (is_alt_ref || !cpi->allow_comp_inter_inter)
       cm->reference_mode = SINGLE_REFERENCE;
     else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
              mode_thrs[COMPOUND_REFERENCE] >
@@ -3852,24 +3858,6 @@ static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
   ++counts->uv_mode[y_mode][uv_mode];
 }
 
-static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) {
-  if (enabled) {
-    if (is_inter_block(mbmi)) {
-      if (mbmi->mode == ZEROMV) {
-        return mbmi->ref_frame[0] != LAST_FRAME ? GF_ZEROMV_ZBIN_BOOST
-                                                : LF_ZEROMV_ZBIN_BOOST;
-      } else {
-        return mbmi->sb_type < BLOCK_8X8 ? SPLIT_MV_ZBIN_BOOST
-                                         : MV_ZBIN_BOOST;
-      }
-    } else {
-      return INTRA_ZBIN_BOOST;
-    }
-  } else {
-    return 0;
-  }
-}
-
 static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
                               TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -3905,12 +3893,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
 
   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
 
-  // Experimental code. Special case for gf and arf zeromv modes.
-  // Increase zbin size to suppress noise
-  cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi,
-                                             cpi->zbin_mode_boost_enabled);
-  vp9_update_zbin_extra(cpi, x);
-
   if (!is_inter_block(mbmi)) {
     int plane;
     mbmi->skip = 1;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 9b2165be6..9c29eb438 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -149,7 +149,6 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   int64_t rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt, i, final_eob;
-  const TOKENVALUE *dct_value_tokens;
   const int16_t *dct_value_cost;
 
   assert((!type && !plane) || (type && plane));
@@ -169,22 +168,18 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->bd == 12) {
-    dct_value_tokens = vp9_dct_value_tokens_high12_ptr;
     dct_value_cost = vp9_dct_value_cost_high12_ptr;
   } else if (xd->bd == 10) {
-    dct_value_tokens = vp9_dct_value_tokens_high10_ptr;
     dct_value_cost = vp9_dct_value_cost_high10_ptr;
   } else {
-    dct_value_tokens = vp9_dct_value_tokens_ptr;
     dct_value_cost = vp9_dct_value_cost_ptr;
   }
 #else
-  dct_value_tokens = vp9_dct_value_tokens_ptr;
   dct_value_cost = vp9_dct_value_cost_ptr;
 #endif
   for (i = 0; i < eob; i++)
     token_cache[scan[i]] =
-        vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token];
+        vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
 
   for (i = eob; i-- > 0;) {
     int base_bits, d2, dx;
@@ -198,7 +193,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       /* Evaluate the first possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
-      t0 = (dct_value_tokens + x)->token;
+      t0 = vp9_get_token(x);
       /* Consider both possible successor states. */
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -250,7 +245,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
       } else {
-        t0 = t1 = (dct_value_tokens + x)->token;
+        t0 = t1 = vp9_get_token(x);
       }
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -391,28 +386,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
         vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
                                      p->round_fp, p->quant_fp, p->quant_shift,
                                      qcoeff, dqcoeff, pd->dequant,
-                                     p->zbin_extra, eob, scan_order->scan,
+                                     eob, scan_order->scan,
                                      scan_order->iscan);
         break;
       case TX_16X16:
         vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
         vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                                p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, p->zbin_extra, eob,
+                               pd->dequant, eob,
                                scan_order->scan, scan_order->iscan);
         break;
       case TX_8X8:
         vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
         vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
                                p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, p->zbin_extra, eob,
+                               pd->dequant, eob,
                                scan_order->scan, scan_order->iscan);
         break;
       case TX_4X4:
         x->fwd_txm4x4(src_diff, coeff, diff_stride);
         vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
                                p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, p->zbin_extra, eob,
+                               pd->dequant, eob,
                                scan_order->scan, scan_order->iscan);
         break;
       default:
@@ -427,28 +422,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
       vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                            pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                            pd->dequant, eob, scan_order->scan,
                             scan_order->iscan);
       break;
     case TX_16X16:
       vp9_fdct16x16(src_diff, coeff, diff_stride);
       vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                       p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, p->zbin_extra, eob,
+                      pd->dequant, eob,
                       scan_order->scan, scan_order->iscan);
       break;
     case TX_8X8:
       vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
                         x->skip_block, p->zbin, p->round_fp,
                         p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                        pd->dequant, p->zbin_extra, eob,
+                        pd->dequant, eob,
                         scan_order->scan, scan_order->iscan);
       break;
     case TX_4X4:
       x->fwd_txm4x4(src_diff, coeff, diff_stride);
       vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
                       p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, p->zbin_extra, eob,
+                      pd->dequant, eob,
                       scan_order->scan, scan_order->iscan);
       break;
     default:
@@ -561,28 +556,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
         vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
                                     p->round, p->quant, p->quant_shift, qcoeff,
-                                    dqcoeff, pd->dequant, p->zbin_extra, eob,
+                                    dqcoeff, pd->dequant, eob,
                                     scan_order->scan, scan_order->iscan);
         break;
       case TX_16X16:
         vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
         vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                               p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, p->zbin_extra, eob,
+                              pd->dequant, eob,
                               scan_order->scan, scan_order->iscan);
         break;
       case TX_8X8:
         vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
         vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                               p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, p->zbin_extra, eob,
+                              pd->dequant, eob,
                               scan_order->scan, scan_order->iscan);
         break;
       case TX_4X4:
         x->fwd_txm4x4(src_diff, coeff, diff_stride);
         vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                               p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, p->zbin_extra, eob,
+                              pd->dequant, eob,
                               scan_order->scan, scan_order->iscan);
         break;
       default:
@@ -597,28 +592,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                           pd->dequant, eob, scan_order->scan,
                            scan_order->iscan);
       break;
     case TX_16X16:
       vp9_fdct16x16(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob,
+                     pd->dequant, eob,
                      scan_order->scan, scan_order->iscan);
       break;
     case TX_8X8:
       vp9_fdct8x8(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob,
+                     pd->dequant, eob,
                      scan_order->scan, scan_order->iscan);
       break;
     case TX_4X4:
       x->fwd_txm4x4(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob,
+                     pd->dequant, eob,
                      scan_order->scan, scan_order->iscan);
       break;
     default:
@@ -849,8 +844,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
           highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
           vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
                                       p->round, p->quant, p->quant_shift,
-                                      qcoeff, dqcoeff, pd->dequant,
-                                      p->zbin_extra, eob,
+                                      qcoeff, dqcoeff, pd->dequant, eob,
                                       scan_order->scan, scan_order->iscan);
         }
         if (!x->skip_encode && *eob) {
@@ -871,7 +865,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
           vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, p->zbin_extra, eob,
+                                pd->dequant, eob,
                                 scan_order->scan, scan_order->iscan);
         }
         if (!x->skip_encode && *eob) {
@@ -893,7 +887,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
           vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, p->zbin_extra, eob,
+                                pd->dequant, eob,
                                 scan_order->scan, scan_order->iscan);
         }
         if (!x->skip_encode && *eob) {
@@ -919,7 +913,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
             x->fwd_txm4x4(src_diff, coeff, diff_stride);
           vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, p->zbin_extra, eob,
+                                pd->dequant, eob,
                                 scan_order->scan, scan_order->iscan);
         }
 
@@ -958,7 +952,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
         vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                             pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                             pd->dequant, eob, scan_order->scan,
                              scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
@@ -978,7 +972,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                        p->quant, p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                       pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
@@ -998,7 +992,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                       pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
@@ -1022,7 +1016,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
           x->fwd_txm4x4(src_diff, coeff, diff_stride);
         vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                       pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
 
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 620027e2d..047b9aaef 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -181,8 +181,8 @@ static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
   cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
 }
 
-void vp9_initialize_enc() {
-  static int init_done = 0;
+void vp9_initialize_enc(void) {
+  static volatile int init_done = 0;
 
   if (!init_done) {
     vp9_rtcd();
@@ -490,7 +490,8 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
 #endif
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
@@ -510,7 +511,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
 #endif
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
@@ -520,7 +522,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
 #endif
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
@@ -530,7 +533,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
 #endif
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
 }
@@ -566,7 +570,8 @@ static void update_frame_size(VP9_COMP *cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
                                  cm->use_highbitdepth,
 #endif
-                                 VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+                                 VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                 NULL, NULL, NULL))
       vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                          "Failed to reallocate alt_ref_buffer");
   }
@@ -2472,7 +2477,8 @@ void vp9_scale_references(VP9_COMP *cpi) {
                                  cm->width, cm->height,
                                  cm->subsampling_x, cm->subsampling_y,
                                  cm->use_highbitdepth,
-                                 VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+                                 VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                 NULL, NULL, NULL);
         scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf,
                                (int)cm->bit_depth);
 #else
@@ -2481,7 +2487,8 @@ void vp9_scale_references(VP9_COMP *cpi) {
         vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
                                  cm->width, cm->height,
                                  cm->subsampling_x, cm->subsampling_y,
-                                 VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+                                 VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                 NULL, NULL, NULL);
         scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
@@ -2720,7 +2727,8 @@ void set_frame_size(VP9_COMP *cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
                            cm->use_highbitdepth,
 #endif
-                           VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+                           VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                           NULL, NULL, NULL);
 
   alloc_util_frame_buffers(cpi);
   init_motion_estimation(cpi);
@@ -3139,12 +3147,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
   vp9_clear_system_state();
 
-  // Enable or disable mode based tweaking of the zbin.
-  // For 2 pass only used where GF/ARF prediction quality
-  // is above a threshold.
-  cpi->zbin_mode_boost = 0;
-  cpi->zbin_mode_boost_enabled = 0;
-
   // Set the arf sign bias for this frame.
   set_arf_sign_bias(cpi);
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 14f7c7f0c..7872e2cc1 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -315,9 +315,6 @@ typedef struct VP9_COMP {
   int *nmvsadcosts[2];
   int *nmvsadcosts_hp[2];
 
-  int zbin_mode_boost;
-  int zbin_mode_boost_enabled;
-
   int64_t last_time_stamp_seen;
   int64_t last_end_time_stamp_seen;
   int64_t first_time_stamp_ever;
@@ -339,6 +336,8 @@ typedef struct VP9_COMP {
   unsigned int max_mv_magnitude;
   int mv_step_param;
 
+  int allow_comp_inter_inter;
+
   // Default value is 1. From first pass stats, encode_breakout may be disabled.
   ENCODE_BREAKOUT_TYPE allow_encode_breakout;
 
@@ -449,7 +448,7 @@ typedef struct VP9_COMP {
   VP9Worker *workers;
 } VP9_COMP;
 
-void vp9_initialize_enc();
+void vp9_initialize_enc(void);
 
 struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf);
 void vp9_remove_compressor(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index 823e7a162..708072ee2 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -65,6 +65,7 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
   // Allocate the lookahead structures
   ctx = calloc(1, sizeof(*ctx));
   if (ctx) {
+    const int legacy_byte_alignment = 0;
     unsigned int i;
     ctx->max_sz = depth;
     ctx->buf = calloc(depth, sizeof(*ctx->buf));
@@ -76,7 +77,8 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
 #if CONFIG_VP9_HIGHBITDEPTH
                                  use_highbitdepth,
 #endif
-                                 VP9_ENC_BORDER_IN_PIXELS))
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 legacy_byte_alignment))
         goto bail;
   }
   return ctx;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index b45032456..319a47833 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -249,14 +249,14 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                  dc_quant >> (xd->bd - 5), &rate, &dist);
   } else {
-    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                  dc_quant >> 3, &rate, &dist);
   }
 #else
-  vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+  vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                dc_quant >> 3, &rate, &dist);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -265,14 +265,14 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                  ac_quant >> (xd->bd - 5), &rate, &dist);
   } else {
-    vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                  ac_quant >> 3, &rate, &dist);
   }
 #else
-  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+  vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                ac_quant >> 3, &rate, &dist);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -447,11 +447,10 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   args->dist += dist;
 }
 
-static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = {
+static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
   {THR_DC, THR_H_PRED, THR_V_PRED, THR_TM},
   {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
   {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
-  {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
 };
 
 static const PREDICTION_MODE intra_mode_list[] = {
@@ -522,8 +521,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   struct macroblockd_plane *const pd = &xd->plane[0];
   PREDICTION_MODE best_mode = ZEROMV;
   MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
-  TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
-                             tx_mode_to_biggest_tx_size[cm->tx_mode]);
+  TX_SIZE best_tx_size = TX_SIZES;
   INTERP_FILTER best_pred_filter = EIGHTTAP;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
@@ -537,9 +535,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // Reduce the intra cost penalty for small blocks (<=16x16).
   const int reduction_fac =
       (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
-       bsize <= BLOCK_16X16) ? 4 : 1;
+       bsize <= BLOCK_16X16) ? 2 : 0;
   const int intra_cost_penalty = vp9_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac;
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
   const int8_t segment_id = mbmi->segment_id;
@@ -839,13 +837,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
-  if (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
-      bsize <= cpi->sf.max_intra_bsize) {
+  if (best_rdc.rdcost == INT64_MAX ||
+      (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+       bsize <= cpi->sf.max_intra_bsize)) {
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
     const TX_SIZE intra_tx_size =
         MIN(max_txsize_lookup[bsize],
             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     int i;
+    TX_SIZE best_intra_tx_size = TX_SIZES;
 
     if (reuse_inter_pred && best_pred != NULL) {
       if (best_pred->data == orig_dst.buf) {
@@ -870,11 +870,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     pd->dst = orig_dst;
 
     for (i = 0; i < 4; ++i) {
-      const TX_SIZE saved_tx_size = mbmi->tx_size;
       const PREDICTION_MODE this_mode = intra_mode_list[i];
       if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
         continue;
-      skip_txfm = x->skip_txfm[0];
       args.mode = this_mode;
       args.rate = 0;
       args.dist = 0;
@@ -891,15 +889,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       if (this_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = this_rdc;
         mbmi->mode = this_mode;
-        mbmi->tx_size = intra_tx_size;
+        best_intra_tx_size = mbmi->tx_size;
         mbmi->ref_frame[0] = INTRA_FRAME;
         mbmi->uv_mode = this_mode;
         mbmi->mv[0].as_int = INVALID_MV;
-      } else {
-        x->skip_txfm[0] = best_mode_skip_txfm;
-        mbmi->tx_size = saved_tx_size;
       }
     }
+
+    // Reset mb_mode_info to the best inter mode.
+    if (mbmi->ref_frame[0] != INTRA_FRAME) {
+      x->skip_txfm[0] = best_mode_skip_txfm;
+      mbmi->tx_size = best_tx_size;
+    } else {
+      mbmi->tx_size = best_intra_tx_size;
+    }
   }
 
   pd->dst = orig_dst;
@@ -923,14 +926,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  if (is_inter_block(mbmi))
-    vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
-                            cpi->sf.adaptive_rd_thresh, bsize,
-                            mode_idx[best_ref_frame][INTER_OFFSET(mbmi->mode)]);
-  else
-    vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
-                              cpi->sf.adaptive_rd_thresh, bsize,
-                              mode_idx[INTRA_FRAME][mbmi->mode]);
+  if (cpi->sf.adaptive_rd_thresh) {
+    THR_MODES best_mode_idx = is_inter_block(mbmi) ?
+        mode_idx[best_ref_frame][INTER_OFFSET(mbmi->mode)] :
+        mode_idx[INTRA_FRAME][mbmi->mode];
+    PREDICTION_MODE this_mode;
+    for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+        THR_MODES thr_mode_idx = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+        int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];
+        if (thr_mode_idx == best_mode_idx)
+          *freq_fact -= (*freq_fact >> 4);
+        else
+          *freq_fact = MIN(*freq_fact + RD_THRESH_INC,
+                           cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+      }
+    }
+  }
 
   *rd_cost = best_rdc;
 }
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index e7a20c4d2..389dc87e0 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -122,14 +122,13 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr,
-                       int zbin_oq_value, uint16_t *eob_ptr,
+                       uint16_t *eob_ptr,
                        const int16_t *scan, const int16_t *iscan) {
   int i, eob = -1;
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)iscan;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -168,7 +167,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
                               tran_low_t *qcoeff_ptr,
                               tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr,
-                              int zbin_oq_value,
                               uint16_t *eob_ptr,
                               const int16_t *scan,
                               const int16_t *iscan) {
@@ -178,7 +176,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
   // quantization process is completed.
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)iscan;
 
   vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
@@ -217,12 +214,11 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr,
-                             int zbin_oq_value, uint16_t *eob_ptr,
+                             uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
   int i, eob = -1;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)iscan;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -261,12 +257,11 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
                                     tran_low_t *qcoeff_ptr,
                                     tran_low_t *dqcoeff_ptr,
                                     const int16_t *dequant_ptr,
-                                    int zbin_oq_value, uint16_t *eob_ptr,
+                                    uint16_t *eob_ptr,
                                     const int16_t *scan, const int16_t *iscan) {
   int i, eob = -1;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)iscan;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -302,13 +297,11 @@ void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                       const int16_t *dequant_ptr,
-                      int zbin_oq_value, uint16_t *eob_ptr,
+                      uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
   int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
-                         zbin_ptr[1] + zbin_oq_value };
-  const int nzbins[2] = { zbins[0] * -1,
-                          zbins[1] * -1 };
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
   (void)iscan;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -355,14 +348,12 @@ void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const int16_t *round_ptr, const int16_t *quant_ptr,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int16_t *dequant_ptr, int zbin_oq_value,
+                             const int16_t *dequant_ptr,
                              uint16_t *eob_ptr, const int16_t *scan,
                              const int16_t *iscan) {
   int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
-                         zbin_ptr[1] + zbin_oq_value };
-  const int nzbins[2] = { zbins[0] * -1,
-                          zbins[1] * -1 };
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
   (void)iscan;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -412,10 +403,10 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const int16_t *quant_shift_ptr,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr,
-                            int zbin_oq_value, uint16_t *eob_ptr,
+                            uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
   const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
 
   int idx = 0;
@@ -471,11 +462,11 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
                                    tran_low_t *qcoeff_ptr,
                                    tran_low_t *dqcoeff_ptr,
                                    const int16_t *dequant_ptr,
-                                   int zbin_oq_value, uint16_t *eob_ptr,
+                                   uint16_t *eob_ptr,
                                    const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
 
   int idx = 0;
   int idx_arr[1024];
@@ -534,7 +525,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                         p->zbin, p->round, p->quant, p->quant_shift,
                         BLOCK_OFFSET(p->qcoeff, block),
                         BLOCK_OFFSET(pd->dqcoeff, block),
-                        pd->dequant, p->zbin_extra, &p->eobs[block],
+                        pd->dequant, &p->eobs[block],
                         scan, iscan);
     return;
   }
@@ -544,7 +535,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
            p->zbin, p->round, p->quant, p->quant_shift,
            BLOCK_OFFSET(p->qcoeff, block),
            BLOCK_OFFSET(pd->dqcoeff, block),
-           pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
+           pd->dequant, &p->eobs[block], scan, iscan);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -641,7 +632,6 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   const int segment_id = xd->mi[0].src_mi->mbmi.segment_id;
   const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
   const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
-  const int zbin = cpi->zbin_mode_boost;
   int i;
 
   // Y
@@ -651,13 +641,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   x->plane[0].quant_shift = quants->y_quant_shift[qindex];
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
-  x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7);
   xd->plane[0].dequant = cm->y_dequant[qindex];
 
-  x->plane[0].quant_thred[0] = (x->plane[0].zbin[0] + x->plane[0].zbin_extra) *
-      (x->plane[0].zbin[0] + x->plane[0].zbin_extra);
-  x->plane[0].quant_thred[1] = (x->plane[0].zbin[1] + x->plane[0].zbin_extra) *
-      (x->plane[0].zbin[1] + x->plane[0].zbin_extra);
+  x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
+  x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
 
   // UV
   for (i = 1; i < 3; i++) {
@@ -667,15 +654,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
-    x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7);
     xd->plane[i].dequant = cm->uv_dequant[qindex];
 
-    x->plane[i].quant_thred[0] =
-        (x->plane[i].zbin[0] + x->plane[i].zbin_extra) *
-        (x->plane[i].zbin[0] + x->plane[i].zbin_extra);
-    x->plane[i].quant_thred[1] =
-        (x->plane[i].zbin[1] + x->plane[i].zbin_extra) *
-        (x->plane[i].zbin[1] + x->plane[i].zbin_extra);
+    x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
+    x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
   }
 
   x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
@@ -687,20 +669,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   vp9_initialize_me_consts(cpi, x->q_index);
 }
 
-void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
-  const int qindex = x->q_index;
-  const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
-                            cpi->zbin_mode_boost) >> 7;
-  const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
-                             cpi->zbin_mode_boost) >> 7;
-
-  x->plane[0].zbin_extra = (int16_t)y_zbin_extra;
-  x->plane[1].zbin_extra = (int16_t)uv_zbin_extra;
-  x->plane[2].zbin_extra = (int16_t)uv_zbin_extra;
-}
-
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
-  cpi->zbin_mode_boost = 0;
   vp9_init_plane_quantizers(cpi, &cpi->td.mb);
 }
 
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index cee46e7e0..de2839f5b 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -68,8 +68,6 @@ struct VP9Common;
 
 void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
 
-void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-
 void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
 
 void vp9_init_quantizer(struct VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 5b49bfc17..34d49f058 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -379,7 +379,7 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
   *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 }
 
-void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                   unsigned int qstep, int *rate,
                                   int64_t *dist) {
   // This function models the rate and distortion for a Laplacian
@@ -395,10 +395,10 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
     int d_q10, r_q10;
     static const uint32_t MAX_XSQ_Q10 = 245727;
     const uint64_t xsq_q10_64 =
-        ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
+        (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
     const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10);
     model_rd_norm(xsq_q10, &r_q10, &d_q10);
-    *rate = (n * r_q10 + 2) >> 2;
+    *rate = ((r_q10 << n_log2) + 2) >> 2;
     *dist = (var * (int64_t)d_q10 + 512) >> 10;
   }
 }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 600a3eb1a..ded082f86 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -37,6 +37,7 @@
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_variance.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 
 #define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
                                  (1 << INTRA_FRAME))
@@ -48,6 +49,7 @@
 #define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
 
 #define MIN_EARLY_TERM_INDEX    3
+#define NEW_MV_DISCOUNT_FACTOR  8
 
 typedef struct {
   PREDICTION_MODE mode;
@@ -75,6 +77,7 @@ struct rdcost_block_args {
   const scan_order *so;
 };
 
+#define LAST_NEW_MV_INDEX 6
 static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {NEARESTMV, {LAST_FRAME,   NONE}},
   {NEARESTMV, {ALTREF_FRAME, NONE}},
@@ -265,15 +268,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
     } else {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+        vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
                                      pd->dequant[1] >> (xd->bd - 5),
                                      &rate, &dist);
       } else {
-        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+        vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
                                      pd->dequant[1] >> 3, &rate, &dist);
       }
 #else
-      vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+      vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
                                    pd->dequant[1] >> 3, &rate, &dist);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       rate_sum += rate;
@@ -370,7 +373,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
 
     // dc token
     int v = qcoeff[0];
-    int prev_t = vp9_dct_value_tokens_ptr[v].token;
+    int prev_t = vp9_get_token(v);
     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
     token_cache[0] = vp9_pt_energy_class[prev_t];
     ++token_costs;
@@ -381,7 +384,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
       int t;
 
       v = qcoeff[rc];
-      t = vp9_dct_value_tokens_ptr[v].token;
+      t = vp9_get_token(v);
       if (use_fast_coef_costing) {
         cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
       } else {
@@ -2355,6 +2358,27 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd,
   }
 }
 
+// In some situations we want to discount tha pparent cost of a new motion
+// vector. Where there is a subtle motion field and especially where there is
+// low spatial complexity then it can be hard to cover the cost of a new motion
+// vector in a single block, even if that motion vector reduces distortion.
+// However, once established that vector may be usable through the nearest and
+// near mv modes to reduce distortion in subsequent blocks and also improve
+// visual quality.
+static int discount_newmv_test(const VP9_COMP *cpi,
+                               int this_mode,
+                               int_mv this_mv,
+                               int_mv (*mode_mv)[MAX_REF_FRAMES],
+                               int ref_frame) {
+  return (!cpi->rc.is_src_frame_alt_ref &&
+          (this_mode == NEWMV) &&
+          (this_mv.as_int != 0) &&
+          ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
+           (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
+          ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
+           (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize,
                                  int64_t txfm_cache[],
@@ -2464,10 +2488,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                            &tmp_mv, &rate_mv);
       if (tmp_mv.as_int == INVALID_MV)
         return INT64_MAX;
-      *rate2 += rate_mv;
+
       frame_mv[refs[0]].as_int =
           xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
       single_newmv[refs[0]].as_int = tmp_mv.as_int;
+
+      // Estimate the rate implications of a new mv but discount this
+      // under certain circumstances where we want to help initiate a weak
+      // motion field, where the distortion gain for a single block may not
+      // be enough to overcome the cost of a new mv.
+      if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+        *rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+      } else {
+        *rate2 += rate_mv;
+      }
     }
   }
 
@@ -2492,11 +2526,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     orig_dst_stride[i] = xd->plane[i].dst.stride;
   }
 
-  /* We don't include the cost of the second reference here, because there
-   * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
-   * words if you present them in that order, the second one is always known
-   * if the first is known */
-  *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+  // We don't include the cost of the second reference here, because there
+  // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+  // words if you present them in that order, the second one is always known
+  // if the first is known.
+  //
+  // Under some circumstances we discount the cost of new mv mode to encourage
+  // initiation of a motion field.
+  if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
+                          mode_mv, refs[0])) {
+    *rate2 += MIN(cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]),
+                  cost_mv_ref(cpi, NEARESTMV, mbmi->mode_context[refs[0]]));
+  } else {
+    *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+  }
 
   if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
       mbmi->mode != NEARESTMV)
@@ -2726,6 +2769,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   x->skip_encode = 0;
   ctx->skip = 0;
   xd->mi[0].src_mi->mbmi.ref_frame[0] = INTRA_FRAME;
+  xd->mi[0].src_mi->mbmi.ref_frame[1] = NONE;
 
   if (bsize >= BLOCK_8X8) {
     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
@@ -2941,7 +2985,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
   mode_skip_mask[INTRA_FRAME] |=
       ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
 
-  for (i = 0; i < MAX_MODES; ++i)
+  for (i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+    mode_threshold[i] = 0;
+  for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
     mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
 
   midx =  sf->schedule_mode_search ? mode_skip_start : 0;
@@ -3065,7 +3111,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
-      if (!cm->allow_comp_inter_inter)
+      if (!cpi->allow_comp_inter_inter)
         continue;
 
       // Skip compound inter modes if ARF is not available.
@@ -3715,7 +3761,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
-      if (!cm->allow_comp_inter_inter)
+      if (!cpi->allow_comp_inter_inter)
         continue;
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
         continue;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 0775b919c..15831fbbe 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -339,18 +339,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->mv.fullpel_search_step_param = 10;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
   }
-
-  if (speed >= 12) {
+  if (speed >= 8) {
     sf->adaptive_rd_thresh = 4;
     sf->mv.subpel_force_stop = 2;
   }
-
-  if (speed >= 13) {
-    int i;
-    sf->max_intra_bsize = BLOCK_32X32;
-    for (i = 0; i < BLOCK_SIZES; ++i)
-      sf->inter_mode_mask[i] = INTER_NEAREST;
-  }
 }
 
 void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 184322f4f..31e93be65 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -39,7 +39,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
                                  cpi->common.use_highbitdepth,
 #endif
-                                 VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cpi->common.byte_alignment,
+                                 NULL, NULL, NULL))
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate empty frame for multiple frame "
                            "contexts");
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index a4051f05e..424cc0843 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -710,8 +710,9 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
 #if CONFIG_VP9_HIGHBITDEPTH
                                        cm->use_highbitdepth,
 #endif
-                                       VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
-                                       NULL)) {
+                                       VP9_ENC_BORDER_IN_PIXELS,
+                                       cm->byte_alignment,
+                                       NULL, NULL, NULL)) {
             vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                "Failed to reallocate alt_ref_buffer");
           }
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 393eb1a4a..06bcfc317 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -23,23 +23,46 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
-static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
-const TOKENVALUE *vp9_dct_value_tokens_ptr;
 static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
-const int16_t *vp9_dct_value_cost_ptr;
+const int16_t *vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static TOKENVALUE dct_value_tokens_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
 static int16_t dct_value_cost_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const int16_t *vp9_dct_value_cost_high10_ptr;
+const int16_t *vp9_dct_value_cost_high10_ptr =
+    dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10;
 
-static TOKENVALUE dct_value_tokens_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
 static int16_t dct_value_cost_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const int16_t *vp9_dct_value_cost_high12_ptr;
+const int16_t *vp9_dct_value_cost_high12_ptr =
+    dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12;
 #endif
 
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
+  {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49},
+  {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+  {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21}, {9, 19}, {9, 17},
+  {9, 15}, {9, 13}, {9, 11}, {9, 9}, {9, 7}, {9, 5}, {9, 3}, {9, 1},
+  {8, 31}, {8, 29}, {8, 27}, {8, 25}, {8, 23}, {8, 21},
+  {8, 19}, {8, 17}, {8, 15}, {8, 13}, {8, 11}, {8, 9},
+  {8, 7}, {8, 5}, {8, 3}, {8, 1},
+  {7, 15}, {7, 13}, {7, 11}, {7, 9}, {7, 7}, {7, 5}, {7, 3}, {7, 1},
+  {6, 7}, {6, 5}, {6, 3}, {6, 1}, {5, 3}, {5, 1},
+  {4, 1}, {3, 1}, {2, 1}, {1, 1}, {0, 0},
+  {1, 0},  {2, 0}, {3, 0}, {4, 0},
+  {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4}, {6, 6},
+  {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8}, {7, 10}, {7, 12}, {7, 14},
+  {8, 0}, {8, 2}, {8, 4}, {8, 6}, {8, 8}, {8, 10}, {8, 12},
+  {8, 14}, {8, 16}, {8, 18}, {8, 20}, {8, 22}, {8, 24},
+  {8, 26}, {8, 28}, {8, 30}, {9, 0}, {9, 2},
+  {9, 4}, {9, 6}, {9, 8}, {9, 10}, {9, 12}, {9, 14}, {9, 16},
+  {9, 18}, {9, 20}, {9, 22}, {9, 24}, {9, 26}, {9, 28},
+  {9, 30}, {9, 32}, {9, 34}, {9, 36}, {9, 38}, {9, 40},
+  {9, 42}, {9, 44}, {9, 46}, {9, 48}, {9, 50}, {9, 52},
+  {9, 54}, {9, 56}, {9, 58}, {9, 60}, {9, 62}
+};
+const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
+    (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
+    / 2;
+
 // Array indices are identical to previously-existing CONTEXT_NODE indices
 const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
   -EOB_TOKEN, 2,                       // 0  = EOB
@@ -67,21 +90,31 @@ const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
   -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 7 = CAT_FIVE
 };
 
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
+static const vp9_tree_index cat1[2] = {0, 0};
+static const vp9_tree_index cat2[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+    14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static vp9_tree_index cat1_high10[2];
-static vp9_tree_index cat2_high10[4];
-static vp9_tree_index cat3_high10[6];
-static vp9_tree_index cat4_high10[8];
-static vp9_tree_index cat5_high10[10];
-static vp9_tree_index cat6_high10[32];
-static vp9_tree_index cat1_high12[2];
-static vp9_tree_index cat2_high12[4];
-static vp9_tree_index cat3_high12[6];
-static vp9_tree_index cat4_high12[8];
-static vp9_tree_index cat5_high12[10];
-static vp9_tree_index cat6_high12[36];
+static const vp9_tree_index cat1_high10[2] = {0, 0};
+static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+  12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+  30, 30, 0, 0};
+static const vp9_tree_index cat1_high12[2] = {0, 0};
+static const vp9_tree_index cat2_high12[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+  12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+  30, 30, 32, 32, 34, 34, 0, 0};
 #endif
 
 static void init_bit_tree(vp9_tree_index *p, int n) {
@@ -95,28 +128,6 @@ static void init_bit_tree(vp9_tree_index *p, int n) {
   p[0] = p[1] = 0;
 }
 
-static void init_bit_trees() {
-  init_bit_tree(cat1, 1);
-  init_bit_tree(cat2, 2);
-  init_bit_tree(cat3, 3);
-  init_bit_tree(cat4, 4);
-  init_bit_tree(cat5, 5);
-  init_bit_tree(cat6, 14);
-#if CONFIG_VP9_HIGHBITDEPTH
-  init_bit_tree(cat1_high10, 1);
-  init_bit_tree(cat2_high10, 2);
-  init_bit_tree(cat3_high10, 3);
-  init_bit_tree(cat4_high10, 4);
-  init_bit_tree(cat5_high10, 5);
-  init_bit_tree(cat6_high10, 16);
-  init_bit_tree(cat1_high12, 1);
-  init_bit_tree(cat2_high12, 2);
-  init_bit_tree(cat3_high12, 3);
-  init_bit_tree(cat4_high12, 4);
-  init_bit_tree(cat5_high12, 5);
-  init_bit_tree(cat6_high12, 18);
-#endif
-}
 
 const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
   {0, 0, 0, 0},                              // ZERO_TOKEN
@@ -167,43 +178,24 @@ const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
 struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS];
 
 void vp9_coef_tree_initialize() {
-  init_bit_trees();
   vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
 }
 
-static void tokenize_init_one(TOKENVALUE *t, const vp9_extra_bit *const e,
+static void tokenize_init_one(const vp9_extra_bit *const e,
                               int16_t *value_cost, int max_value) {
   int i = -max_value;
-  int sign = 1;
 
+  TOKENVALUE t;
   do {
-    if (!i)
-      sign = 0;
-
-    {
-      const int a = sign ? -i : i;
-      int eb = sign;
-
-      if (a > 4) {
-        int j = 4;
-
-        while (++j < 11  &&  e[j].base_val <= a) {}
-
-        t[i].token = --j;
-        eb |= (a - e[j].base_val) << 1;
-      } else {
-        t[i].token = a;
-      }
-      t[i].extra = eb;
-    }
 
+    vp9_get_token_extra(i, &t.token, &t.extra);
     // initialize the cost for extra bits for all possible coefficient value.
     {
       int cost = 0;
-      const vp9_extra_bit *p = &e[t[i].token];
+      const vp9_extra_bit *p = &e[t.token];
 
       if (p->base_val) {
-        const int extra = t[i].extra;
+        const int extra = t.extra;
         const int length = p->len;
 
         if (length)
@@ -217,26 +209,14 @@ static void tokenize_init_one(TOKENVALUE *t, const vp9_extra_bit *const e,
 }
 
 void vp9_tokenize_initialize() {
-  vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
-
-  tokenize_init_one(dct_value_tokens + DCT_MAX_VALUE, vp9_extra_bits,
+  tokenize_init_one(vp9_extra_bits,
                     dct_value_cost + DCT_MAX_VALUE, DCT_MAX_VALUE);
 #if CONFIG_VP9_HIGHBITDEPTH
-  vp9_dct_value_tokens_high10_ptr = dct_value_tokens_high10 +
-      DCT_MAX_VALUE_HIGH10;
-  vp9_dct_value_cost_high10_ptr = dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10;
-
-  tokenize_init_one(dct_value_tokens_high10 + DCT_MAX_VALUE_HIGH10,
-                    vp9_extra_bits_high10,
+  tokenize_init_one(vp9_extra_bits_high10,
                     dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10,
                     DCT_MAX_VALUE_HIGH10);
-  vp9_dct_value_tokens_high12_ptr = dct_value_tokens_high12 +
-      DCT_MAX_VALUE_HIGH12;
-  vp9_dct_value_cost_high12_ptr = dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12;
 
-  tokenize_init_one(dct_value_tokens_high12 + DCT_MAX_VALUE_HIGH12,
-                    vp9_extra_bits_high12,
+  tokenize_init_one(vp9_extra_bits_high12,
                     dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12,
                     DCT_MAX_VALUE_HIGH12);
 #endif
@@ -322,8 +302,8 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
       td->counts->eob_branch[tx_size][type][ref];
   const uint8_t *const band = get_band_translate(tx_size);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
-  const TOKENVALUE *dct_value_tokens;
-
+  int16_t token;
+  EXTRABIT extra;
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
 
@@ -333,17 +313,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   scan = so->scan;
   nb = so->neighbors;
   c = 0;
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (cpi->common.profile >= PROFILE_2) {
-    dct_value_tokens = (cpi->common.bit_depth == VPX_BITS_10 ?
-                        vp9_dct_value_tokens_high10_ptr :
-                        vp9_dct_value_tokens_high12_ptr);
-  } else {
-    dct_value_tokens = vp9_dct_value_tokens_ptr;
-  }
-#else
-  dct_value_tokens = vp9_dct_value_tokens_ptr;
-#endif
 
   while (c < eob) {
     int v = 0;
@@ -362,14 +331,13 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
       v = qcoeff[scan[c]];
     }
 
-    add_token(&t, coef_probs[band[c]][pt],
-              dct_value_tokens[v].extra,
-              (uint8_t)dct_value_tokens[v].token,
-              (uint8_t)skip_eob,
-              counts[band[c]][pt]);
+    vp9_get_token_extra(v, &token, &extra);
+
+    add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
+              (uint8_t)skip_eob, counts[band[c]][pt]);
     eob_branch[band[c]][pt] += !skip_eob;
 
-    token_cache[scan[c]] = vp9_pt_energy_class[dct_value_tokens[v].token];
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++c;
     pt = get_coef_context(nb, token_cache, c);
   }
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 00afb723e..845e139f2 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -24,24 +24,23 @@ void vp9_tokenize_initialize();
 
 #define EOSB_TOKEN 127     // Not signalled, encoder only
 
-typedef struct {
-  int16_t token;
 #if CONFIG_VP9_HIGHBITDEPTH
-  int32_t extra;
+  typedef int32_t EXTRABIT;
 #else
-  int16_t extra;
+  typedef int16_t EXTRABIT;
 #endif
+
+
+typedef struct {
+  int16_t token;
+  EXTRABIT extra;
 } TOKENVALUE;
 
 typedef struct {
   const vp9_prob *context_tree;
-#if CONFIG_VP9_HIGHBITDEPTH
-  int32_t extra;
-#else
-  int16_t         extra;
-#endif
-  uint8_t         token;
-  uint8_t         skip_eob_node;
+  EXTRABIT extra;
+  uint8_t token;
+  uint8_t skip_eob_node;
 } TOKENEXTRA;
 
 extern const vp9_tree_index vp9_coef_tree[];
@@ -63,6 +62,7 @@ extern const int16_t *vp9_dct_value_cost_ptr;
  *  fields are not.
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
 #if CONFIG_VP9_HIGHBITDEPTH
 extern const int16_t *vp9_dct_value_cost_high10_ptr;
 extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
@@ -70,6 +70,25 @@ extern const int16_t *vp9_dct_value_cost_high12_ptr;
 extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+    *token = CATEGORY6_TOKEN;
+    if (v >= CAT6_MIN_VAL)
+      *extra = 2 * v - 2 * CAT6_MIN_VAL;
+    else
+      *extra = -2 * v - 2 * CAT6_MIN_VAL + 1;
+    return;
+  }
+  *token = vp9_dct_cat_lt_10_value_tokens[v].token;
+  *extra = vp9_dct_cat_lt_10_value_tokens[v].extra;
+}
+static INLINE int16_t vp9_get_token(int v) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL)
+    return 10;
+  return vp9_dct_cat_lt_10_value_tokens[v].token;
+}
+
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index e671f3998..ae22a0b32 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -254,7 +254,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
                             const int16_t* round_ptr, const int16_t* quant_ptr,
                             const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
                             int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
-                            int zbin_oq_value, uint16_t* eob_ptr,
+                            uint16_t* eob_ptr,
                             const int16_t* scan_ptr,
                             const int16_t* iscan_ptr) {
   __m128i zero;
@@ -287,7 +287,6 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
   (void)scan_ptr;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)coeff_ptr;
 
   // Pre-condition input (shift by two)
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index 237c5e278..5c0ad7892 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -23,7 +23,7 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
                              const int16_t* quant_shift_ptr,
                              int16_t* qcoeff_ptr,
                              int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
-                             int zbin_oq_value, uint16_t* eob_ptr,
+                             uint16_t* eob_ptr,
                              const int16_t* scan_ptr,
                              const int16_t* iscan_ptr) {
   __m128i zero;
@@ -57,7 +57,6 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
   (void)scan_ptr;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)coeff_ptr;
 
   // Pre-condition input (shift by two)
diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
index 55c6ed71f..0bce9c321 100644
--- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
@@ -24,7 +24,6 @@ void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
                                 tran_low_t *qcoeff_ptr,
                                 tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr,
-                                int zbin_oq_value,
                                 uint16_t *eob_ptr,
                                 const int16_t *scan,
                                 const int16_t *iscan) {
@@ -32,11 +31,11 @@ void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
   __m128i zbins[2];
   __m128i nzbins[2];
 
-  zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value),
-                           (int)(zbin_ptr[1] + zbin_oq_value),
-                           (int)(zbin_ptr[1] + zbin_oq_value),
-                           (int)(zbin_ptr[0] + zbin_oq_value));
-  zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value));
+  zbins[0] = _mm_set_epi32((int)zbin_ptr[1],
+                           (int)zbin_ptr[1],
+                           (int)zbin_ptr[1],
+                           (int)zbin_ptr[0]);
+  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
 
   nzbins[0] = _mm_setzero_si128();
   nzbins[1] = _mm_setzero_si128();
@@ -111,7 +110,6 @@ void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
                                       tran_low_t *qcoeff_ptr,
                                       tran_low_t *dqcoeff_ptr,
                                       const int16_t *dequant_ptr,
-                                      int zbin_oq_value,
                                       uint16_t *eob_ptr,
                                       const int16_t *scan,
                                       const int16_t *iscan) {
@@ -120,14 +118,14 @@ void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
   int idx = 0;
   int idx_arr[1024];
   int i, eob = -1;
-  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
-  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
   (void)scan;
-  zbins[0] = _mm_set_epi32((zbin1_tmp + zbin_oq_value),
-                           (zbin1_tmp + zbin_oq_value),
-                           (zbin1_tmp + zbin_oq_value),
-                           (zbin0_tmp + zbin_oq_value));
-  zbins[1] = _mm_set1_epi32((zbin1_tmp + zbin_oq_value));
+  zbins[0] = _mm_set_epi32(zbin1_tmp,
+                           zbin1_tmp,
+                           zbin1_tmp,
+                           zbin0_tmp);
+  zbins[1] = _mm_set1_epi32(zbin1_tmp);
 
   nzbins[0] = _mm_setzero_si128();
   nzbins[1] = _mm_setzero_si128();
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index e06eb2f15..679c66e30 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -18,7 +18,7 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
                          const int16_t* round_ptr, const int16_t* quant_ptr,
                          const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
                          int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
-                         int zbin_oq_value, uint16_t* eob_ptr,
+                         uint16_t* eob_ptr,
                          const int16_t* scan_ptr,
                          const int16_t* iscan_ptr) {
   __m128i zero;
@@ -39,13 +39,10 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
 
       // Setup global values
       {
-        __m128i zbin_oq;
         __m128i pw_1;
-        zbin_oq = _mm_set1_epi16(zbin_oq_value);
         zbin = _mm_load_si128((const __m128i*)zbin_ptr);
         round = _mm_load_si128((const __m128i*)round_ptr);
         quant = _mm_load_si128((const __m128i*)quant_ptr);
-        zbin = _mm_add_epi16(zbin, zbin_oq);
         pw_1 = _mm_set1_epi16(1);
         zbin = _mm_sub_epi16(zbin, pw_1);
         dequant = _mm_load_si128((const __m128i*)dequant_ptr);
@@ -229,14 +226,13 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
                           const int16_t* round_ptr, const int16_t* quant_ptr,
                           const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
                           int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
-                          int zbin_oq_value, uint16_t* eob_ptr,
+                          uint16_t* eob_ptr,
                           const int16_t* scan_ptr,
                           const int16_t* iscan_ptr) {
   __m128i zero;
   (void)scan_ptr;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
 
   coeff_ptr += n_coeffs;
   iscan_ptr += n_coeffs;
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index f5f05e799..72e01d646 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -17,7 +17,7 @@ SECTION .text
 
 %macro QUANTIZE_FN 2
 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                                shift, qcoeff, dqcoeff, dequant, \
                                 eob, scan, iscan
   cmp                    dword skipm, 0
   jne .blank
@@ -29,13 +29,9 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   movifnidn                    zbinq, zbinmp
   movifnidn                   roundq, roundmp
   movifnidn                   quantq, quantmp
-  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
   mova                            m0, [zbinq]              ; m0 = zbin
-  punpcklwd                       m4, m4
   mova                            m1, [roundq]             ; m1 = round
-  pshufd                          m4, m4, 0
   mova                            m2, [quantq]             ; m2 = quant
-  paddw                           m0, m4                   ; m0 = zbin + zbin_oq
 %ifidn %1, b_32x32
   pcmpeqw                         m5, m5
   psrlw                           m5, 15
@@ -55,7 +51,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psllw                           m4, 1
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
   lea                         coeffq, [  coeffq+ncoeffq*2]
   lea                         iscanq, [  iscanq+ncoeffq*2]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
@@ -220,7 +216,7 @@ QUANTIZE_FN b_32x32, 7
 
 %macro QUANTIZE_FP 2
 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                                shift, qcoeff, dqcoeff, dequant, \
                                 eob, scan, iscan
   cmp                    dword skipm, 0
   jne .blank
@@ -248,7 +244,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psllw                           m2, 1
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
   lea                         coeffq, [  coeffq+ncoeffq*2]
   lea                         iscanq, [  iscanq+ncoeffq*2]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2504f4db9..f5e6e3190 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -133,13 +133,11 @@ ifeq ($(ARCH_X86_64), yes)
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
 endif
 
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
+
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
 
 # neon with assembly and intrinsics implementations. If both are available
 # prefer assembly.
@@ -158,8 +156,10 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
 else
 ifeq ($(HAVE_NEON), yes)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon.c
@@ -178,6 +178,7 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index b9fb8140c..7b4b17809 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -13,6 +13,7 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_codec.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "./vpx_version.h"
 #include "vp9/encoder/vp9_encoder.h"
@@ -208,7 +209,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
           "or kf_max_dist instead.");
 
   RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
-  RANGE_CHECK(extra_cfg, cpu_used, -16, 16);
+  RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
   RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
@@ -729,7 +730,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
     }
 
     priv->extra_cfg = default_extra_cfg;
-    vp9_initialize_enc();
+    once(vp9_initialize_enc);
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
 
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 809514001..43bf35f9c 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -44,6 +44,7 @@ struct vpx_codec_alg_priv {
   int                     flushed;
   int                     invert_tile_order;
   int                     frame_parallel_decode;  // frame-based threading.
+  int                     byte_alignment;
 
   // External frame buffer info to save for VP9 common.
   void *ext_priv;  // Private data associated with the external frame buffers.
@@ -219,6 +220,7 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
   VP9_COMMON *const cm = &ctx->pbi->common;
 
   cm->new_fb_idx = -1;
+  cm->byte_alignment = ctx->byte_alignment;
 
   if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
     cm->get_fb_cb = ctx->get_ext_fb_cb;
@@ -617,6 +619,27 @@ static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  const int legacy_byte_alignment = 0;
+  const int min_byte_alignment = 32;
+  const int max_byte_alignment = 1024;
+  const int byte_alignment = va_arg(args, int);
+
+  if (byte_alignment != legacy_byte_alignment &&
+      (byte_alignment < min_byte_alignment ||
+       byte_alignment > max_byte_alignment ||
+       (byte_alignment & (byte_alignment - 1)) != 0))
+    return VPX_CODEC_INVALID_PARAM;
+
+  ctx->byte_alignment = byte_alignment;
+  if (ctx->pbi != NULL) {
+    VP9_COMMON *const cm = &ctx->pbi->common;
+    cm->byte_alignment = byte_alignment;
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,            ctrl_copy_reference},
 
@@ -629,6 +652,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   {VP8_SET_DBG_DISPLAY_MV,        ctrl_set_dbg_options},
   {VP9_INVERT_TILE_DECODE_ORDER,  ctrl_set_invert_tile_order},
   {VPXD_SET_DECRYPTOR,            ctrl_set_decryptor},
+  {VP9_SET_BYTE_ALIGNMENT,        ctrl_set_byte_alignment},
 
   // Getters
   {VP8D_GET_LAST_REF_UPDATES,     ctrl_get_last_ref_updates},