16 files changed, 409 insertions, 75 deletions
diff --git a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
new file mode 100644
index 000000000..963ef35da
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
@@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_iht4x4_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
+    ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
+    ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
+    ; into d16-d19 registers. This macro will touch q10- q15 registers and use
+    ; them as buffer during calculation.
+    MACRO
+    IDCT4x4_1D
+    ; stage 1
+    vadd.s16    d23, d16, d18   ; (input[0] + input[2])
+    vsub.s16    d24, d16, d18   ; (input[0] - input[2])
+
+    vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64
+    vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64
+    vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64
+    vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q10, #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16    q8,  q13, q14
+    vsub.s16    q9,  q13, q14
+    vswp        d18, d19
+    MEND
+
+    ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
+    ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
+    ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
+    ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
+    ; q14,q15 registers and use them as buffer during calculation.
+    MACRO
+    IADST4x4_1D
+    vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0
+    vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0
+    vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1
+    vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2
+    vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2
+    vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit
+    vaddw.s16   q15, q15, d19   ; x0 + x3
+    vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3
+    vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2
+    vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3
+
+    vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5
+    vadd.s32    q10, q10, q8
+    vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6
+    vdup.32     q8, r0          ; duplicate sinpi_3_9
+    vsub.s32    q11, q11, q9
+    vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7
+
+    vadd.s32    q13, q10, q12   ; s0 = x0 + x3
+    vadd.s32    q10, q10, q11   ; x0 + x1
+    vadd.s32    q14, q11, q12   ; s1 = x1 + x3
+    vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d16, q13, #14
+    vqrshrn.s32 d17, q14, #14
+    vqrshrn.s32 d18, q15, #14
+    vqrshrn.s32 d19, q10, #14
+    MEND
+
+    ; Generate cosine constants in d6 - d8 for the IDCT
+    MACRO
+    GENERATE_COSINE_CONSTANTS
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov         r0, #0x3b00
+    add         r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov         r3, #0x2d00
+    add         r3, #0x41
+    ; cospi_24_64 = 6270 = 0x187e
+    mov         r12, #0x1800
+    add         r12, #0x7e
+
+    ; generate constant vectors
+    vdup.16     d0, r0          ; duplicate cospi_8_64
+    vdup.16     d1, r3          ; duplicate cospi_16_64
+    vdup.16     d2, r12         ; duplicate cospi_24_64
+    MEND
+
+    ; Generate sine constants in d1 - d4 for the IADST.
+    MACRO
+    GENERATE_SINE_CONSTANTS
+    ; sinpi_1_9 = 5283 = 0x14A3
+    mov         r0, #0x1400
+    add         r0, #0xa3
+    ; sinpi_2_9 = 9929 = 0x26C9
+    mov         r3, #0x2600
+    add         r3, #0xc9
+    ; sinpi_4_9 = 15212 = 0x3B6C
+    mov         r12, #0x3b00
+    add         r12, #0x6c
+
+    ; generate constant vectors
+    vdup.16     d3, r0          ; duplicate sinpi_1_9
+
+    ; sinpi_3_9 = 13377 = 0x3441
+    mov         r0, #0x3400
+    add         r0, #0x41
+
+    vdup.16     d4, r3          ; duplicate sinpi_2_9
+    vdup.16     d5, r12         ; duplicate sinpi_4_9
+    vdup.16     q3, r0          ; duplicate sinpi_3_9
+    MEND
+
+    ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
+    MACRO
+    TRANSPOSE4X4
+    vtrn.16     d16, d17
+    vtrn.16     d18, d19
+    vtrn.32     q8, q9
+    MEND
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
+;                               int dest_stride, int tx_type)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride
+; r3  int tx_type)
+; This function will only handle tx_type of 1,2,3.
+|vp9_short_iht4x4_add_neon| PROC
+
+    ; load the inputs into d16-d19
+    vld1.s16    {q8,q9}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE4X4
+
+    ; decide the type of transform
+    cmp         r3, #2
+    beq         idct_iadst
+    cmp         r3, #3
+    beq         iadst_iadst
+
+iadst_idct
+    ; generate constants
+    GENERATE_COSINE_CONSTANTS
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IDCT4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IADST4x4_1D
+
+    b end_vp9_short_iht4x4_add_neon
+
+idct_iadst
+    ; generate constants
+    GENERATE_COSINE_CONSTANTS
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IADST4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IDCT4x4_1D
+
+    b end_vp9_short_iht4x4_add_neon
+
+iadst_iadst
+    ; generate constants
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IADST4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IADST4x4_1D
+
+end_vp9_short_iht4x4_add_neon
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16   q8, q8, #4
+    vrshr.s16   q9, q9, #4
+
+    vld1.32     {d26[0]}, [r1], r2
+    vld1.32     {d26[1]}, [r1], r2
+    vld1.32     {d27[0]}, [r1], r2
+    vld1.32     {d27[1]}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8    q8, q8, d26
+    vaddw.u8    q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb         r2, r2, #0
+    vst1.32     {d27[1]}, [r1], r2
+    vst1.32     {d27[0]}, [r1], r2
+    vst1.32     {d26[1]}, [r1], r2
+    vst1.32     {d26[0]}, [r1]  ; no post-increment
+    bx          lr
+    ENDP  ; |vp9_short_iht4x4_add_neon|
+
+    END
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 1e6cd4404..be092f41c 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -195,7 +195,7 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
    * h == 64, taps == 8.
    */
   uint8_t temp[64 * 135];
-  int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
+  int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps;
 
   assert(w <= 64);
   assert(h <= 64);
@@ -203,9 +203,6 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  if (intermediate_height < h)
-    intermediate_height = h;
-
   convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64,
                    filter_x, x_step_q4, filter_y, y_step_q4, w,
                    intermediate_height, taps);
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 6cfc34697..c6eefda92 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -79,20 +79,59 @@ static const nmv_context default_nmv_context = {
 
 #define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
 
+static const uint8_t log_in_base_2[] = {
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
+};
+
 MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
   MV_CLASS_TYPE c = MV_CLASS_0;
-  if      (z < CLASS0_SIZE * 8)    c = MV_CLASS_0;
-  else if (z < CLASS0_SIZE * 16)   c = MV_CLASS_1;
-  else if (z < CLASS0_SIZE * 32)   c = MV_CLASS_2;
-  else if (z < CLASS0_SIZE * 64)   c = MV_CLASS_3;
-  else if (z < CLASS0_SIZE * 128)  c = MV_CLASS_4;
-  else if (z < CLASS0_SIZE * 256)  c = MV_CLASS_5;
-  else if (z < CLASS0_SIZE * 512)  c = MV_CLASS_6;
-  else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
-  else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8;
-  else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9;
-  else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10;
-  else assert(0);
+  if (z >= CLASS0_SIZE * 4096)
+    c = MV_CLASS_10;
+  else
+    c = log_in_base_2[z >> 3];
+
   if (offset)
     *offset = z - mv_class_base(c);
   return c;
@@ -123,29 +162,18 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts,
   d = (o >> 3);               /* int mv data */
   f = (o >> 1) & 3;           /* fractional pel mv data */
   e = (o & 1);                /* high precision mv data */
+
   if (c == MV_CLASS_0) {
     comp_counts->class0[d] += incr;
+    comp_counts->class0_fp[d][f] += incr;
+    comp_counts->class0_hp[e] += usehp * incr;
   } else {
     int i;
     int b = c + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < b; ++i)
       comp_counts->bits[i][((d >> i) & 1)] += incr;
-  }
-
-  /* Code the fractional pel bits */
-  if (c == MV_CLASS_0) {
-    comp_counts->class0_fp[d][f] += incr;
-  } else {
     comp_counts->fp[f] += incr;
-  }
-
-  /* Code the high precision bit */
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      comp_counts->class0_hp[e] += incr;
-    } else {
-      comp_counts->hp[e] += incr;
-    }
+    comp_counts->hp[e] += usehp * incr;
   }
 }
 
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index cfa61c20b..df806ac56 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -777,6 +777,7 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
     }
   }
 }
+#if CONFIG_NON420
 static void filter_block_plane_non420(VP9_COMMON *cm,
                                       struct macroblockd_plane *plane,
                                       const MODE_INFO *mi,
@@ -896,6 +897,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
     dst->buf += 8 * dst->stride;
   }
 }
+#endif
 
 static void filter_block_plane(VP9_COMMON *const cm,
                                struct macroblockd_plane *const plane,
@@ -981,8 +983,10 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
   LOOP_FILTER_MASK lfm;
+#if CONFIG_NON420
   int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
       xd->plane[1].subsampling_x == 1);
+#endif
 
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
@@ -993,16 +997,22 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
       setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
+#if CONFIG_NON420
       if (use_420)
+#endif
         setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mode_info_stride, &lfm);
 
       for (plane = 0; plane < num_planes; ++plane) {
+#if CONFIG_NON420
         if (use_420)
+#endif
           filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col,
                              &lfm);
+#if CONFIG_NON420
         else
           filter_block_plane_non420(cm, &xd->plane[plane], mi + mi_col,
                                     mi_row, mi_col);
+#endif
       }
     }
   }
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index a669cc5e7..f0bc063f2 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -20,7 +20,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
 
@@ -201,7 +201,7 @@ typedef struct VP9Common {
   unsigned int current_video_frame;
   int version;
 
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
   struct postproc_state  postproc_state;
 #endif
 
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 48d3d2d98..955e6766a 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -1011,7 +1011,8 @@ int vp9_post_proc_frame(struct VP9Common *cm,
   /* handle problem with extending borders */
   dest->y_width = cm->width;
   dest->y_height = cm->height;
-  dest->uv_height = dest->y_height / 2;
+  dest->uv_width = dest->y_width >> cm->subsampling_x;
+  dest->uv_height = dest->y_height >> cm->subsampling_y;
 
   return 0;
 }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 104db6aeb..c2777aa51 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -237,7 +237,7 @@ specialize vp9_loop_filter_horizontal_edge mmx neon
 #
 # post proc
 #
-if [ "$CONFIG_POSTPROC" = "yes" ]; then
+if [ "$CONFIG_VP9_POSTPROC" = "yes" ]; then
 prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit"
 specialize vp9_mbpost_proc_down mmx sse2
 vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm
@@ -325,7 +325,7 @@ prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
 
 prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht4x4_add sse2
+specialize vp9_short_iht4x4_add sse2 neon
 
 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
 specialize vp9_short_iht8x8_add sse2
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 41e406d95..6cb7c094b 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -77,14 +77,11 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
         vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
 }
 
-static void init_dequantizer(VP9_COMMON *cm, MACROBLOCKD *xd) {
+static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
   int i;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  xd->q_index = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-
-  xd->plane[0].dequant = cm->y_dequant[xd->q_index];
+  xd->plane[0].dequant = cm->y_dequant[q_index];
   for (i = 1; i < MAX_MB_PLANE; i++)
-    xd->plane[i].dequant = cm->uv_dequant[xd->q_index];
+    xd->plane[i].dequant = cm->uv_dequant[q_index];
 }
 
 static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -149,14 +146,17 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
 }
 
 static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-  if (xd->mode_info_context->mbmi.skip_coeff) {
-      reset_skip_context(xd, bsize);
+  if (mbmi->skip_coeff) {
+    reset_skip_context(xd, bsize);
     return -1;
   } else {
-    if (pbi->common.seg.enabled)
-      init_dequantizer(&pbi->common, xd);
+    if (cm->seg.enabled)
+      setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
+                                                  cm->base_qindex));
 
     // TODO(dkovalev) if (!vp9_reader_has_error(r))
     return vp9_decode_tokens(pbi, r, bsize);
@@ -173,6 +173,7 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize,
 
   xd->mode_info_context = cm->mi + offset;
   xd->mode_info_context->mbmi.sb_type = bsize;
+  xd->mode_info_stride = cm->mode_info_stride;
   // Special case: if prev_mi is NULL, the previous mode info context
   // cannot be used.
   xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + offset : NULL;
@@ -453,8 +454,7 @@ static void setup_loopfilter(struct loopfilter *lf,
 
 static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
   const int old = *delta_q;
-  if (vp9_rb_read_bit(rb))
-    *delta_q = vp9_rb_read_signed_literal(rb, 4);
+  *delta_q = vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
   return old != *delta_q;
 }
 
@@ -958,11 +958,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt header length");
 
-  xd->mode_info_context = cm->mi;
-  xd->prev_mode_info_context = cm->prev_mi;
-  xd->mode_info_stride = cm->mode_info_stride;
-
-  init_dequantizer(cm, &pbi->mb);
+  setup_plane_dequants(cm, &pbi->mb, cm->base_qindex);
 
   cm->fc = cm->frame_contexts[cm->frame_context_idx];
 
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index b609f9f31..505e9dc5d 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -13,7 +13,7 @@
 #include <stdio.h>
 
 #include "vp9/common/vp9_onyxc_int.h"
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vp9/decoder/vp9_onyxd.h"
@@ -421,7 +421,7 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
   *time_stamp = pbi->last_time_stamp;
   *time_end_stamp = 0;
 
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
   ret = vp9_post_proc_frame(&pbi->common, sd, flags);
 #else
 
@@ -429,7 +429,9 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
     *sd = *pbi->common.frame_to_show;
     sd->y_width = pbi->common.width;
     sd->y_height = pbi->common.height;
-    sd->uv_height = pbi->common.height / 2;
+    sd->uv_width = sd->y_width >> pbi->common.subsampling_x;
+    sd->uv_height = sd->y_height >> pbi->common.subsampling_y;
+
     ret = 0;
   } else {
     ret = -1;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index eb83903ca..45758e7cb 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -851,13 +851,75 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
   }
 }
 
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE bsize) {
+// Check to see if the given partition size is allowed for a specified number
+// of 8x8 block rows and columns remaining in the image.
+// If not then return the largest allowed partition size
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
+                                      int rows_left, int cols_left,
+                                      int *bh, int *bw) {
+  if ((rows_left <= 0) || (cols_left <= 0)) {
+    return MIN(bsize, BLOCK_8X8);
+  } else {
+    for (; bsize > 0; --bsize) {
+      *bh = num_8x8_blocks_high_lookup[bsize];
+      *bw = num_8x8_blocks_wide_lookup[bsize];
+      if ((*bh <= rows_left) && (*bw <= cols_left)) {
+        break;
+      }
+    }
+  }
+  return bsize;
+}
+
+// This function attempts to set all mode info entries in a given SB64
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
+                             int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
+  BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
   const int mis = cm->mode_info_stride;
+  int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row;
+  int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col;
   int block_row, block_col;
-  for (block_row = 0; block_row < 8; ++block_row) {
-    for (block_col = 0; block_col < 8; ++block_col) {
-      m[block_row * mis + block_col].mbmi.sb_type = bsize;
+
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  // Apply the requested partition size to the SB64 if it is all "in image"
+  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+      (row8x8_remaining >= MI_BLOCK_SIZE)) {
+    for (block_row = 0; block_row < MI_BLOCK_SIZE; ++block_row) {
+      for (block_col = 0; block_col < MI_BLOCK_SIZE; ++block_col) {
+        m[block_row * mis + block_col].mbmi.sb_type = bsize;
+      }
+    }
+  } else {
+    // Else this is a partial SB64.
+    int bh = num_8x8_blocks_high_lookup[bsize];
+    int bw = num_8x8_blocks_wide_lookup[bsize];
+    int sub_block_row;
+    int sub_block_col;
+    int row_index;
+    int col_index;
+
+    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
+      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
+        // Find a partition size that fits
+        bsize = find_partition_size(cpi->sf.always_this_block_size,
+                                    (row8x8_remaining - block_row),
+                                    (col8x8_remaining - block_col), &bh, &bw);
+
+        // Set the mi entries for all 8x8 blocks within the selected size
+        for (sub_block_row = 0; sub_block_row < bh; ++sub_block_row) {
+          for (sub_block_col = 0; sub_block_col < bw; ++sub_block_col) {
+            row_index = block_row + sub_block_row;
+            col_index = block_col + sub_block_col;
+            m[row_index * mis + col_index].mbmi.sb_type = bsize;
+          }
+        }
+      }
     }
   }
 }
@@ -1946,7 +2008,7 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
       cpi->mb.source_variance = UINT_MAX;
       if (cpi->sf.use_one_partition_size_always) {
         set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
-        set_partitioning(cpi, m, cpi->sf.always_this_block_size);
+        set_partitioning(cpi, m, mi_row, mi_col);
         rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1);
       } else if (cpi->sf.partition_by_variance) {
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e9c214ff8..d35b739fb 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -28,7 +28,7 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
@@ -2895,7 +2895,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     set_mvcost(&cpi->mb);
   }
 
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
 
   if (cpi->oxcf.noise_sensitivity > 0) {
     int l = 0;
@@ -3954,7 +3954,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
         {
           double frame_psnr2, frame_ssim2 = 0;
           double weight = 0;
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
           vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
                       cm->lf.filter_level * 10 / 6);
 #endif
@@ -4030,7 +4030,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
     return -1;
   else {
     int ret;
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
     ret = vp9_post_proc_frame(&cpi->common, dest, flags);
 #else
 
@@ -4044,7 +4044,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
       ret = -1;
     }
 
-#endif // !CONFIG_POSTPROC
+#endif  // !CONFIG_VP9_POSTPROC
     vp9_clear_system_state();
     return ret;
   }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 39b6544f6..647265bf6 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2916,8 +2916,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                 (sse_v - var_v < thresh_dc || sse_v == var_v)) {
               x->skip = 1;
 
-              *rate2 = 500;
-              *rate_uv = 0;
+              // The cost of skip bit needs to be added.
+              *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
 
               // Scaling factor for SSE from spatial domain to frequency domain
               // is 16. Adjust distortion accordingly.
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index fb302abec..c6daecca0 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -73,11 +73,11 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
-VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
-VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-ifeq ($(CONFIG_POSTPROC),yes)
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 endif
@@ -102,6 +102,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(AS
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 0874afdbc..e9549228e 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -888,7 +888,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
 static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
   vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
   (void)ctr_id;
 
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index e7362fca0..10b32385c 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -20,7 +20,7 @@
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/vp9_iface_common.h"
 
-#define VP9_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
 typedef vpx_codec_stream_info_t  vp9_stream_info_t;
 
 /* Structures for handling memory allocations */
@@ -596,7 +596,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
 static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
                                     int ctr_id,
                                     va_list args) {
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
   vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
 
   if (data) {
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 89de6014e..9fbf100f5 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -64,7 +64,7 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
 VP9_CX_SRCS-yes += encoder/vp9_variance_c.c
-ifeq ($(CONFIG_POSTPROC),yes)
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
 endif