32 files changed, 841 insertions, 244 deletions
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 279f678b1..4a49964d5 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -349,7 +349,7 @@ loop_h
     vdup.u8             d0, r12
 
     ; preload 8 left
-    vld1.8              d30, [r3]
+    vld1.8              {d30}, [r3]
 
     ; Load above 8 pixels
     vld1.64             {d2}, [r2]
@@ -422,10 +422,10 @@ loop_h
     vdup.u8             q0, r12
 
     ; Load above 8 pixels
-    vld1.8              q1, [r2]
+    vld1.8              {q1}, [r2]
 
     ; preload 8 left into r12
-    vld1.8              d18, [r3]!
+    vld1.8              {d18}, [r3]!
 
     ; Compute above - ytop_left
     vsubl.u8            q2, d2, d0
@@ -492,7 +492,7 @@ loop_16x16_neon
     vqshrun.s16         d23, q8, #0
     vdup.16             q0, d20[2]
     vdup.16             q8, d20[3]
-    vld1.8              d18, [r3]!                  ; preload 8 left into r12
+    vld1.8              {d18}, [r3]!                  ; preload 8 left into r12
     vmovl.u8            q10, d18
     vst1.64             {d2,d3}, [r0], r1
     vst1.64             {d22,d23}, [r0], r1
@@ -518,11 +518,11 @@ loop_16x16_neon
     vdup.u8             q0, r12
 
     ; Load above 32 pixels
-    vld1.8              q1, [r2]!
-    vld1.8              q2, [r2]
+    vld1.8              {q1}, [r2]!
+    vld1.8              {q2}, [r2]
 
     ; preload 8 left pixels
-    vld1.8              d26, [r3]!
+    vld1.8              {d26}, [r3]!
 
     ; Compute above - ytop_left
     vsubl.u8            q8, d2, d0
@@ -621,7 +621,7 @@ loop_32x32_neon
     vst1.64             {d0-d3}, [r0], r1
     vqshrun.s16         d24, q12, #0
     vqshrun.s16         d25, q13, #0
-    vld1.8              d0, [r3]!                   ; preload 8 left pixels
+    vld1.8              {d0}, [r3]!                   ; preload 8 left pixels
     vqshrun.s16         d26, q14, #0
     vqshrun.s16         d27, q15, #0
     vmovl.u8            q3, d0
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index 991d3c2b3..6ebea9f2f 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -85,8 +85,8 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
   );
 }
 
-void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                      int dest_stride);
+void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                   int dest_stride);
 
 void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
index 1b2f5506a..19c582fd1 100644
--- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -19,8 +19,8 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output,
-                                 uint32_t no_rows) {
+static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+                              uint32_t no_rows) {
   int i;
   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
   int step1_10, step1_11, step1_12, step1_13;
@@ -404,8 +404,8 @@ static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output,
   }
 }
 
-static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                         int dest_stride) {
+static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                      int dest_stride) {
   int i;
   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
   int step1_8, step1_9, step1_10, step1_11;
@@ -905,13 +905,13 @@ void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
   );
 
   // First transform rows
-  idct16_1d_rows_dspr2(input, out, 16);
+  idct16_rows_dspr2(input, out, 16);
 
   // Then transform columns and add to dest
-  idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
 }
 
-static void iadst16_1d(const int16_t *input, int16_t *output) {
+static void iadst16(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 
   int x0 = input[15];
@@ -1099,16 +1099,16 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
 
   switch (tx_type) {
     case DCT_DCT:     // DCT in both horizontal and vertical
-      idct16_1d_rows_dspr2(input, outptr, 16);
-      idct16_1d_cols_add_blk_dspr2(out, dest, pitch);
+      idct16_rows_dspr2(input, outptr, 16);
+      idct16_cols_add_blk_dspr2(out, dest, pitch);
       break;
     case ADST_DCT:    // ADST in vertical, DCT in horizontal
-      idct16_1d_rows_dspr2(input, outptr, 16);
+      idct16_rows_dspr2(input, outptr, 16);
 
       outptr = out;
 
       for (i = 0; i < 16; ++i) {
-        iadst16_1d(outptr, temp_out);
+        iadst16(outptr, temp_out);
 
         for (j = 0; j < 16; ++j)
           dest[j * pitch + i] =
@@ -1125,7 +1125,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
         /* prefetch row */
         vp9_prefetch_load((const uint8_t *)(input + 16));
 
-        iadst16_1d(input, outptr);
+        iadst16(input, outptr);
         input += 16;
         outptr += 16;
       }
@@ -1134,7 +1134,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
         for (j = 0; j < 16; ++j)
             temp_in[j * 16 + i] = out[i * 16 + j];
 
-      idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch);
+      idct16_cols_add_blk_dspr2(temp_in, dest, pitch);
     }
     break;
     case ADST_ADST:   // ADST in both directions
@@ -1145,7 +1145,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
         /* prefetch row */
         vp9_prefetch_load((const uint8_t *)(input + 16));
 
-        iadst16_1d(input, outptr);
+        iadst16(input, outptr);
         input += 16;
         outptr += 16;
       }
@@ -1153,7 +1153,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
       for (i = 0; i < 16; ++i) {
         for (j = 0; j < 16; ++j)
           temp_in[j] = out[j * 16 + i];
-        iadst16_1d(temp_in, temp_out);
+        iadst16(temp_in, temp_out);
         for (j = 0; j < 16; ++j)
           dest[j * pitch + i] =
                     clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
@@ -1183,7 +1183,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
 
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  idct16_1d_rows_dspr2(input, outptr, 4);
+  idct16_rows_dspr2(input, outptr, 4);
 
   outptr += 4;
   for (i = 0; i < 6; ++i) {
@@ -1213,7 +1213,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
   }
 
   // Then transform columns
-  idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
 }
 
 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
index 5e92db3d2..132d88ce5 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
@@ -18,8 +18,8 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                      int dest_stride) {
+void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                   int dest_stride) {
   int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
   int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
   int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
index bc6759400..74a90b02c 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -19,8 +19,8 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output,
-                                 uint32_t no_rows) {
+static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
+                              uint32_t no_rows) {
   int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
   int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
   int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
@@ -882,10 +882,10 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
   );
 
   // Rows
-  idct32_1d_rows_dspr2(input, outptr, 32);
+  idct32_rows_dspr2(input, outptr, 32);
 
   // Columns
-  vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+  vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
 }
 
 void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
@@ -903,7 +903,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
   );
 
   // Rows
-  idct32_1d_rows_dspr2(input, outptr, 8);
+  idct32_rows_dspr2(input, outptr, 8);
 
   outptr += 8;
   __asm__ __volatile__ (
@@ -947,7 +947,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
   }
 
   // Columns
-  vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride);
+  vp9_idct32_cols_add_blk_dspr2(out, dest, stride);
 }
 
 void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
index 5b7aa5e71..1990348b8 100644
--- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -19,7 +19,7 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
   int16_t   step_0, step_1, step_2, step_3;
   int       Temp0, Temp1, Temp2, Temp3;
   const int const_2_power_13 = 8192;
@@ -104,7 +104,7 @@ static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) {
   }
 }
 
-static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                                int dest_stride) {
   int16_t   step_0, step_1, step_2, step_3;
   int       Temp0, Temp1, Temp2, Temp3;
@@ -240,10 +240,10 @@ void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
   );
 
   // Rows
-  vp9_idct4_1d_rows_dspr2(input, outptr);
+  vp9_idct4_rows_dspr2(input, outptr);
 
   // Columns
-  vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+  vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
 }
 
 void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
@@ -319,7 +319,7 @@ void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
   }
 }
 
-static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) {
+static void iadst4_dspr2(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
   int x0, x1, x2, x3;
 
@@ -379,16 +379,16 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
 
   switch (tx_type) {
     case DCT_DCT:   // DCT in both horizontal and vertical
-      vp9_idct4_1d_rows_dspr2(input, outptr);
-      vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      vp9_idct4_rows_dspr2(input, outptr);
+      vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
       break;
     case ADST_DCT:  // ADST in vertical, DCT in horizontal
-      vp9_idct4_1d_rows_dspr2(input, outptr);
+      vp9_idct4_rows_dspr2(input, outptr);
 
       outptr = out;
 
       for (i = 0; i < 4; ++i) {
-        iadst4_1d_dspr2(outptr, temp_out);
+        iadst4_dspr2(outptr, temp_out);
 
         for (j = 0; j < 4; ++j)
           dest[j * dest_stride + i] =
@@ -400,7 +400,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
       break;
     case DCT_ADST:  // DCT in vertical, ADST in horizontal
       for (i = 0; i < 4; ++i) {
-        iadst4_1d_dspr2(input, outptr);
+        iadst4_dspr2(input, outptr);
         input  += 4;
         outptr += 4;
       }
@@ -410,11 +410,11 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
           temp_in[i * 4 + j] = out[j * 4 + i];
         }
       }
-      vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
       break;
     case ADST_ADST:  // ADST in both directions
       for (i = 0; i < 4; ++i) {
-        iadst4_1d_dspr2(input, outptr);
+        iadst4_dspr2(input, outptr);
         input  += 4;
         outptr += 4;
       }
@@ -422,7 +422,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
       for (i = 0; i < 4; ++i) {
         for (j = 0; j < 4; ++j)
           temp_in[j] = out[j * 4 + i];
-        iadst4_1d_dspr2(temp_in, temp_out);
+        iadst4_dspr2(temp_in, temp_out);
 
         for (j = 0; j < 4; ++j)
           dest[j * dest_stride + i] =
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
index 93a08401d..acccaea6d 100644
--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -19,8 +19,8 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output,
-                                uint32_t no_rows) {
+static void idct8_rows_dspr2(const int16_t *input, int16_t *output,
+                             uint32_t no_rows) {
   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
   const int const_2_power_13 = 8192;
   int Temp0, Temp1, Temp2, Temp3, Temp4;
@@ -200,8 +200,8 @@ static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output,
   }
 }
 
-static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                           int dest_stride) {
+static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                        int dest_stride) {
   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
   int Temp0, Temp1, Temp2, Temp3;
   int i;
@@ -462,13 +462,13 @@ void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
   );
 
   // First transform rows
-  idct8_1d_rows_dspr2(input, outptr, 8);
+  idct8_rows_dspr2(input, outptr, 8);
 
   // Then transform columns and add to dest
-  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
 }
 
-static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) {
+static void iadst8_dspr2(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
   int x0, x1, x2, x3, x4, x5, x6, x7;
 
@@ -563,14 +563,14 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
 
   switch (tx_type) {
     case DCT_DCT:     // DCT in both horizontal and vertical
-      idct8_1d_rows_dspr2(input, outptr, 8);
-      idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      idct8_rows_dspr2(input, outptr, 8);
+      idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
       break;
     case ADST_DCT:    // ADST in vertical, DCT in horizontal
-      idct8_1d_rows_dspr2(input, outptr, 8);
+      idct8_rows_dspr2(input, outptr, 8);
 
       for (i = 0; i < 8; ++i) {
-        iadst8_1d_dspr2(&out[i * 8], temp_out);
+        iadst8_dspr2(&out[i * 8], temp_out);
 
         for (j = 0; j < 8; ++j)
           dest[j * dest_stride + i] =
@@ -580,7 +580,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
       break;
     case DCT_ADST:    // DCT in vertical, ADST in horizontal
       for (i = 0; i < 8; ++i) {
-        iadst8_1d_dspr2(input, outptr);
+        iadst8_dspr2(input, outptr);
         input += 8;
         outptr += 8;
       }
@@ -590,11 +590,11 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
           temp_in[i * 8 + j] = out[j * 8 + i];
         }
       }
-      idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
       break;
     case ADST_ADST:   // ADST in both directions
       for (i = 0; i < 8; ++i) {
-        iadst8_1d_dspr2(input, outptr);
+        iadst8_dspr2(input, outptr);
         input += 8;
         outptr += 8;
       }
@@ -603,7 +603,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
         for (j = 0; j < 8; ++j)
           temp_in[j] = out[j * 8 + i];
 
-        iadst8_1d_dspr2(temp_in, temp_out);
+        iadst8_dspr2(temp_in, temp_out);
 
         for (j = 0; j < 8; ++j)
           dest[j * dest_stride + i] =
@@ -631,7 +631,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
   );
 
   // First transform rows
-  idct8_1d_rows_dspr2(input, outptr, 4);
+  idct8_rows_dspr2(input, outptr, 4);
 
   outptr += 4;
 
@@ -659,7 +659,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
 
 
   // Then transform columns and add to dest
-  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
 }
 
 void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index e033fbb99..6f771992b 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -33,9 +33,16 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) {
 void vp9_free_frame_buffers(VP9_COMMON *cm) {
   int i;
 
-  for (i = 0; i < FRAME_BUFFERS; i++)
+  for (i = 0; i < FRAME_BUFFERS; i++) {
     vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
 
+    if (cm->frame_bufs[i].ref_count > 0 &&
+        cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
+      cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
+      cm->frame_bufs[i].ref_count = 0;
+    }
+  }
+
   vp9_free_frame_buffer(&cm->post_proc_buffer);
 
   vpx_free(cm->mip);
@@ -85,7 +92,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
   int mi_size;
 
   if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-                               VP9_DEC_BORDER_IN_PIXELS) < 0)
+                               VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
     goto fail;
 
   set_mb_mi(cm, aligned_width, aligned_height);
@@ -199,6 +206,7 @@ void vp9_create_common(VP9_COMMON *cm) {
 
 void vp9_remove_common(VP9_COMMON *cm) {
   vp9_free_frame_buffers(cm);
+  vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
 }
 
 void vp9_initialize_common() {
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 70b8ffa4e..f10a3c8c7 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -182,7 +182,7 @@ struct macroblockd_plane {
   int subsampling_y;
   struct buf_2d dst;
   struct buf_2d pre[2];
-  int16_t *dequant;
+  const int16_t *dequant;
   ENTROPY_CONTEXT *above_context;
   ENTROPY_CONTEXT *left_context;
 };
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 3807ccc87..d30e0b488 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -145,7 +145,7 @@ static const InterpKernel *get_filter_base(const int16_t *filter) {
 }
 
 static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
-  return (const InterpKernel *)(intptr_t)f - base;
+  return (int)((const InterpKernel *)(intptr_t)f - base);
 }
 
 void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index e030d92ec..d6b380fd5 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -119,7 +119,7 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
 extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]);
 extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]);
 
-static const uint8_t *get_band_translate(TX_SIZE tx_size) {
+static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
   return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
                            : vp9_coefband_trans_8x8plus;
 }
@@ -146,8 +146,8 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
 
 void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
 
-static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
-                                                const ENTROPY_CONTEXT *l) {
+static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+                                      const ENTROPY_CONTEXT *l) {
   ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
 
   switch (tx_size) {
@@ -174,8 +174,8 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
   return combine_entropy_contexts(above_ec, left_ec);
 }
 
-static const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
-                                  PLANE_TYPE type, int block_idx) {
+static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+                                         PLANE_TYPE type, int block_idx) {
   const MODE_INFO *const mi = xd->mi_8x8[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 6def3c869..25cba7fbe 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -345,7 +345,7 @@ static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
 static void adapt_probs(const vp9_tree_index *tree,
                         const vp9_prob *pre_probs, const unsigned int *counts,
                         vp9_prob *probs) {
-  tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
+  vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
                    probs);
 }
 
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 60ae79fdc..e1f5ef7b4 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -192,8 +192,8 @@ static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
 
 static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
                         const unsigned int *counts, vp9_prob *probs) {
-  tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR,
-                   probs);
+  vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT,
+                       MV_MAX_UPDATE_FACTOR, probs);
 }
 
 void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c
new file mode 100644
index 000000000..d903ed695
--- /dev/null
+++ b/vp9/common/vp9_frame_buffers.c
@@ -0,0 +1,84 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_frame_buffers.h"
+#include "vpx_mem/vpx_mem.h"
+
+int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  assert(list != NULL);
+  vp9_free_internal_frame_buffers(list);
+
+  list->num_internal_frame_buffers =
+      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  list->int_fb = vpx_calloc(list->num_internal_frame_buffers,
+                            sizeof(*list->int_fb));
+  return (list->int_fb == NULL);
+}
+
+void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+    vpx_free(list->int_fb[i].data);
+    list->int_fb[i].data = NULL;
+  }
+  vpx_free(list->int_fb);
+  list->int_fb = NULL;
+}
+
+int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb) {
+  int i;
+  InternalFrameBufferList *const int_fb_list =
+      (InternalFrameBufferList *)cb_priv;
+  if (int_fb_list == NULL || fb == NULL)
+    return -1;
+
+  // Find a free frame buffer.
+  for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+    if (!int_fb_list->int_fb[i].in_use)
+      break;
+  }
+
+  if (i == int_fb_list->num_internal_frame_buffers)
+    return -1;
+
+  if (int_fb_list->int_fb[i].size < min_size) {
+    int_fb_list->int_fb[i].data =
+        (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size);
+    if (!int_fb_list->int_fb[i].data)
+      return -1;
+
+    int_fb_list->int_fb[i].size = min_size;
+  }
+
+  fb->data = int_fb_list->int_fb[i].data;
+  fb->size = int_fb_list->int_fb[i].size;
+  int_fb_list->int_fb[i].in_use = 1;
+
+  // Set the frame buffer's private data to point at the internal frame buffer.
+  fb->priv = &int_fb_list->int_fb[i];
+  return 0;
+}
+
+int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
+  InternalFrameBuffer *int_fb;
+  (void)cb_priv;
+  if (fb == NULL)
+    return -1;
+
+  int_fb = (InternalFrameBuffer *)fb->priv;
+  int_fb->in_use = 0;
+  return 0;
+}
diff --git a/vp9/common/vp9_frame_buffers.h b/vp9/common/vp9_frame_buffers.h
new file mode 100644
index 000000000..e2cfe61b6
--- /dev/null
+++ b/vp9/common/vp9_frame_buffers.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_
+#define VP9_COMMON_VP9_FRAME_BUFFERS_H_
+
+#include "vpx/vpx_frame_buffer.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct InternalFrameBuffer {
+  uint8_t *data;
+  size_t size;
+  int in_use;
+} InternalFrameBuffer;
+
+typedef struct InternalFrameBufferList {
+  int num_internal_frame_buffers;
+  InternalFrameBuffer *int_fb;
+} InternalFrameBufferList;
+
+// Initializes |list|. Returns 0 on success.
+int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Free any data allocated to the frame buffers.
+void vp9_free_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Callback used by libvpx to request an external frame buffer. |cb_priv|
+// Callback private data, which points to an InternalFrameBufferList.
+// |min_size| is the minimum size in bytes needed to decode the next frame.
+// |fb| pointer to the frame buffer.
+int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb);
+
+// Callback used by libvpx when there are no references to the frame buffer.
+// |cb_priv| is not used. |fb| pointer to the frame buffer.
+int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_FRAME_BUFFERS_H_
diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h
index 98fd1d82f..3eb7f9d61 100644
--- a/vp9/common/vp9_mv.h
+++ b/vp9/common/vp9_mv.h
@@ -34,8 +34,8 @@ typedef struct mv32 {
   int32_t col;
 } MV32;
 
-static void clamp_mv(MV *mv, int min_col, int max_col,
-                             int min_row, int max_row) {
+static INLINE void clamp_mv(MV *mv, int min_col, int max_col,
+                            int min_row, int max_row) {
   mv->col = clamp(mv->col, min_col, max_col);
   mv->row = clamp(mv->row, min_row, max_row);
 }
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index 0936abfcd..f99952f3c 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -48,7 +48,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
                            int_mv *mvlist, int_mv *nearest, int_mv *near);
 
 // TODO(jingning): this mv clamping function should be block size dependent.
-static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
   clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
                xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
                xd->mb_to_top_edge - LEFT_TOP_MARGIN,
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index 564e4195f..ac39a98fd 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -147,8 +147,12 @@ extern "C" {
     // END DATARATE CONTROL OPTIONS
     // ----------------------------------------------------------------
 
-    // Spatial scalability
-    int ss_number_layers;
+    // Spatial and temporal scalability.
+    int ss_number_layers;  // Number of spatial layers.
+    int ts_number_layers;  // Number of temporal layers.
+    // Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
+    int ts_target_bitrate[VPX_TS_MAX_LAYERS];
+    int ts_rate_decimator[VPX_TS_MAX_LAYERS];
 
     // these parameters aren't to be used in final build don't use!!!
     int play_alternate;
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index d92a25b12..97983c596 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -18,6 +18,7 @@
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_frame_buffers.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_tile_common.h"
 
@@ -94,6 +95,7 @@ typedef enum {
 
 typedef struct {
   int ref_count;
+  vpx_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
 } RefCntBuffer;
 
@@ -223,13 +225,21 @@ typedef struct VP9Common {
   int frame_parallel_decoding_mode;
 
   int log2_tile_cols, log2_tile_rows;
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+  // Handles memory for the codec.
+  InternalFrameBufferList int_frame_buffers;
 } VP9_COMMON;
 
-static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
   return &cm->frame_bufs[cm->new_fb_idx].buf;
 }
 
-static int get_free_fb(VP9_COMMON *cm) {
+static INLINE int get_free_fb(VP9_COMMON *cm) {
   int i;
   for (i = 0; i < FRAME_BUFFERS; i++)
     if (cm->frame_bufs[i].ref_count == 0)
@@ -240,7 +250,7 @@ static int get_free_fb(VP9_COMMON *cm) {
   return i;
 }
 
-static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
+static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
   const int ref_index = *idx;
 
   if (ref_index >= 0 && bufs[ref_index].ref_count > 0)
@@ -251,7 +261,7 @@ static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
   bufs[new_idx].ref_count++;
 }
 
-static int mi_cols_aligned_to_sb(int n_mis) {
+static INLINE int mi_cols_aligned_to_sb(int n_mis) {
   return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
 }
 
@@ -275,10 +285,10 @@ static INLINE void set_skip_context(
   }
 }
 
-static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
-                           int mi_row, int bh,
-                           int mi_col, int bw,
-                           int mi_rows, int mi_cols) {
+static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+                                  int mi_row, int bh,
+                                  int mi_col, int bw,
+                                  int mi_rows, int mi_cols) {
   xd->mb_to_top_edge    = -((mi_row * MI_SIZE) * 8);
   xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
   xd->mb_to_left_edge   = -((mi_col * MI_SIZE) * 8);
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 11b6d93c1..487f00cca 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -218,27 +218,25 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
     } else {  // inter/inter
       const int above_has_second = has_second_ref(above_mbmi);
       const int left_has_second = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
 
       if (above_has_second && left_has_second) {
-        pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
-                            above_mbmi->ref_frame[1] == LAST_FRAME ||
-                            left_mbmi->ref_frame[0] == LAST_FRAME ||
-                            left_mbmi->ref_frame[1] == LAST_FRAME);
+        pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME ||
+                            left0 == LAST_FRAME || left1 == LAST_FRAME);
       } else if (above_has_second || left_has_second) {
-        const MV_REFERENCE_FRAME rfs = !above_has_second ?
-            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-        const MV_REFERENCE_FRAME crf1 = above_has_second ?
-            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-        const MV_REFERENCE_FRAME crf2 = above_has_second ?
-            above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
 
         if (rfs == LAST_FRAME)
           pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
         else
           pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
       } else {
-        pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
-                       2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
+        pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
       }
     }
   } else if (has_above || has_left) {  // one edge available
@@ -291,23 +289,23 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
     } else {  // inter/inter
       const int above_has_second = has_second_ref(above_mbmi);
       const int left_has_second = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
 
       if (above_has_second && left_has_second) {
-        if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
-            above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
-          pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                              above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
-                              left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                              left_mbmi->ref_frame[1] == GOLDEN_FRAME);
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (above0 == GOLDEN_FRAME ||
+                              above1 == GOLDEN_FRAME ||
+                              left0 == GOLDEN_FRAME ||
+                              left1 == GOLDEN_FRAME);
         else
           pred_context = 2;
       } else if (above_has_second || left_has_second) {
-        const MV_REFERENCE_FRAME rfs = !above_has_second ?
-                     above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-        const MV_REFERENCE_FRAME crf1 = above_has_second ?
-                   above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-        const MV_REFERENCE_FRAME crf2 = above_has_second ?
-                     above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
 
         if (rfs == GOLDEN_FRAME)
           pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
@@ -316,17 +314,15 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
         else
           pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
       } else {
-        if (above_mbmi->ref_frame[0] == LAST_FRAME &&
-            left_mbmi->ref_frame[0] == LAST_FRAME) {
+        if (above0 == LAST_FRAME && left0 == LAST_FRAME) {
           pred_context = 3;
-        } else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
-                   left_mbmi->ref_frame[0] == LAST_FRAME) {
-          const MB_MODE_INFO *edge_mbmi =
-              above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi;
-            pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+        } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) {
+          const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? left0
+                                                                  : above0;
+          pred_context = 4 * (edge0 == GOLDEN_FRAME);
         } else {
-          pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
-                         2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
+          pred_context = 2 * (above0 == GOLDEN_FRAME) +
+                             2 * (left0 == GOLDEN_FRAME);
         }
       }
     }
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 0acee32f8..33ae5a896 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -98,8 +98,8 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
 
 int vp9_get_tx_size_context(const MACROBLOCKD *xd);
 
-static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
-                                    const struct tx_probs *tx_probs) {
+static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
+                                           const struct tx_probs *tx_probs) {
   switch (max_tx_size) {
     case TX_8X8:
       return tx_probs->p8x8[ctx];
@@ -113,13 +113,14 @@ static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
   }
 }
 
-static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd,
-                                     const struct tx_probs *tx_probs) {
+static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size,
+                                            const MACROBLOCKD *xd,
+                                            const struct tx_probs *tx_probs) {
   return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs);
 }
 
-static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
-                                   struct tx_counts *tx_counts) {
+static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
+                                          struct tx_counts *tx_counts) {
   switch (max_tx_size) {
     case TX_8X8:
       return tx_counts->p8x8[ctx];
diff --git a/vp9/common/vp9_prob.c b/vp9/common/vp9_prob.c
index 884884e0b..f9bc06ecf 100644
--- a/vp9/common/vp9_prob.c
+++ b/vp9/common/vp9_prob.c
@@ -28,3 +28,34 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
+
+
+static unsigned int tree_merge_probs_impl(unsigned int i,
+                                          const vp9_tree_index *tree,
+                                          const vp9_prob *pre_probs,
+                                          const unsigned int *counts,
+                                          unsigned int count_sat,
+                                          unsigned int max_update,
+                                          vp9_prob *probs) {
+  const int l = tree[i];
+  const unsigned int left_count = (l <= 0)
+                 ? counts[-l]
+                 : tree_merge_probs_impl(l, tree, pre_probs, counts,
+                                         count_sat, max_update, probs);
+  const int r = tree[i + 1];
+  const unsigned int right_count = (r <= 0)
+                 ? counts[-r]
+                 : tree_merge_probs_impl(r, tree, pre_probs, counts,
+                                         count_sat, max_update, probs);
+  const unsigned int ct[2] = { left_count, right_count };
+  probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
+                              count_sat, max_update);
+  return left_count + right_count;
+}
+
+void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
+                          const unsigned int *counts, unsigned int count_sat,
+                          unsigned int max_update_factor, vp9_prob *probs) {
+  tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat,
+                        max_update_factor, probs);
+}
diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h
index cc8d8ab38..f36148035 100644
--- a/vp9/common/vp9_prob.h
+++ b/vp9/common/vp9_prob.h
@@ -79,37 +79,10 @@ static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
   return weighted_prob(pre_prob, prob, factor);
 }
 
-static unsigned int tree_merge_probs_impl(unsigned int i,
-                                          const vp9_tree_index *tree,
-                                          const vp9_prob *pre_probs,
-                                          const unsigned int *counts,
-                                          unsigned int count_sat,
-                                          unsigned int max_update_factor,
-                                          vp9_prob *probs) {
-  const int l = tree[i];
-  const unsigned int left_count = (l <= 0)
-                 ? counts[-l]
-                 : tree_merge_probs_impl(l, tree, pre_probs, counts,
-                                         count_sat, max_update_factor, probs);
-  const int r = tree[i + 1];
-  const unsigned int right_count = (r <= 0)
-                 ? counts[-r]
-                 : tree_merge_probs_impl(r, tree, pre_probs, counts,
-                                         count_sat, max_update_factor, probs);
-  const unsigned int ct[2] = { left_count, right_count };
-  probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
-                              count_sat, max_update_factor);
-  return left_count + right_count;
-}
+void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
+                          const unsigned int *counts, unsigned int count_sat,
+                          unsigned int max_update_factor, vp9_prob *probs);
 
-static void tree_merge_probs(const vp9_tree_index *tree,
-                             const vp9_prob *pre_probs,
-                             const unsigned int *counts,
-                             unsigned int count_sat,
-                             unsigned int max_update_factor, vp9_prob *probs) {
-  tree_merge_probs_impl(0, tree, pre_probs, counts,
-                        count_sat, max_update_factor, probs);
-}
 
 DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
 
diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 6dbdb4216..9fef8b1ef 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -130,7 +130,8 @@ int16_t vp9_ac_quant(int qindex, int delta) {
 }
 
 
-int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) {
+int vp9_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex) {
   if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
     const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
     return seg->abs_delta == SEGMENT_ABSDATA ?
diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h
index af50e23cd..581104006 100644
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h
@@ -27,7 +27,8 @@ void vp9_init_quant_tables();
 int16_t vp9_dc_quant(int qindex, int delta);
 int16_t vp9_ac_quant(int qindex, int delta);
 
-int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex);
+int vp9_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index cc70e4cc0..7576e7b6f 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -269,21 +269,15 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
                ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
                              : mi_mv_pred_q4(mi, ref))
                : mi->mbmi.mv[ref].as_mv;
-
-    // TODO(jkoleszar): This clamping is done in the incorrect place for the
-    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
-    // MV. Note however that it performs the subsampling aware scaling so
-    // that the result is always q4.
-    // mv_precision precision is MV_PRECISION_Q4.
-    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
-                                               pd->subsampling_x,
-                                               pd->subsampling_y);
-
     MV32 scaled_mv;
-    int xs, ys, x0, y0, x0_16, y0_16, x1, y1, frame_width,
-        frame_height, subpel_x, subpel_y, buf_stride;
+    int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride,
+        subpel_x, subpel_y;
     uint8_t *ref_frame, *buf_ptr;
     const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
+    const MV mv_q4 = {
+      mv.row * (1 << (1 - pd->subsampling_y)),
+      mv.col * (1 << (1 - pd->subsampling_x))
+    };
 
     // Get reference frame pointer, width and height.
     if (plane == 0) {
@@ -327,10 +321,6 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     x0_16 += scaled_mv.col;
     y0_16 += scaled_mv.row;
 
-    // Get reference block bottom right coordinate.
-    x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
-    y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
-
     // Get reference block pointer.
     buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
     buf_stride = pre_buf->stride;
@@ -339,6 +329,9 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     // width/height is not a multiple of 8 pixels.
     if (scaled_mv.col || scaled_mv.row ||
         (frame_width & 0x7) || (frame_height & 0x7)) {
+      // Get reference block bottom right coordinate.
+      int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
+      int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
       int x_pad = 0, y_pad = 0;
 
       if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index bf738c28b..dccd60938 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -39,18 +39,18 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                enum mv_precision precision,
                                int x, int y);
 
-static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
-                                const struct scale_factors *sf) {
+static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
+                                       const struct scale_factors *sf) {
   const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset;
   const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset;
   return y * stride + x;
 }
 
-static void setup_pred_plane(struct buf_2d *dst,
-                             uint8_t *src, int stride,
-                             int mi_row, int mi_col,
-                             const struct scale_factors *scale,
-                             int subsampling_x, int subsampling_y) {
+static INLINE void setup_pred_plane(struct buf_2d *dst,
+                                    uint8_t *src, int stride,
+                                    int mi_row, int mi_col,
+                                    const struct scale_factors *scale,
+                                    int subsampling_x, int subsampling_y) {
   const int x = (MI_SIZE * mi_col) >> subsampling_x;
   const int y = (MI_SIZE * mi_row) >> subsampling_y;
   dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 04a40bd58..7bdd11eb0 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -707,14 +707,14 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
 fi
 
 # fdct functions
-prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht4x4 sse2 avx2
+prototype void vp9_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
+specialize vp9_fht4x4 sse2 avx2
 
-prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht8x8 sse2 avx2
+prototype void vp9_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
+specialize vp9_fht8x8 sse2 avx2
 
-prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht16x16 sse2 avx2
+prototype void vp9_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
+specialize vp9_fht16x16 sse2 avx2
 
 prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
 specialize vp9_fwht4x4
diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h
index 90b0d0bf9..a9dda1889 100644
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h
@@ -40,12 +40,12 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
                                        int this_w, int this_h);
 
-static int vp9_is_valid_scale(const struct scale_factors *sf) {
+static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) {
   return sf->x_scale_fp != REF_INVALID_SCALE &&
          sf->y_scale_fp != REF_INVALID_SCALE;
 }
 
-static int vp9_is_scaled(const struct scale_factors *sf) {
+static INLINE int vp9_is_scaled(const struct scale_factors *sf) {
   return sf->x_scale_fp != REF_NO_SCALE ||
          sf->y_scale_fp != REF_NO_SCALE;
 }
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index ee9a4823b..7455abce3 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -11,13 +11,17 @@
 #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 
-#ifdef __cplusplus
-extern "C" {
+#ifdef _MSC_VER
+# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86))
+#  include <intrin.h>
+#  define USE_MSC_INTRIN
+# endif
+# include <math.h>
+# define snprintf _snprintf
 #endif
 
-#ifdef _MSC_VER
-#include <math.h>
-#define snprintf _snprintf
+#ifdef __cplusplus
+extern "C" {
 #endif
 
 #include "./vpx_config.h"
@@ -30,7 +34,7 @@ void vpx_reset_mmx_state(void);
 
 #if defined(_MSC_VER) && _MSC_VER < 1800
 // round is not defined in MSVC before VS2013.
-static int round(double x) {
+static INLINE int round(double x) {
   if (x < 0)
     return (int)ceil(x - 0.5);
   else
@@ -44,9 +48,7 @@ static int round(double x) {
 static INLINE int get_msb(unsigned int n) {
   return 31 ^ __builtin_clz(n);
 }
-#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
-      (defined(_M_X64) || defined(_M_IX86))
-#include <intrin.h>
+#elif defined(USE_MSC_INTRIN)
 #pragma intrinsic(_BitScanReverse)
 
 static INLINE int get_msb(unsigned int n) {
@@ -54,6 +56,7 @@ static INLINE int get_msb(unsigned int n) {
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
+#undef USE_MSC_INTRIN
 #else
 // Returns (int)floor(log2(n)). n must be > 0.
 static INLINE int get_msb(unsigned int n) {
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 8ab5fb1bc..60018ea86 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -16,15 +16,15 @@
 
 typedef void filter8_1dfunction (
   const unsigned char *src_ptr,
-  const unsigned int src_pitch,
+  const ptrdiff_t src_pitch,
   unsigned char *output_ptr,
-  unsigned int out_pitch,
+  ptrdiff_t out_pitch,
   unsigned int output_height,
   const short *filter
 );
 
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt1, opt2) \
-void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                    uint8_t *dst, ptrdiff_t dst_stride, \
                                    const int16_t *filter_x, int x_step_q4, \
                                    const int16_t *filter_y, int y_step_q4, \
@@ -32,50 +32,68 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \
   if (step_q4 == 16 && filter[3] != 128) { \
     if (filter[0] || filter[1] || filter[2]) { \
       while (w >= 16) { \
-        vp9_filter_block1d16_##dir##8_##avg##opt1(src_start, src_stride, \
-                                                  dst, dst_stride, \
-                                                  h, filter); \
+        vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                                 src_stride, \
+                                                 dst, \
+                                                 dst_stride, \
+                                                 h, \
+                                                 filter); \
         src += 16; \
         dst += 16; \
         w -= 16; \
       } \
       while (w >= 8) { \
-        vp9_filter_block1d8_##dir##8_##avg##opt1(src_start, src_stride, \
-                                                 dst, dst_stride, \
-                                                 h, filter); \
+        vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
         src += 8; \
         dst += 8; \
         w -= 8; \
       } \
       while (w >= 4) { \
-        vp9_filter_block1d4_##dir##8_##avg##opt1(src_start, src_stride, \
-                                                 dst, dst_stride, \
-                                                 h, filter); \
+        vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
         src += 4; \
         dst += 4; \
         w -= 4; \
       } \
     } else { \
       while (w >= 16) { \
-        vp9_filter_block1d16_##dir##2_##avg##opt2(src, src_stride, \
-                                                  dst, dst_stride, \
-                                                  h, filter); \
+        vp9_filter_block1d16_##dir##2_##avg##opt(src, \
+                                                 src_stride, \
+                                                 dst, \
+                                                 dst_stride, \
+                                                 h, \
+                                                 filter); \
         src += 16; \
         dst += 16; \
         w -= 16; \
       } \
       while (w >= 8) { \
-        vp9_filter_block1d8_##dir##2_##avg##opt2(src, src_stride, \
-                                                 dst, dst_stride, \
-                                                 h, filter); \
+        vp9_filter_block1d8_##dir##2_##avg##opt(src, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
         src += 8; \
         dst += 8; \
         w -= 8; \
       } \
       while (w >= 4) { \
-        vp9_filter_block1d4_##dir##2_##avg##opt2(src, src_stride, \
-                                                 dst, dst_stride, \
-                                                 h, filter); \
+        vp9_filter_block1d4_##dir##2_##avg##opt(src, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
         src += 4; \
         dst += 4; \
         w -= 4; \
@@ -136,18 +154,18 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
 
-filter8_1dfunction vp9_filter_block1d16_v2_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_sse2;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
 
 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -169,11 +187,11 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
 //                                   const int16_t *filter_x, int x_step_q4,
 //                                   const int16_t *filter_y, int y_step_q4,
 //                                   int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3, sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3, sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3, sse2);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-            ssse3, sse2);
+            ssse3);
 
 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
@@ -236,11 +254,10 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
 //                                  const int16_t *filter_x, int x_step_q4,
 //                                  const int16_t *filter_y, int y_step_q4,
 //                                  int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2, sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2, sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2,
-            sse2);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
 
 // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                         uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
new file mode 100644
index 000000000..b5e18fe6d
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,422 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm3, [rdx]                 ;load filters
+    psrldq      xmm3, 6
+    packsswb    xmm3, xmm3
+    pshuflw     xmm3, xmm3, 0b              ;k3_k4
+
+    movq        xmm2, rcx                   ;rounding
+    pshufd      xmm2, xmm2, 0
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+    punpcklbw   xmm0, xmm1
+    pmaddubsw   xmm0, xmm3
+
+    paddsw      xmm0, xmm2                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movd        [rdi], xmm0
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+
+%macro GET_PARAM 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    psrldq      xmm7, 6
+    packsswb    xmm7, xmm7
+    pshuflw     xmm7, xmm7, 0b              ;k3_k4
+    punpcklwd   xmm7, xmm7
+
+    movq        xmm6, rcx                   ;rounding
+    pshufd      xmm6, xmm6, 0
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+    punpcklbw   xmm0, xmm1
+    pmaddubsw   xmm0, xmm7
+
+    paddsw      xmm0, xmm6                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+    punpcklbw   xmm0, xmm1
+    punpckhbw   xmm2, xmm1
+    pmaddubsw   xmm0, xmm7
+    pmaddubsw   xmm2, xmm7
+
+    paddsw      xmm0, xmm6                  ;rounding
+    paddsw      xmm2, xmm6
+    psraw       xmm0, 7                     ;shift
+    psraw       xmm2, 7
+    packuswb    xmm0, xmm2                  ;pack back to byte
+
+%if %1
+    movdqu      xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movdqu      [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+
+global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]
+
+    APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm1, [rsi + rax]         ;1
+    movdqa        xmm2, xmm0
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm1, [rsi + rax]         ;1
+    movdqa        xmm2, xmm0
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]
+    movdqa      xmm2, xmm0
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]
+    movdqa      xmm2, xmm0
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret