46 files changed, 897 insertions, 684 deletions
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 71bf24c9f..279f678b1 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -18,6 +18,8 @@
     EXPORT  |vp9_h_predictor_32x32_neon|
     EXPORT  |vp9_tm_predictor_4x4_neon|
     EXPORT  |vp9_tm_predictor_8x8_neon|
+    EXPORT  |vp9_tm_predictor_16x16_neon|
+    EXPORT  |vp9_tm_predictor_32x32_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -346,61 +348,289 @@ loop_h
     ldrb                r12, [r12]
     vdup.u8             d0, r12
 
+    ; preload 8 left
+    vld1.8              d30, [r3]
+
     ; Load above 8 pixels
     vld1.64             {d2}, [r2]
 
+    vmovl.u8            q10, d30
+
     ; Compute above - ytop_left
     vsubl.u8            q3, d2, d0
 
     ; Load left row by row and compute left + (above - ytop_left)
     ; 1st row and 2nd row
-    ldrb                r12, [r3], #1
-    ldrb                r2, [r3], #1
-    vdup.u16            q1, r12
-    vdup.u16            q2, r2
-    vadd.s16            q1, q1, q3
-    vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
-    vst1.64             {d0}, [r0], r1
-    vst1.64             {d1}, [r0], r1
+    vdup.16             q0, d20[0]
+    vdup.16             q1, d20[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
 
     ; 3rd row and 4th row
-    ldrb                r12, [r3], #1
-    ldrb                r2, [r3], #1
-    vdup.u16            q1, r12
-    vdup.u16            q2, r2
-    vadd.s16            q1, q1, q3
-    vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vdup.16             q8, d20[2]
+    vdup.16             q9, d20[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
+
+    vqshrun.s16         d0, q0, #0
+    vqshrun.s16         d1, q1, #0
+    vqshrun.s16         d2, q8, #0
+    vqshrun.s16         d3, q9, #0
+
     vst1.64             {d0}, [r0], r1
     vst1.64             {d1}, [r0], r1
+    vst1.64             {d2}, [r0], r1
+    vst1.64             {d3}, [r0], r1
 
     ; 5th row and 6th row
-    ldrb                r12, [r3], #1
-    ldrb                r2, [r3], #1
-    vdup.u16            q1, r12
-    vdup.u16            q2, r2
-    vadd.s16            q1, q1, q3
-    vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
-    vst1.64             {d0}, [r0], r1
-    vst1.64             {d1}, [r0], r1
+    vdup.16             q0, d21[0]
+    vdup.16             q1, d21[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
+
+    ; 7th row and 8th row
+    vdup.16             q8, d21[2]
+    vdup.16             q9, d21[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
+
+    vqshrun.s16         d0, q0, #0
+    vqshrun.s16         d1, q1, #0
+    vqshrun.s16         d2, q8, #0
+    vqshrun.s16         d3, q9, #0
 
-    ; 7rd row and 8th row
-    ldrb                r12, [r3], #1
-    ldrb                r2, [r3], #1
-    vdup.u16            q1, r12
-    vdup.u16            q2, r2
-    vadd.s16            q1, q1, q3
-    vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
     vst1.64             {d0}, [r0], r1
     vst1.64             {d1}, [r0], r1
+    vst1.64             {d2}, [r0], r1
+    vst1.64             {d3}, [r0], r1
+
     bx                  lr
     ENDP                ; |vp9_tm_predictor_8x8_neon|
 
+;void vp9_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vp9_tm_predictor_16x16_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    ldrb                r12, [r12]
+    vdup.u8             q0, r12
+
+    ; Load above 8 pixels
+    vld1.8              q1, [r2]
+
+    ; preload 8 left into r12
+    vld1.8              d18, [r3]!
+
+    ; Compute above - ytop_left
+    vsubl.u8            q2, d2, d0
+    vsubl.u8            q3, d3, d1
+
+    vmovl.u8            q10, d18
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; Process 8 rows in each single loop and loop 2 times to process 16 rows.
+    mov                 r2, #2
+
+loop_16x16_neon
+    ; Process two rows.
+    vdup.16             q0, d20[0]
+    vdup.16             q8, d20[1]
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqshrun.s16         d2, q1, #0
+    vqshrun.s16         d3, q0, #0
+    vqshrun.s16         d22, q11, #0
+    vqshrun.s16         d23, q8, #0
+    vdup.16             q0, d20[2]                  ; proload next 2 rows data
+    vdup.16             q8, d20[3]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqshrun.s16         d2, q1, #0
+    vqshrun.s16         d3, q0, #0
+    vqshrun.s16         d22, q11, #0
+    vqshrun.s16         d23, q8, #0
+    vdup.16             q0, d21[0]                  ; proload next 2 rows data
+    vdup.16             q8, d21[1]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqshrun.s16         d2, q1, #0
+    vqshrun.s16         d3, q0, #0
+    vqshrun.s16         d22, q11, #0
+    vqshrun.s16         d23, q8, #0
+    vdup.16             q0, d21[2]                  ; proload next 2 rows data
+    vdup.16             q8, d21[3]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqshrun.s16         d2, q1, #0
+    vqshrun.s16         d3, q0, #0
+    vqshrun.s16         d22, q11, #0
+    vqshrun.s16         d23, q8, #0
+    vdup.16             q0, d20[2]
+    vdup.16             q8, d20[3]
+    vld1.8              d18, [r3]!                  ; preload 8 left into r12
+    vmovl.u8            q10, d18
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    subs                r2, r2, #1
+    bgt                 loop_16x16_neon
+
+    bx                  lr
+    ENDP                ; |vp9_tm_predictor_16x16_neon|
+
+;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                  const uint8_t *above,
+;                                  const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vp9_tm_predictor_32x32_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    ldrb                r12, [r12]
+    vdup.u8             q0, r12
+
+    ; Load above 32 pixels
+    vld1.8              q1, [r2]!
+    vld1.8              q2, [r2]
+
+    ; preload 8 left pixels
+    vld1.8              d26, [r3]!
+
+    ; Compute above - ytop_left
+    vsubl.u8            q8, d2, d0
+    vsubl.u8            q9, d3, d1
+    vsubl.u8            q10, d4, d0
+    vsubl.u8            q11, d5, d1
+
+    vmovl.u8            q3, d26
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
+    mov                 r2, #4
+
+loop_32x32_neon
+    ; Process two rows.
+    vdup.16             q0, d6[0]
+    vdup.16             q2, d6[1]
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqshrun.s16         d0, q12, #0
+    vqshrun.s16         d1, q13, #0
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqshrun.s16         d2, q14, #0
+    vqshrun.s16         d3, q15, #0
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqshrun.s16         d24, q12, #0
+    vqshrun.s16         d25, q13, #0
+    vqshrun.s16         d26, q14, #0
+    vqshrun.s16         d27, q15, #0
+    vdup.16             q1, d6[2]
+    vdup.16             q2, d6[3]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q1, q8
+    vadd.s16            q13, q1, q9
+    vadd.s16            q14, q1, q10
+    vadd.s16            q15, q1, q11
+    vqshrun.s16         d0, q12, #0
+    vqshrun.s16         d1, q13, #0
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqshrun.s16         d2, q14, #0
+    vqshrun.s16         d3, q15, #0
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqshrun.s16         d24, q12, #0
+    vqshrun.s16         d25, q13, #0
+    vqshrun.s16         d26, q14, #0
+    vqshrun.s16         d27, q15, #0
+    vdup.16             q0, d7[0]
+    vdup.16             q2, d7[1]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqshrun.s16         d0, q12, #0
+    vqshrun.s16         d1, q13, #0
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqshrun.s16         d2, q14, #0
+    vqshrun.s16         d3, q15, #0
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqshrun.s16         d24, q12, #0
+    vqshrun.s16         d25, q13, #0
+    vqshrun.s16         d26, q14, #0
+    vqshrun.s16         d27, q15, #0
+    vdup.16             q0, d7[2]
+    vdup.16             q2, d7[3]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqshrun.s16         d0, q12, #0
+    vqshrun.s16         d1, q13, #0
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqshrun.s16         d2, q14, #0
+    vqshrun.s16         d3, q15, #0
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqshrun.s16         d24, q12, #0
+    vqshrun.s16         d25, q13, #0
+    vld1.8              d0, [r3]!                   ; preload 8 left pixels
+    vqshrun.s16         d26, q14, #0
+    vqshrun.s16         d27, q15, #0
+    vmovl.u8            q3, d0
+    vst1.64             {d24-d27}, [r0], r1
+
+    subs                r2, r2, #1
+    bgt                 loop_32x32_neon
+
+    bx                  lr
+    ENDP                ; |vp9_tm_predictor_32x32_neon|
+
     END
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index e9c698119..991d3c2b3 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -17,6 +17,10 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #if HAVE_DSPR2
 #define CROP_WIDTH 512
 extern uint8_t *vp9_ff_cropTbl;
@@ -114,4 +118,8 @@ void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               int w, int h);
 
 #endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
index 98bfcfaf2..008cf8cac 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
@@ -17,6 +17,10 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #if HAVE_DSPR2
 /* inputs & outputs are quad-byte vectors */
 static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
@@ -752,4 +756,8 @@ static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
   *oq6 = res_oq6;
 }
 #endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
index 4cb2ebb46..ca01a6a10 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
@@ -17,6 +17,10 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #if HAVE_DSPR2
 #define STORE_F0() {                                                    \
     __asm__ __volatile__ (                                              \
@@ -467,4 +471,8 @@
 }
 
 #endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
index b9e0aca90..5b0d9cc9b 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
@@ -17,6 +17,10 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #if HAVE_DSPR2
 /* processing 4 pixels at the same time
  * compute hev and mask in the same function */
@@ -362,4 +366,8 @@ static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
   *flat2 = flat1;
 }
 #endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index ca42090c1..e033fbb99 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -33,8 +33,8 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) {
 void vp9_free_frame_buffers(VP9_COMMON *cm) {
   int i;
 
-  for (i = 0; i < cm->fb_count; i++)
-    vp9_free_frame_buffer(&cm->yv12_fb[i]);
+  for (i = 0; i < FRAME_BUFFERS; i++)
+    vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
 
   vp9_free_frame_buffer(&cm->post_proc_buffer);
 
@@ -85,7 +85,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
   int mi_size;
 
   if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-                               VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
+                               VP9_DEC_BORDER_IN_PIXELS) < 0)
     goto fail;
 
   set_mb_mi(cm, aligned_width, aligned_height);
@@ -137,33 +137,21 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
   const int ss_y = cm->subsampling_y;
   int mi_size;
 
-  if (cm->fb_count == 0) {
-    cm->fb_count = FRAME_BUFFERS;
-    CHECK_MEM_ERROR(cm, cm->yv12_fb,
-                    vpx_calloc(cm->fb_count, sizeof(*cm->yv12_fb)));
-    CHECK_MEM_ERROR(cm, cm->fb_idx_ref_cnt,
-                    vpx_calloc(cm->fb_count, sizeof(*cm->fb_idx_ref_cnt)));
-    if (cm->fb_lru) {
-      CHECK_MEM_ERROR(cm, cm->fb_idx_ref_lru,
-                      vpx_calloc(cm->fb_count, sizeof(*cm->fb_idx_ref_lru)));
-    }
-  }
-
   vp9_free_frame_buffers(cm);
 
-  for (i = 0; i < cm->fb_count; i++) {
-    cm->fb_idx_ref_cnt[i] = 0;
-    if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y,
-                               VP9_ENC_BORDER_IN_PIXELS) < 0)
+  for (i = 0; i < FRAME_BUFFERS; i++) {
+    cm->frame_bufs[i].ref_count = 0;
+    if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
+                               ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
       goto fail;
   }
 
-  cm->new_fb_idx = cm->fb_count - 1;
-  cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1;
+  cm->new_fb_idx = FRAME_BUFFERS - 1;
+  cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
 
   for (i = 0; i < REF_FRAMES; i++) {
     cm->ref_frame_map[i] = i;
-    cm->fb_idx_ref_cnt[i] = 1;
+    cm->frame_bufs[i].ref_count = 1;
   }
 
   if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
@@ -211,14 +199,6 @@ void vp9_create_common(VP9_COMMON *cm) {
 
 void vp9_remove_common(VP9_COMMON *cm) {
   vp9_free_frame_buffers(cm);
-
-  vpx_free(cm->yv12_fb);
-  vpx_free(cm->fb_idx_ref_cnt);
-  vpx_free(cm->fb_idx_ref_lru);
-
-  cm->yv12_fb = NULL;
-  cm->fb_idx_ref_cnt = NULL;
-  cm->fb_idx_ref_lru = NULL;
 }
 
 void vp9_initialize_common() {
diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index cf8dca573..e3b5b95d8 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -14,6 +14,10 @@
 
 #include "vp9/common/vp9_onyxc_int.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void vp9_initialize_common();
 
 void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
@@ -28,4 +32,8 @@ void vp9_free_frame_buffers(VP9_COMMON *cm);
 
 void vp9_update_frame_size(VP9_COMMON *cm);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_ALLOCCOMMON_H_
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index ad78b0dc4..49e336aa4 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -25,8 +25,12 @@
 #include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define BLOCK_SIZE_GROUPS 4
-#define MBSKIP_CONTEXTS 3
+#define SKIP_CONTEXTS 3
 #define INTER_MODE_CONTEXTS 7
 
 /* Segment Feature Masks */
@@ -131,7 +135,7 @@ typedef struct {
   // Flags used for prediction status of various bit-stream signals
   unsigned char seg_id_predicted;
 
-  INTERPOLATION_TYPE interp_filter;
+  INTERP_FILTER interp_filter;
 
   BLOCK_SIZE sb_type;
 } MB_MODE_INFO;
@@ -248,7 +252,7 @@ typedef struct macroblockd {
   /* Inverse transform function pointers. */
   void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
 
-  struct subpix_fn_table  subpix;
+  const interp_kernel *interp_kernel;
 
   int corrupted;
 
@@ -463,4 +467,8 @@ static int get_tx_eob(const struct segmentation *seg, int segment_id,
   return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_BLOCKD_H_
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 36d1cdf14..69964dae8 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -19,6 +19,10 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx/vpx_integer.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
 
@@ -91,4 +95,8 @@ static int get_unsigned_bits(unsigned int num_values) {
 #define VP9_FRAME_MARKER 0x2
 
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_COMMON_H_
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index 5222d29c1..f41962747 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -13,6 +13,10 @@
 
 #include "vp9/common/vp9_enums.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 extern const int b_width_log2_lookup[BLOCK_SIZES];
 extern const int b_height_log2_lookup[BLOCK_SIZES];
 extern const int mi_width_log2_lookup[BLOCK_SIZES];
@@ -28,4 +32,8 @@ extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
 extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
 extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_COMMON_DATA_H_
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 6edf7eaca..b105a57bc 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -20,7 +20,7 @@
 
 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                           const subpel_kernel *x_filters,
+                           const interp_kernel *x_filters,
                            int x0_q4, int x_step_q4, int w, int h) {
   int x, y;
   src -= SUBPEL_TAPS / 2 - 1;
@@ -42,7 +42,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
 
 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
-                               const subpel_kernel *x_filters,
+                               const interp_kernel *x_filters,
                                int x0_q4, int x_step_q4, int w, int h) {
   int x, y;
   src -= SUBPEL_TAPS / 2 - 1;
@@ -65,7 +65,7 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
 
 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                          const subpel_kernel *y_filters,
+                          const interp_kernel *y_filters,
                           int y0_q4, int y_step_q4, int w, int h) {
   int x, y;
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
@@ -88,7 +88,7 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
 
 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                              const subpel_kernel *y_filters,
+                              const interp_kernel *y_filters,
                               int y0_q4, int y_step_q4, int w, int h) {
   int x, y;
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
@@ -112,9 +112,9 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
 
 static void convolve(const uint8_t *src, ptrdiff_t src_stride,
                      uint8_t *dst, ptrdiff_t dst_stride,
-                     const subpel_kernel *const x_filters,
+                     const interp_kernel *const x_filters,
                      int x0_q4, int x_step_q4,
-                     const subpel_kernel *const y_filters,
+                     const interp_kernel *const y_filters,
                      int y0_q4, int y_step_q4,
                      int w, int h) {
   // Fixed size intermediate buffer places limits on parameters.
@@ -138,14 +138,14 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
                 y_filters, y0_q4, y_step_q4, w, h);
 }
 
-static const subpel_kernel *get_filter_base(const int16_t *filter) {
+static const interp_kernel *get_filter_base(const int16_t *filter) {
   // NOTE: This assumes that the filter table is 256-byte aligned.
   // TODO(agrange) Modify to make independent of table alignment.
-  return (const subpel_kernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+  return (const interp_kernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
 }
 
-static int get_filter_offset(const int16_t *f, const subpel_kernel *base) {
-  return (const subpel_kernel *)(intptr_t)f - base;
+static int get_filter_offset(const int16_t *f, const interp_kernel *base) {
+  return (const interp_kernel *)(intptr_t)f - base;
 }
 
 void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -153,7 +153,7 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
-  const subpel_kernel *const filters_x = get_filter_base(filter_x);
+  const interp_kernel *const filters_x = get_filter_base(filter_x);
   const int x0_q4 = get_filter_offset(filter_x, filters_x);
 
   convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
@@ -165,7 +165,7 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h) {
-  const subpel_kernel *const filters_x = get_filter_base(filter_x);
+  const interp_kernel *const filters_x = get_filter_base(filter_x);
   const int x0_q4 = get_filter_offset(filter_x, filters_x);
 
   convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
@@ -177,7 +177,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
-  const subpel_kernel *const filters_y = get_filter_base(filter_y);
+  const interp_kernel *const filters_y = get_filter_base(filter_y);
   const int y0_q4 = get_filter_offset(filter_y, filters_y);
   convolve_vert(src, src_stride, dst, dst_stride, filters_y,
                 y0_q4, y_step_q4, w, h);
@@ -188,7 +188,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
-  const subpel_kernel *const filters_y = get_filter_base(filter_y);
+  const interp_kernel *const filters_y = get_filter_base(filter_y);
   const int y0_q4 = get_filter_offset(filter_y, filters_y);
   convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
                     y0_q4, y_step_q4, w, h);
@@ -199,10 +199,10 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                      const int16_t *filter_x, int x_step_q4,
                      const int16_t *filter_y, int y_step_q4,
                      int w, int h) {
-  const subpel_kernel *const filters_x = get_filter_base(filter_x);
+  const interp_kernel *const filters_x = get_filter_base(filter_x);
   const int x0_q4 = get_filter_offset(filter_x, filters_x);
 
-  const subpel_kernel *const filters_y = get_filter_base(filter_y);
+  const interp_kernel *const filters_y = get_filter_base(filter_y);
   const int y0_q4 = get_filter_offset(filter_y, filters_y);
 
   convolve(src, src_stride, dst, dst_stride,
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
index 29d499063..6bf71fc79 100644
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@@ -13,10 +13,18 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_CONVOLVE_H_
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index ba162fd20..e030d92ec 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -18,6 +18,10 @@
 #include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_entropymode.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define DIFF_UPDATE_PROB 252
 
 // Coefficient token alphabet
@@ -184,4 +188,8 @@ static const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
   }
 }
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 83281b2ea..6def3c869 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -303,7 +303,7 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
   ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
 }
 
-static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = {
+static const vp9_prob default_skip_probs[SKIP_CONTEXTS] = {
   192, 128, 64
 };
 
@@ -325,7 +325,7 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
   vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p);
   vp9_copy(cm->fc.single_ref_prob, default_single_ref_p);
   cm->fc.tx_probs = default_tx_probs;
-  vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
+  vp9_copy(cm->fc.skip_probs, default_skip_probs);
   vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
 }
 
@@ -385,7 +385,7 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
     adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
                 counts->partition[i], fc->partition_prob[i]);
 
-  if (cm->mcomp_filter_type == SWITCHABLE) {
+  if (cm->interp_filter == SWITCHABLE) {
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
       adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
                   counts->switchable_interp[i], fc->switchable_interp_prob[i]);
@@ -415,9 +415,8 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
     }
   }
 
-  for (i = 0; i < MBSKIP_CONTEXTS; ++i)
-    fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i],
-                                     counts->mbskip[i]);
+  for (i = 0; i < SKIP_CONTEXTS; ++i)
+    fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]);
 }
 
 static void set_default_lf_deltas(struct loopfilter *lf) {
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 5312553c7..deec3f652 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -13,12 +13,14 @@
 
 #include "vp9/common/vp9_blockd.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define TX_SIZE_CONTEXTS 2
 #define SWITCHABLE_FILTERS 3   // number of switchable filters
 #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
 
-// #define MODE_STATS
-
 struct VP9Common;
 
 struct tx_probs {
@@ -57,4 +59,8 @@ void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
 void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
                                     unsigned int (*ct_8x8p)[2]);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_ENTROPYMODE_H_
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 48cb82db1..7e1f1479b 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -15,6 +15,10 @@
 #include "./vpx_config.h"
 #include "vp9/common/vp9_blockd.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct VP9Common;
 
 void vp9_init_mv_probs(struct VP9Common *cm);
@@ -121,4 +125,8 @@ typedef struct {
 
 void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_ENTROPYMV_H_
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 34411a34f..e96e76947 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -13,6 +13,10 @@
 
 #include "./vpx_config.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MI_SIZE_LOG2 3
 #define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2)  // 64 = 2^6
 
@@ -90,4 +94,8 @@ typedef enum {
   SRGB       = 7   // RGB
 } COLOR_SPACE;
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_ENUMS_H_
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 79ace147c..dbde6d551 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -14,7 +14,7 @@
 
 #include "vp9/common/vp9_filter.h"
 
-DECLARE_ALIGNED(256, const subpel_kernel,
+DECLARE_ALIGNED(256, const interp_kernel,
                 vp9_bilinear_filters[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
   { 0, 0, 0, 120,   8, 0, 0, 0 },
@@ -35,7 +35,7 @@ DECLARE_ALIGNED(256, const subpel_kernel,
 };
 
 // Lagrangian interpolation filter
-DECLARE_ALIGNED(256, const subpel_kernel,
+DECLARE_ALIGNED(256, const interp_kernel,
                 vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
@@ -56,7 +56,7 @@ DECLARE_ALIGNED(256, const subpel_kernel,
 };
 
 // DCT based filter
-DECLARE_ALIGNED(256, const subpel_kernel,
+DECLARE_ALIGNED(256, const interp_kernel,
                 vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
@@ -77,7 +77,7 @@ DECLARE_ALIGNED(256, const subpel_kernel,
 };
 
 // freqmultiplier = 0.5
-DECLARE_ALIGNED(256, const subpel_kernel,
+DECLARE_ALIGNED(256, const interp_kernel,
                 vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
   { 0,  0,  0, 128,  0,  0,  0,  0},
   {-3, -1, 32,  64, 38,  1, -3,  0},
@@ -98,14 +98,15 @@ DECLARE_ALIGNED(256, const subpel_kernel,
 };
 
 
-static const subpel_kernel* vp9_filter_kernels[4] = {
+static const interp_kernel* vp9_filter_kernels[4] = {
   vp9_sub_pel_filters_8,
   vp9_sub_pel_filters_8lp,
   vp9_sub_pel_filters_8s,
   vp9_bilinear_filters
 };
 
-const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) {
-  return vp9_filter_kernels[type];
+const interp_kernel *vp9_get_interp_kernel(INTERP_FILTER filter) {
+  assert(filter != SWITCHABLE);
+  return vp9_filter_kernels[filter];
 }
 
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index b1e7e6499..b611e304c 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -14,6 +14,10 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define FILTER_BITS 7
 
 #define SUBPEL_BITS 4
@@ -27,25 +31,24 @@ typedef enum {
   EIGHTTAP_SHARP = 2,
   BILINEAR = 3,
   SWITCHABLE = 4  /* should be the last one */
-} INTERPOLATION_TYPE;
-
-typedef int16_t subpel_kernel[SUBPEL_TAPS];
+} INTERP_FILTER;
 
-struct subpix_fn_table {
-  const subpel_kernel *filter_x;
-  const subpel_kernel *filter_y;
-};
+typedef int16_t interp_kernel[SUBPEL_TAPS];
 
-const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type);
+const interp_kernel *vp9_get_interp_kernel(INTERP_FILTER filter);
 
-extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
-extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
-extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
-extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
+extern const interp_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
+extern const interp_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
+extern const interp_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
+extern const interp_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
 
 // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
 // filter kernel as a 2 tap filter.
 #define BILINEAR_FILTERS_2TAP(x) \
   (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1)
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_FILTER_H_
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 533f7f361..20b78bfed 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -96,7 +96,7 @@ void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
   }
 }
 
-static void idct4_1d(const int16_t *input, int16_t *output) {
+static void idct4(const int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
   // stage 1
@@ -124,7 +124,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
 
   // Rows
   for (i = 0; i < 4; ++i) {
-    idct4_1d(input, outptr);
+    idct4(input, outptr);
     input += 4;
     outptr += 4;
   }
@@ -133,7 +133,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
-    idct4_1d(temp_in, temp_out);
+    idct4(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                   + dest[j * stride + i]);
@@ -156,7 +156,7 @@ void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
   }
 }
 
-static void idct8_1d(const int16_t *input, int16_t *output) {
+static void idct8(const int16_t *input, int16_t *output) {
   int16_t step1[8], step2[8];
   int temp1, temp2;
   // stage 1
@@ -174,7 +174,7 @@ static void idct8_1d(const int16_t *input, int16_t *output) {
   step1[6] = dct_const_round_shift(temp2);
 
   // stage 2 & stage 3 - even half
-  idct4_1d(step1, step1);
+  idct4(step1, step1);
 
   // stage 2 - odd half
   step2[4] = step1[4] + step1[5];
@@ -209,7 +209,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
 
   // First transform rows
   for (i = 0; i < 8; ++i) {
-    idct8_1d(input, outptr);
+    idct8(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -218,7 +218,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
-    idct8_1d(temp_in, temp_out);
+    idct8(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                   + dest[j * stride + i]);
@@ -238,7 +238,7 @@ void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   }
 }
 
-static void iadst4_1d(const int16_t *input, int16_t *output) {
+static void iadst4(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[0];
@@ -283,10 +283,10 @@ static void iadst4_1d(const int16_t *input, int16_t *output) {
 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   const transform_2d IHT_4[] = {
-    { idct4_1d, idct4_1d  },  // DCT_DCT  = 0
-    { iadst4_1d, idct4_1d  },   // ADST_DCT = 1
-    { idct4_1d, iadst4_1d },    // DCT_ADST = 2
-    { iadst4_1d, iadst4_1d }      // ADST_ADST = 3
+    { idct4, idct4  },  // DCT_DCT  = 0
+    { iadst4, idct4  },   // ADST_DCT = 1
+    { idct4, iadst4 },    // DCT_ADST = 2
+    { iadst4, iadst4 }      // ADST_ADST = 3
   };
 
   int i, j;
@@ -311,7 +311,7 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
                                   + dest[j * stride + i]);
   }
 }
-static void iadst8_1d(const int16_t *input, int16_t *output) {
+static void iadst8(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[7];
@@ -389,10 +389,10 @@ static void iadst8_1d(const int16_t *input, int16_t *output) {
 }
 
 static const transform_2d IHT_8[] = {
-  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0
-  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1
-  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2
-  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
+  { idct8,  idct8  },  // DCT_DCT  = 0
+  { iadst8, idct8  },  // ADST_DCT = 1
+  { idct8,  iadst8 },  // DCT_ADST = 2
+  { iadst8, iadst8 }   // ADST_ADST = 3
 };
 
 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
@@ -430,7 +430,7 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   // First transform rows
   // only first 4 row has non-zero coefs
   for (i = 0; i < 4; ++i) {
-    idct8_1d(input, outptr);
+    idct8(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -439,14 +439,14 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
-    idct8_1d(temp_in, temp_out);
+    idct8(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                   + dest[j * stride + i]);
   }
 }
 
-static void idct16_1d(const int16_t *input, int16_t *output) {
+static void idct16(const int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
 
@@ -619,7 +619,7 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
 
   // First transform rows
   for (i = 0; i < 16; ++i) {
-    idct16_1d(input, outptr);
+    idct16(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -628,14 +628,14 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j * 16 + i];
-    idct16_1d(temp_in, temp_out);
+    idct16(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                   + dest[j * stride + i]);
   }
 }
 
-static void iadst16_1d(const int16_t *input, int16_t *output) {
+static void iadst16(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 
   int x0 = input[15];
@@ -807,10 +807,10 @@ static void iadst16_1d(const int16_t *input, int16_t *output) {
 }
 
 static const transform_2d IHT_16[] = {
-  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0
-  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1
-  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2
-  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
+  { idct16,  idct16  },  // DCT_DCT  = 0
+  { iadst16, idct16  },  // ADST_DCT = 1
+  { idct16,  iadst16 },  // DCT_ADST = 2
+  { iadst16, iadst16 }   // ADST_ADST = 3
 };
 
 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
@@ -848,7 +848,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
   for (i = 0; i < 4; ++i) {
-    idct16_1d(input, outptr);
+    idct16(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -857,7 +857,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j*16 + i];
-    idct16_1d(temp_in, temp_out);
+    idct16(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                   + dest[j * stride + i]);
@@ -877,7 +877,7 @@ void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   }
 }
 
-static void idct32_1d(const int16_t *input, int16_t *output) {
+static void idct32(const int16_t *input, int16_t *output) {
   int16_t step1[32], step2[32];
   int temp1, temp2;
 
@@ -1263,7 +1263,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      idct32_1d(input, outptr);
+      idct32(input, outptr);
     else
       vpx_memset(outptr, 0, sizeof(int16_t) * 32);
     input += 32;
@@ -1274,7 +1274,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    idct32_1d(temp_in, temp_out);
+    idct32(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                         + dest[j * stride + i]);
@@ -1290,7 +1290,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
   // Rows
   // only upper-left 8x8 has non-zero coeff
   for (i = 0; i < 8; ++i) {
-    idct32_1d(input, outptr);
+    idct32(input, outptr);
     input += 32;
     outptr += 32;
   }
@@ -1299,7 +1299,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    idct32_1d(temp_in, temp_out);
+    idct32(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                   + dest[j * stride + i]);
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 183c50abf..ceca7951b 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -18,6 +18,10 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 
 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
@@ -103,4 +107,8 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                       int stride, int eob);
 
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_IDCT_H_
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 2266e0ec2..dd304c909 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -16,26 +16,6 @@
 
 #include "vp9/common/vp9_seg_common.h"
 
-// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
-// Each 1 bit represents a position in which we want to apply the loop filter.
-// Left_ entries refer to whether we apply a filter on the border to the
-// left of the block.   Above_ entries refer to whether or not to apply a
-// filter on the above border.   Int_ entries refer to whether or not to
-// apply borders on the 4x4 edges within the 8x8 block that each bit
-// represents.
-// Since each transform is accompanied by a potentially different type of
-// loop filter there is a different entry in the array for each transform size.
-typedef struct {
-  uint64_t left_y[TX_SIZES];
-  uint64_t above_y[TX_SIZES];
-  uint64_t int_4x4_y;
-  uint16_t left_uv[TX_SIZES];
-  uint16_t above_uv[TX_SIZES];
-  uint16_t int_4x4_uv;
-  uint8_t lfl_y[64];
-  uint8_t lfl_uv[16];
-} LOOP_FILTER_MASK;
-
 // 64 bit masks for left transform size.  Each 1 represents a position where
 // we should apply a loop filter across the left border of an 8x8 block
 // boundary.
@@ -638,9 +618,9 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
 // TODO(JBB): This function only works for yv12.
-static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
-                       MODE_INFO **mi_8x8, const int mode_info_stride,
-                       LOOP_FILTER_MASK *lfm) {
+void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
+                    MODE_INFO **mi_8x8, const int mode_info_stride,
+                    LOOP_FILTER_MASK *lfm) {
   int idx_32, idx_16, idx_8;
   const loop_filter_info_n *const lfi_n = &cm->lf_info;
   MODE_INFO **mip = mi_8x8;
@@ -1069,10 +1049,10 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
 }
 #endif
 
-static void filter_block_plane(VP9_COMMON *const cm,
-                               struct macroblockd_plane *const plane,
-                               int mi_row,
-                               LOOP_FILTER_MASK *lfm) {
+void vp9_filter_block_plane(VP9_COMMON *const cm,
+                            struct macroblockd_plane *const plane,
+                            int mi_row,
+                            LOOP_FILTER_MASK *lfm) {
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
   int r, c;
@@ -1244,14 +1224,14 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
 #if CONFIG_NON420
       if (use_420)
 #endif
-        setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride,
-                   &lfm);
+        vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col,
+                       cm->mode_info_stride, &lfm);
 
       for (plane = 0; plane < num_planes; ++plane) {
 #if CONFIG_NON420
         if (use_420)
 #endif
-          filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
+          vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
 #if CONFIG_NON420
         else
           filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 98fac96ff..668e898cf 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -17,6 +17,10 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MAX_LOOP_FILTER 63
 #define MAX_SHARPNESS 7
 
@@ -56,9 +60,42 @@ typedef struct {
   uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
 } loop_filter_info_n;
 
+// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
+// Each 1 bit represents a position in which we want to apply the loop filter.
+// Left_ entries refer to whether we apply a filter on the border to the
+// left of the block.   Above_ entries refer to whether or not to apply a
+// filter on the above border.   Int_ entries refer to whether or not to
+// apply borders on the 4x4 edges within the 8x8 block that each bit
+// represents.
+// Since each transform is accompanied by a potentially different type of
+// loop filter there is a different entry in the array for each transform size.
+typedef struct {
+  uint64_t left_y[TX_SIZES];
+  uint64_t above_y[TX_SIZES];
+  uint64_t int_4x4_y;
+  uint16_t left_uv[TX_SIZES];
+  uint16_t above_uv[TX_SIZES];
+  uint16_t int_4x4_uv;
+  uint8_t lfl_y[64];
+  uint8_t lfl_uv[16];
+} LOOP_FILTER_MASK;
+
 /* assorted loopfilter functions which get used elsewhere */
 struct VP9Common;
 struct macroblockd;
+struct VP9LfSyncData;
+
+// This function sets up the bit masks for the entire 64x64 region represented
+// by mi_row, mi_col.
+void vp9_setup_mask(struct VP9Common *const cm,
+                    const int mi_row, const int mi_col,
+                    MODE_INFO **mi_8x8, const int mode_info_stride,
+                    LOOP_FILTER_MASK *lfm);
+
+void vp9_filter_block_plane(struct VP9Common *const cm,
+                            struct macroblockd_plane *const plane,
+                            int mi_row,
+                            LOOP_FILTER_MASK *lfm);
 
 void vp9_loop_filter_init(struct VP9Common *cm);
 
@@ -86,8 +123,15 @@ typedef struct LoopFilterWorkerData {
   int start;
   int stop;
   int y_only;
+
+  struct VP9LfSyncData *lf_sync;
+  int num_lf_workers;
 } LFWorkerData;
 
 // Operates on the rows described by LFWorkerData passed as 'arg1'.
 int vp9_loop_filter_worker(void *arg1, void *arg2);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_LOOPFILTER_H_
diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h
index 155c3f12e..98fd1d82f 100644
--- a/vp9/common/vp9_mv.h
+++ b/vp9/common/vp9_mv.h
@@ -15,6 +15,10 @@
 
 #include "vp9/common/vp9_common.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef struct mv {
   int16_t row;
   int16_t col;
@@ -36,4 +40,8 @@ static void clamp_mv(MV *mv, int min_col, int max_col,
   mv->row = clamp(mv->row, min_row, max_row);
 }
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_MV_H_
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index cd89390d5..0936abfcd 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -7,12 +7,16 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
+#define VP9_COMMON_VP9_MVREF_COMMON_H_
 
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_blockd.h"
 
-#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
-#define VP9_COMMON_VP9_MVREF_COMMON_H_
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 
 void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                           const TileInfo *const tile,
@@ -56,4 +60,8 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
                                    int block, int ref, int mi_row, int mi_col,
                                    int_mv *nearest, int_mv *near);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_MVREF_COMMON_H_
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index 45d798482..564e4195f 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -11,17 +11,16 @@
 #ifndef VP9_COMMON_VP9_ONYX_H_
 #define VP9_COMMON_VP9_ONYX_H_
 
-#ifdef __cplusplus
-extern "C"
-{ // NOLINT
-#endif
-
 #include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8cx.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_ppflags.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MAX_SEGMENTS 8
 
   typedef int *VP9_PTR;
@@ -56,6 +55,7 @@ extern "C"
     MODE_FIRSTPASS      = 0x3,
     MODE_SECONDPASS     = 0x4,
     MODE_SECONDPASS_BEST = 0x5,
+    MODE_REALTIME       = 0x6,
   } MODE;
 
   typedef enum {
@@ -237,7 +237,7 @@ extern "C"
   int vp9_get_quantizer(VP9_PTR c);
 
 #ifdef __cplusplus
-}
+}  // extern "C"
 #endif
 
 #endif  // VP9_COMMON_VP9_ONYX_H_
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index f6fe4d3f1..d92a25b12 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -25,6 +25,10 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define REFS_PER_FRAME 3
 
 #define REF_FRAMES_LOG2 3
@@ -56,7 +60,7 @@ typedef struct frame_contexts {
   vp9_prob single_ref_prob[REF_CONTEXTS][2];
   vp9_prob comp_ref_prob[REF_CONTEXTS];
   struct tx_probs tx_probs;
-  vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
+  vp9_prob skip_probs[SKIP_CONTEXTS];
   nmv_context nmvc;
 } FRAME_CONTEXT;
 
@@ -75,7 +79,7 @@ typedef struct {
   unsigned int single_ref[REF_CONTEXTS][2][2];
   unsigned int comp_ref[REF_CONTEXTS][2];
   struct tx_counts tx;
-  unsigned int mbskip[MBSKIP_CONTEXTS][2];
+  unsigned int skip[SKIP_CONTEXTS][2];
   nmv_context_counts mv;
 } FRAME_COUNTS;
 
@@ -87,6 +91,12 @@ typedef enum {
   REFERENCE_MODES       = 3,
 } REFERENCE_MODE;
 
+
+typedef struct {
+  int ref_count;
+  YV12_BUFFER_CONFIG buf;
+} RefCntBuffer;
+
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
 
@@ -113,8 +123,8 @@ typedef struct VP9Common {
 
   YV12_BUFFER_CONFIG *frame_to_show;
 
-  YV12_BUFFER_CONFIG *yv12_fb;
-  int *fb_idx_ref_cnt; /* reference counts */
+  RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
   int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
 
   // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
@@ -180,7 +190,7 @@ typedef struct VP9Common {
   // Persistent mb segment id map used in prediction.
   unsigned char *last_frame_seg_map;
 
-  INTERPOLATION_TYPE mcomp_filter_type;
+  INTERP_FILTER interp_filter;
 
   loop_filter_info_n lf_info;
 
@@ -213,55 +223,32 @@ typedef struct VP9Common {
   int frame_parallel_decoding_mode;
 
   int log2_tile_cols, log2_tile_rows;
-
-  vpx_codec_frame_buffer_t *fb_list;  // External frame buffers
-  int fb_count;  // Total number of frame buffers
-  vpx_realloc_frame_buffer_cb_fn_t realloc_fb_cb;
-  void *user_priv;  // Private data associated with the external frame buffers.
-
-  int fb_lru;  // Flag telling if lru is on/off
-  uint32_t *fb_idx_ref_lru;  // Frame buffer lru cache
-  uint32_t fb_idx_ref_lru_count;
 } VP9_COMMON;
 
 static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
-  return &cm->yv12_fb[cm->new_fb_idx];
+  return &cm->frame_bufs[cm->new_fb_idx].buf;
 }
 
 static int get_free_fb(VP9_COMMON *cm) {
   int i;
-  uint32_t lru_count = cm->fb_idx_ref_lru_count + 1;
-  int free_buffer_idx = cm->fb_count;
-  for (i = 0; i < cm->fb_count; i++) {
-    if (!cm->fb_lru) {
-      if (cm->fb_idx_ref_cnt[i] == 0) {
-        free_buffer_idx = i;
-        break;
-      }
-    } else {
-      if (cm->fb_idx_ref_cnt[i] == 0 && cm->fb_idx_ref_lru[i] < lru_count) {
-        free_buffer_idx = i;
-        lru_count = cm->fb_idx_ref_lru[i];
-      }
-    }
-  }
+  for (i = 0; i < FRAME_BUFFERS; i++)
+    if (cm->frame_bufs[i].ref_count == 0)
+      break;
 
-  assert(free_buffer_idx < cm->fb_count);
-  cm->fb_idx_ref_cnt[free_buffer_idx] = 1;
-  if (cm->fb_lru)
-    cm->fb_idx_ref_lru[free_buffer_idx] = ++cm->fb_idx_ref_lru_count;
-  return free_buffer_idx;
+  assert(i < FRAME_BUFFERS);
+  cm->frame_bufs[i].ref_count = 1;
+  return i;
 }
 
-static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
+static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
   const int ref_index = *idx;
 
-  if (ref_index >= 0 && buf[ref_index] > 0)
-    buf[ref_index]--;
+  if (ref_index >= 0 && bufs[ref_index].ref_count > 0)
+    bufs[ref_index].ref_count--;
 
   *idx = new_idx;
 
-  buf[new_idx]++;
+  bufs[new_idx].ref_count++;
 }
 
 static int mi_cols_aligned_to_sb(int n_mis) {
@@ -359,4 +346,8 @@ static INLINE int partition_plane_context(
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index b8a456fdb..b07d5d045 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -15,6 +15,10 @@
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_ppflags.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct postproc_state {
   int last_q;
   int last_noise;
@@ -33,4 +37,8 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 
 void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_POSTPROC_H_
diff --git a/vp9/common/vp9_ppflags.h b/vp9/common/vp9_ppflags.h
index 561c93028..8168935fc 100644
--- a/vp9/common/vp9_ppflags.h
+++ b/vp9/common/vp9_ppflags.h
@@ -11,6 +11,10 @@
 #ifndef VP9_COMMON_VP9_PPFLAGS_H_
 #define VP9_COMMON_VP9_PPFLAGS_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 enum {
   VP9D_NOFILTERING            = 0,
   VP9D_DEBLOCK                = 1 << 0,
@@ -35,4 +39,8 @@ typedef struct {
   int display_mv_flag;
 } vp9_ppflags_t;
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_PPFLAGS_H_
diff --git a/vp9/common/vp9_pragmas.h b/vp9/common/vp9_pragmas.h
index f079161d6..0efc713ca 100644
--- a/vp9/common/vp9_pragmas.h
+++ b/vp9/common/vp9_pragmas.h
@@ -11,6 +11,10 @@
 #ifndef VP9_COMMON_VP9_PRAGMAS_H_
 #define VP9_COMMON_VP9_PRAGMAS_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #ifdef __INTEL_COMPILER
 #pragma warning(disable:997 1011 170)
 #endif
@@ -19,4 +23,8 @@
 #pragma warning(disable:4799)
 #endif
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_PRAGMAS_H_
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 23722ba72..0acee32f8 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -14,6 +14,10 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) {
   return xd->up_available ? xd->mi_8x8[-xd->mode_info_stride] : NULL;
 }
@@ -50,7 +54,7 @@ static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
 
 static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm,
                                          const MACROBLOCKD *xd) {
-  return cm->fc.mbskip_probs[vp9_get_skip_context(xd)];
+  return cm->fc.skip_probs[vp9_get_skip_context(xd)];
 }
 
 int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
@@ -129,4 +133,8 @@ static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
   }
 }
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h
index 7a790c542..cc8d8ab38 100644
--- a/vp9/common/vp9_prob.h
+++ b/vp9/common/vp9_prob.h
@@ -18,6 +18,10 @@
 
 #include "vp9/common/vp9_common.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef uint8_t vp9_prob;
 
 #define MAX_PROB 255
@@ -109,4 +113,8 @@ static void tree_merge_probs(const vp9_tree_index *tree,
 
 DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_PROB_H_
diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h
index 83f2fb655..af50e23cd 100644
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h
@@ -13,6 +13,10 @@
 
 #include "vp9/common/vp9_blockd.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MINQ 0
 #define MAXQ 255
 #define QINDEX_RANGE (MAXQ - MINQ + 1)
@@ -25,4 +29,8 @@ int16_t vp9_ac_quant(int qindex, int delta);
 
 int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_QUANT_COMMON_H_
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index b5a9248c3..d554cc0ed 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -69,13 +69,11 @@ static void inter_predictor(const uint8_t *src, int src_stride,
                             const int subpel_y,
                             const struct scale_factors *sf,
                             int w, int h, int ref,
-                            const struct subpix_fn_table *subpix,
+                            const interp_kernel *kernel,
                             int xs, int ys) {
   sf->predict[subpel_x != 0][subpel_y != 0][ref](
       src, src_stride, dst, dst_stride,
-      subpix->filter_x[subpel_x], xs,
-      subpix->filter_y[subpel_y], ys,
-      w, h);
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
 }
 
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -83,7 +81,7 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                const MV *src_mv,
                                const struct scale_factors *sf,
                                int w, int h, int ref,
-                               const struct subpix_fn_table *subpix,
+                               const interp_kernel *kernel,
                                enum mv_precision precision,
                                int x, int y) {
   const int is_q4 = precision == MV_PRECISION_Q4;
@@ -96,7 +94,7 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
   src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
 
   inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                  sf, w, h, ref, subpix, sf->x_step_q4, sf->y_step_q4);
+                  sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
 }
 
 static INLINE int round_mv_comp_q4(int value) {
@@ -198,7 +196,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
            + (scaled_mv.col >> SUBPEL_BITS);
 
     inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
-                    subpel_x, subpel_y, sf, w, h, ref, &xd->subpix, xs, ys);
+                    subpel_x, subpel_y, sf, w, h, ref, xd->interp_kernel,
+                    xs, ys);
   }
 }
 
@@ -367,7 +366,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     }
 
     inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                    subpel_y, sf, w, h, ref, &xd->subpix, xs, ys);
+                    subpel_y, sf, w, h, ref, xd->interp_kernel, xs, ys);
   }
 }
 
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 3cc16d94e..3345d83e8 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -14,7 +14,10 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-struct subpix_fn_table;
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BLOCK_SIZE bsize);
 
@@ -32,7 +35,7 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                const MV *mv_q3,
                                const struct scale_factors *sf,
                                int w, int h, int do_avg,
-                               const struct subpix_fn_table *subpix,
+                               const interp_kernel *kernel,
                                enum mv_precision precision,
                                int x, int y);
 
@@ -90,10 +93,8 @@ static void setup_pre_planes(MACROBLOCKD *xd, int idx,
   }
 }
 
-static void set_scale_factors(VP9_COMMON *cm, MACROBLOCKD *xd,
-                              int ref0, int ref1) {
-  xd->block_refs[0] = &cm->frame_refs[ref0 >= 0 ? ref0 : 0];
-  xd->block_refs[1] = &cm->frame_refs[ref1 >= 0 ? ref1 : 0];
-}
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_COMMON_VP9_RECONINTER_H_
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index fc916fcf3..800736d30 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -14,9 +14,17 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
                              TX_SIZE tx_size, int mode,
                              const uint8_t *ref, int ref_stride,
                              uint8_t *dst, int dst_stride,
                              int aoff, int loff, int plane);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index e384032f4..04a40bd58 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -135,7 +135,7 @@ prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const ui
 specialize vp9_v_predictor_16x16 $sse2_x86inc neon
 
 prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_16x16 $sse2_x86inc
+specialize vp9_tm_predictor_16x16 $sse2_x86inc neon
 
 prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_predictor_16x16 $sse2_x86inc dspr2
@@ -174,7 +174,7 @@ prototype void vp9_v_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const ui
 specialize vp9_v_predictor_32x32 $sse2_x86inc neon
 
 prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_32x32 $sse2_x86_64
+specialize vp9_tm_predictor_32x32 $sse2_x86_64 neon
 
 prototype void vp9_dc_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_predictor_32x32 $sse2_x86inc
diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h
index 55b4d8888..90b0d0bf9 100644
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h
@@ -14,6 +14,10 @@
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_convolve.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define REF_SCALE_SHIFT 14
 #define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
 #define REF_INVALID_SCALE -1
@@ -46,4 +50,8 @@ static int vp9_is_scaled(const struct scale_factors *sf) {
          sf->y_scale_fp != REF_NO_SCALE;
 }
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_SCALE_H_
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
index efab48bfc..9613b675c 100644
--- a/vp9/common/vp9_scan.h
+++ b/vp9/common/vp9_scan.h
@@ -17,6 +17,10 @@
 #include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_blockd.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MAX_NEIGHBORS 2
 
 void vp9_init_neighbors();
@@ -36,4 +40,8 @@ static INLINE int get_coef_context(const int16_t *neighbors,
           token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
 }
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_SCAN_H_
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index 8ff54fb73..ff2d66a36 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -13,6 +13,10 @@
 
 #include "vp9/common/vp9_prob.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define SEGMENT_DELTADATA   0
 #define SEGMENT_ABSDATA     1
 
@@ -70,5 +74,9 @@ int vp9_get_segdata(const struct segmentation *seg,
 
 extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_SEG_COMMON_H_
 
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index 6f955ab56..ee9a4823b 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -11,6 +11,10 @@
 #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #ifdef _MSC_VER
 #include <math.h>
 #define snprintf _snprintf
@@ -72,4 +76,8 @@ static INLINE int get_msb(unsigned int n) {
 struct VP9Common;
 void vp9_machine_specific_config(struct VP9Common *cm);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
diff --git a/vp9/common/vp9_textblit.h b/vp9/common/vp9_textblit.h
index c968628fe..158ec1b37 100644
--- a/vp9/common/vp9_textblit.h
+++ b/vp9/common/vp9_textblit.h
@@ -11,9 +11,17 @@
 #ifndef VP9_COMMON_VP9_TEXTBLIT_H_
 #define VP9_COMMON_VP9_TEXTBLIT_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void vp9_blit_text(const char *msg, unsigned char *address, int pitch);
 
 void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
                    int pitch);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_TEXTBLIT_H_
diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h
index a09876e4b..a97719e29 100644
--- a/vp9/common/vp9_tile_common.h
+++ b/vp9/common/vp9_tile_common.h
@@ -11,6 +11,10 @@
 #ifndef VP9_COMMON_VP9_TILE_COMMON_H_
 #define VP9_COMMON_VP9_TILE_COMMON_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct VP9Common;
 
 typedef struct TileInfo {
@@ -26,4 +30,8 @@ void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
 void vp9_get_tile_n_bits(int mi_cols,
                          int *min_log2_tile_cols, int *max_log2_tile_cols);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_TILE_COMMON_H_
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f95423678..8a2297feb 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -23,6 +23,68 @@ typedef void filter8_1dfunction (
   const short *filter
 );
 
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                  uint8_t *dst, ptrdiff_t dst_stride, \
+                                  const int16_t *filter_x, int x_step_q4, \
+                                  const int16_t *filter_y, int y_step_q4, \
+                                  int w, int h) { \
+  if (step_q4 == 16 && filter[3] != 128) { \
+    while (w >= 16) { \
+      vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \
+                                               dst, dst_stride, \
+                                               h, filter); \
+      src += 16; \
+      dst += 16; \
+      w -= 16; \
+    } \
+    while (w >= 8) { \
+      vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \
+                                              dst, dst_stride, \
+                                              h, filter); \
+      src += 8; \
+      dst += 8; \
+      w -= 8; \
+    } \
+    while (w >= 4) { \
+      vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \
+                                              dst, dst_stride, \
+                                              h, filter); \
+      src += 4; \
+      dst += 4; \
+      w -= 4; \
+    } \
+  } \
+  if (w) { \
+    vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+                             filter_x, x_step_q4, filter_y, y_step_q4, \
+                             w, h); \
+  } \
+}
+
+#define FUN_CONV_2D(avg, opt) \
+void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                              uint8_t *dst, ptrdiff_t dst_stride, \
+                              const int16_t *filter_x, int x_step_q4, \
+                              const int16_t *filter_y, int y_step_q4, \
+                              int w, int h) { \
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
+  \
+  assert(w <= 64); \
+  assert(h <= 64); \
+  if (x_step_q4 == 16 && y_step_q4 == 16) { \
+    vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+                              filter_x, x_step_q4, filter_y, y_step_q4, \
+                              w, h + 7); \
+    vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+                                    filter_x, x_step_q4, filter_y, y_step_q4, \
+                                    w, h); \
+  } else { \
+    vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+                           filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
+  } \
+}
+
 #if HAVE_SSSE3
 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
@@ -37,201 +99,44 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
 
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  /* Ensure the filter can be compressed to int16_t. */
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_ssse3(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-  }
-}
-
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                         filter_x, x_step_q4, filter_y, y_step_q4,
-                         w, h);
-  }
-}
-
-void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h);
-  }
-}
-
-void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
-                             filter_x, x_step_q4, filter_y, y_step_q4,
-                             w, h);
-  }
-}
-
-void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
-  assert(w <= 64);
-  assert(h <= 64);
-  if (x_step_q4 == 16 && y_step_q4 == 16) {
-    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h + 7);
-    vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
-                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    vp9_convolve8_c(src, src_stride, dst, dst_stride,
-                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  }
-}
-
-void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
-  assert(w <= 64);
-  assert(h <= 64);
-  if (x_step_q4 == 16 && y_step_q4 == 16) {
-    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h + 7);
-    vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
-                                 filter_x, x_step_q4, filter_y, y_step_q4,
-                                 w, h);
-  } else {
-    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  }
-}
+// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h);
+// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+            ssse3);
+
+// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_ , ssse3);
 #endif
 
 #if HAVE_SSE2
@@ -248,199 +153,41 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
 
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  /* Ensure the filter can be compressed to int16_t. */
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_sse2(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_sse2(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_sse2(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-  }
-}
-
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                         filter_x, x_step_q4, filter_y, y_step_q4,
-                         w, h);
-  }
-}
-
-void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h);
-  }
-}
-
-void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
-                             filter_x, x_step_q4, filter_y, y_step_q4,
-                             w, h);
-  }
-}
-
-void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
-  assert(w <= 64);
-  assert(h <= 64);
-  if (x_step_q4 == 16 && y_step_q4 == 16) {
-    vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h + 7);
-    vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
-                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    vp9_convolve8_c(src, src_stride, dst, dst_stride,
-                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  }
-}
-
-void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
-  assert(w <= 64);
-  assert(h <= 64);
-  if (x_step_q4 == 16 && y_step_q4 == 16) {
-    vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h + 7);
-    vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
-                                 filter_x, x_step_q4, filter_y, y_step_q4,
-                                 w, h);
-  } else {
-    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  }
-}
+// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                  uint8_t *dst, ptrdiff_t dst_stride,
+//                                  const int16_t *filter_x, int x_step_q4,
+//                                  const int16_t *filter_y, int y_step_q4,
+//                                  int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
+
+// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                         uint8_t *dst, ptrdiff_t dst_stride,
+//                         const int16_t *filter_x, int x_step_q4,
+//                         const int16_t *filter_y, int y_step_q4,
+//                         int w, int h);
+// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                             uint8_t *dst, ptrdiff_t dst_stride,
+//                             const int16_t *filter_x, int x_step_q4,
+//                             const int16_t *filter_y, int y_step_q4,
+//                             int w, int h);
+FUN_CONV_2D(, sse2);
+FUN_CONV_2D(avg_ , sse2);
 #endif
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 2f6149464..13a5b5a82 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -180,7 +180,7 @@ static INLINE void transpose_4x4(__m128i *res) {
   res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
 }
 
-static void idct4_1d_sse2(__m128i *in) {
+static void idct4_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -216,7 +216,7 @@ static void idct4_1d_sse2(__m128i *in) {
   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
 }
 
-static void iadst4_1d_sse2(__m128i *in) {
+static void iadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -276,20 +276,20 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
 
   switch (tx_type) {
     case 0:  // DCT_DCT
-      idct4_1d_sse2(in);
-      idct4_1d_sse2(in);
+      idct4_sse2(in);
+      idct4_sse2(in);
       break;
     case 1:  // ADST_DCT
-      idct4_1d_sse2(in);
-      iadst4_1d_sse2(in);
+      idct4_sse2(in);
+      iadst4_sse2(in);
       break;
     case 2:  // DCT_ADST
-      iadst4_1d_sse2(in);
-      idct4_1d_sse2(in);
+      iadst4_sse2(in);
+      idct4_sse2(in);
       break;
     case 3:  // ADST_ADST
-      iadst4_1d_sse2(in);
-      iadst4_1d_sse2(in);
+      iadst4_sse2(in);
+      iadst4_sse2(in);
       break;
     default:
       assert(0);
@@ -455,7 +455,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
       res1 = _mm_packs_epi32(tmp2, tmp3); \
   }
 
-#define IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3, out4, out5, out6, out7)  \
   { \
   /* Stage1 */      \
@@ -573,7 +573,7 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
                   in0, in1, in2, in3, in4, in5, in6, in7);
 
     // 4-stage 1D idct8x8
-    IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
              in0, in1, in2, in3, in4, in5, in6, in7);
   }
 
@@ -674,7 +674,7 @@ static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
   out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
 }
 
-static void idct8_1d_sse2(__m128i *in) {
+static void idct8_sse2(__m128i *in) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -695,11 +695,11 @@ static void idct8_1d_sse2(__m128i *in) {
                 in0, in1, in2, in3, in4, in5, in6, in7);
 
   // 4-stage 1D idct8x8
-  IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
            in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
 }
 
-static void iadst8_1d_sse2(__m128i *in) {
+static void iadst8_sse2(__m128i *in) {
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -946,20 +946,20 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
 
   switch (tx_type) {
     case 0:  // DCT_DCT
-      idct8_1d_sse2(in);
-      idct8_1d_sse2(in);
+      idct8_sse2(in);
+      idct8_sse2(in);
       break;
     case 1:  // ADST_DCT
-      idct8_1d_sse2(in);
-      iadst8_1d_sse2(in);
+      idct8_sse2(in);
+      iadst8_sse2(in);
       break;
     case 2:  // DCT_ADST
-      iadst8_1d_sse2(in);
-      idct8_1d_sse2(in);
+      iadst8_sse2(in);
+      idct8_sse2(in);
       break;
     case 3:  // ADST_ADST
-      iadst8_1d_sse2(in);
-      iadst8_1d_sse2(in);
+      iadst8_sse2(in);
+      iadst8_sse2(in);
       break;
     default:
       assert(0);
@@ -1104,7 +1104,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
 
   TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
 
-  IDCT8_1D(in0, in1, in2, in3, zero, zero, zero, zero,
+  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
            in0, in1, in2, in3, in4, in5, in6, in7);
   // Final rounding and shift
   in0 = _mm_adds_epi16(in0, final_rounding);
@@ -1135,7 +1135,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE(dest, in7);
 }
 
-#define IDCT16_1D \
+#define IDCT16 \
   /* Stage2 */ \
   { \
     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
@@ -1264,7 +1264,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-#define IDCT16_10_1D \
+#define IDCT16_10 \
     /* Stage2 */ \
     { \
       const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
@@ -1437,7 +1437,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
       array_transpose_8x8(in, in);
       array_transpose_8x8(in+8, in+8);
 
-      IDCT16_1D
+      IDCT16
 
       // Stage7
       curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -1465,7 +1465,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
       array_transpose_8x8(l+i*8, in);
       array_transpose_8x8(r+i*8, in+8);
 
-      IDCT16_1D
+      IDCT16
 
       // 2-D
       in[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -1590,7 +1590,7 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   res0[15] = tbuf[7];
 }
 
-static void iadst16_1d_8col(__m128i *in) {
+static void iadst16_8col(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2060,7 +2060,7 @@ static void iadst16_1d_8col(__m128i *in) {
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-static void idct16_1d_8col(__m128i *in) {
+static void idct16_8col(__m128i *in) {
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -2404,16 +2404,16 @@ static void idct16_1d_8col(__m128i *in) {
   in[15] = _mm_sub_epi16(s[0], s[15]);
 }
 
-static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
-  idct16_1d_8col(in0);
-  idct16_1d_8col(in1);
+  idct16_8col(in0);
+  idct16_8col(in1);
 }
 
-static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void iadst16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
-  iadst16_1d_8col(in0);
-  iadst16_1d_8col(in1);
+  iadst16_8col(in0);
+  iadst16_8col(in1);
 }
 
 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
@@ -2502,20 +2502,20 @@ void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
 
   switch (tx_type) {
     case 0:  // DCT_DCT
-      idct16_1d_sse2(in0, in1);
-      idct16_1d_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
       break;
     case 1:  // ADST_DCT
-      idct16_1d_sse2(in0, in1);
-      iadst16_1d_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
       break;
     case 2:  // DCT_ADST
-      iadst16_1d_sse2(in0, in1);
-      idct16_1d_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
       break;
     case 3:  // ADST_ADST
-      iadst16_1d_sse2(in0, in1);
-      iadst16_1d_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
       break;
     default:
       assert(0);
@@ -2732,7 +2732,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
   for (i = 0; i < 2; i++) {
     array_transpose_4X8(l + 8*i, in);
 
-    IDCT16_10_1D
+    IDCT16_10
 
     // Stage7
     in[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -2814,7 +2814,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
     input += 8; \
   }  \
 
-#define IDCT32_1D_34 \
+#define IDCT32_34 \
 /* Stage1 */ \
 { \
   const __m128i zero = _mm_setzero_si128();\
@@ -3115,7 +3115,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 }
 
 
-#define IDCT32_1D \
+#define IDCT32 \
 /* Stage1 */ \
 { \
   const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
@@ -3554,7 +3554,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
   array_transpose_8x8(in+16, in+16);
   array_transpose_8x8(in+24, in+24);
 
-  IDCT32_1D
+  IDCT32
 
   // 1_D: Store 32 intermediate results for each 8x32 block.
   col[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3593,7 +3593,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
       const __m128i zero = _mm_setzero_si128();
       // Transpose 32x8 block to 8x32 block
       array_transpose_8x8(col+i*8, in);
-      IDCT32_1D_34
+      IDCT32_34
 
       // 2_D: Calculate the results and store them to destination.
       in[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3922,7 +3922,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
       array_transpose_8x8(in+16, in+16);
       array_transpose_8x8(in+24, in+24);
 
-      IDCT32_1D
+      IDCT32
 
       // 1_D: Store 32 intermediate results for each 8x32 block.
       col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3969,7 +3969,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
       array_transpose_8x8(col+j+64, in+16);
       array_transpose_8x8(col+j+96, in+24);
 
-      IDCT32_1D
+      IDCT32
 
       // 2_D: Calculate the results and store them to destination.
       in[0] = _mm_add_epi16(stp1_0, stp1_31);
diff --git a/vp9/common/x86/vp9_postproc_x86.h b/vp9/common/x86/vp9_postproc_x86.h
index 8870215a2..cab9d34f2 100644
--- a/vp9/common/x86/vp9_postproc_x86.h
+++ b/vp9/common/x86/vp9_postproc_x86.h
@@ -12,6 +12,10 @@
 #ifndef VP9_COMMON_X86_VP9_POSTPROC_X86_H_
 #define VP9_COMMON_X86_VP9_POSTPROC_X86_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* Note:
  *
  * This platform is commonly built for runtime CPU detection. If you modify
@@ -61,4 +65,8 @@ extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
 #endif
 #endif
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_X86_VP9_POSTPROC_X86_H_