author     Zoe Liu <zoeliu@google.com>  2015-07-22 10:40:42 -0700
committer  Zoe Liu <zoeliu@google.com>  2015-07-31 10:27:33 -0700
commit     7186a2dd863028faa54d2e5fc7995693488af63c (patch)
tree       62139af4f8934023aaa441b757107af434408812
parent     0e3f494b217bde5e1d47107cdfbb044e4d801cec (diff)
Code refactor on InterpKernel
In essence, this refactors the code for both interpolation filtering and convolution. The change moves the affected files into vpx_dsp and renames the code from the vp9_ prefix to the vpx_ prefix accordingly, for the underlying architectures: (1) x86, (2) arm/neon, and (3) mips/msa. The mips/dspr2 work will be done in a separate change list.

Change-Id: Ic3ce7fb7f81210db7628b373c73553db68793c46
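In practice the refactor is a pure rename plus relocation: the convolve entry points keep their signatures, and only the prefix and home directory change. A minimal before/after sketch, with the prototype taken from the hunks below:

/* Before: declared under vp9/common (see vp9_convolve.h in the diffstat). */
void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h);

/* After: declared in vpx_dsp/vpx_convolve.h, same signature. */
void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h);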
-rw-r--r--  test/convolve_test.cc | 153
-rw-r--r--  vp9/common/arm/neon/vp9_convolve8_avg_neon.c | 390
-rw-r--r--  vp9/common/arm/neon/vp9_convolve8_neon.c | 357
-rw-r--r--  vp9/common/arm/neon/vp9_convolve_avg_neon.c | 145
-rw-r--r--  vp9/common/arm/neon/vp9_copy_neon.c | 92
-rw-r--r--  vp9/common/vp9_entropymode.h | 3
-rw-r--r--  vp9/common/vp9_filter.h | 11
-rw-r--r--  vp9/common/vp9_idct.h | 3
-rw-r--r--  vp9/common/vp9_reconinter.c | 1
-rw-r--r--  vp9/common/vp9_reconinter.h | 4
-rw-r--r--  vp9/common/vp9_reconintra.c | 3
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl | 33
-rw-r--r--  vp9/common/vp9_scale.c | 107
-rw-r--r--  vp9/common/vp9_scale.h | 2
-rw-r--r--  vp9/decoder/vp9_decodeframe.c | 1
-rw-r--r--  vp9/encoder/vp9_blockiness.c | 6
-rw-r--r--  vp9/encoder/vp9_denoiser.c | 5
-rw-r--r--  vp9/encoder/vp9_encoder.c | 9
-rw-r--r--  vp9/encoder/vp9_pickmode.c | 12
-rw-r--r--  vp9/encoder/vp9_resize.c | 3
-rw-r--r--  vp9/vp9_common.mk | 42
-rw-r--r--  vpx_dsp/arm/vpx_convolve8_avg_neon.c | 393
-rw-r--r--  vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm) | 32
-rw-r--r--  vpx_dsp/arm/vpx_convolve8_neon.c | 360
-rw-r--r--  vpx_dsp/arm/vpx_convolve8_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_convolve8_neon_asm.asm) | 32
-rw-r--r--  vpx_dsp/arm/vpx_convolve_avg_neon.c | 147
-rw-r--r--  vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm) | 4
-rw-r--r--  vpx_dsp/arm/vpx_convolve_copy_neon.c | 94
-rw-r--r--  vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_copy_neon_asm.asm) | 4
-rw-r--r--  vpx_dsp/arm/vpx_convolve_neon.c (renamed from vp9/common/arm/neon/vp9_convolve_neon.c) | 20
-rw-r--r--  vpx_dsp/loopfilter.c | 2
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c (renamed from vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c) | 14
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_avg_msa.c (renamed from vp9/common/mips/msa/vp9_convolve8_avg_msa.c) | 16
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c (renamed from vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c) | 14
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_horiz_msa.c (renamed from vp9/common/mips/msa/vp9_convolve8_horiz_msa.c) | 14
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_msa.c (renamed from vp9/common/mips/msa/vp9_convolve8_msa.c) | 16
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_vert_msa.c (renamed from vp9/common/mips/msa/vp9_convolve8_vert_msa.c) | 14
-rw-r--r--  vpx_dsp/mips/vpx_convolve_avg_msa.c (renamed from vp9/common/mips/msa/vp9_convolve_avg_msa.c) | 2
-rw-r--r--  vpx_dsp/mips/vpx_convolve_copy_msa.c (renamed from vp9/common/mips/msa/vp9_convolve_copy_msa.c) | 2
-rw-r--r--  vpx_dsp/mips/vpx_convolve_msa.h (renamed from vp9/common/mips/msa/vp9_convolve_msa.h) | 8
-rw-r--r--  vpx_dsp/vpx_convolve.c (renamed from vp9/common/vp9_convolve.c) | 49
-rw-r--r--  vpx_dsp/vpx_convolve.h (renamed from vp9/common/vp9_convolve.h) | 6
-rw-r--r--  vpx_dsp/vpx_dsp.mk | 48
-rw-r--r--  vpx_dsp/vpx_dsp_common.h | 2
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl | 62
-rw-r--r--  vpx_dsp/vpx_filter.h | 34
-rw-r--r--  vpx_dsp/x86/convolve.h (renamed from vp9/common/x86/convolve.h) | 62
-rw-r--r--  vpx_dsp/x86/vpx_asm_stubs.c (renamed from vp9/common/x86/vp9_asm_stubs.c) | 124
-rw-r--r--  vpx_dsp/x86/vpx_convolve_copy_sse2.asm (renamed from vp9/common/x86/vp9_copy_sse2.asm) | 2
-rw-r--r--  vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm (renamed from vp9/common/x86/vp9_high_subpixel_8t_sse2.asm) | 60
-rw-r--r--  vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm (renamed from vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm) | 48
-rw-r--r--  vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c (renamed from vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c) | 66
-rw-r--r--  vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c (renamed from vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c) | 106
-rw-r--r--  vpx_dsp/x86/vpx_subpixel_8t_sse2.asm (renamed from vp9/common/x86/vp9_subpixel_8t_sse2.asm) | 60
-rw-r--r--  vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm (renamed from vp9/common/x86/vp9_subpixel_8t_ssse3.asm) | 60
-rw-r--r--  vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm (renamed from vp9/common/x86/vp9_subpixel_bilinear_sse2.asm) | 48
-rw-r--r--  vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm (renamed from vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm) | 48
57 files changed, 1777 insertions(+), 1678 deletions(-)
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index b66da683c..90f9579a3 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -14,12 +14,15 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@@ -945,7 +948,7 @@ void wrap_convolve8_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
+ vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
filter_x_stride, filter_y, filter_y_stride,
w, h, 8);
}
@@ -957,7 +960,7 @@ void wrap_convolve8_avg_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -969,7 +972,7 @@ void wrap_convolve8_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -981,7 +984,7 @@ void wrap_convolve8_avg_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -993,7 +996,7 @@ void wrap_convolve8_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1005,7 +1008,7 @@ void wrap_convolve8_avg_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1017,7 +1020,7 @@ void wrap_convolve8_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1029,7 +1032,7 @@ void wrap_convolve8_avg_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1041,7 +1044,7 @@ void wrap_convolve8_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1053,7 +1056,7 @@ void wrap_convolve8_avg_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1065,7 +1068,7 @@ void wrap_convolve8_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1077,7 +1080,7 @@ void wrap_convolve8_avg_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1089,7 +1092,7 @@ void wrap_convolve8_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1101,7 +1104,7 @@ void wrap_convolve8_avg_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1113,7 +1116,7 @@ void wrap_convolve8_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1125,7 +1128,7 @@ void wrap_convolve8_avg_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1137,7 +1140,7 @@ void wrap_convolve8_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1149,7 +1152,7 @@ void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1162,7 +1165,7 @@ void wrap_convolve_copy_c_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1174,7 +1177,7 @@ void wrap_convolve_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1186,7 +1189,7 @@ void wrap_convolve8_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1198,7 +1201,7 @@ void wrap_convolve8_avg_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1210,7 +1213,7 @@ void wrap_convolve8_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1222,7 +1225,7 @@ void wrap_convolve8_avg_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1234,7 +1237,7 @@ void wrap_convolve8_c_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1246,7 +1249,7 @@ void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1258,7 +1261,7 @@ void wrap_convolve_copy_c_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1270,7 +1273,7 @@ void wrap_convolve_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1282,7 +1285,7 @@ void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1294,7 +1297,7 @@ void wrap_convolve8_avg_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1306,7 +1309,7 @@ void wrap_convolve8_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1318,7 +1321,7 @@ void wrap_convolve8_avg_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1330,7 +1333,7 @@ void wrap_convolve8_c_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1342,7 +1345,7 @@ void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1354,7 +1357,7 @@ void wrap_convolve_copy_c_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1366,7 +1369,7 @@ void wrap_convolve_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1378,7 +1381,7 @@ void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1390,7 +1393,7 @@ void wrap_convolve8_avg_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1402,7 +1405,7 @@ void wrap_convolve8_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1414,7 +1417,7 @@ void wrap_convolve8_avg_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1426,7 +1429,7 @@ void wrap_convolve8_c_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1438,7 +1441,7 @@ void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1504,10 +1507,10 @@ INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
#else
const ConvolveFunctions convolve8_c(
- vp9_convolve_copy_c, vp9_convolve_avg_c,
- vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
- vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
- vp9_convolve8_c, vp9_convolve8_avg_c, 0);
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
+ vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c,
+ vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c,
+ vpx_convolve8_c, vpx_convolve8_avg_c, 0);
INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_c),
@@ -1585,13 +1588,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
#else
const ConvolveFunctions convolve8_sse2(
#if CONFIG_USE_X86INC
- vp9_convolve_copy_sse2, vp9_convolve_avg_sse2,
+ vpx_convolve_copy_sse2, vpx_convolve_avg_sse2,
#else
- vp9_convolve_copy_c, vp9_convolve_avg_c,
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
#endif // CONFIG_USE_X86INC
- vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
- vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
- vp9_convolve8_sse2, vp9_convolve8_avg_sse2, 0);
+ vpx_convolve8_horiz_sse2, vpx_convolve8_avg_horiz_sse2,
+ vpx_convolve8_vert_sse2, vpx_convolve8_avg_vert_sse2,
+ vpx_convolve8_sse2, vpx_convolve8_avg_sse2, 0);
INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_sse2),
@@ -1612,10 +1615,10 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
- vp9_convolve_copy_c, vp9_convolve_avg_c,
- vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
- vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3,
- vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3, 0);
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
+ vpx_convolve8_horiz_ssse3, vpx_convolve8_avg_horiz_ssse3,
+ vpx_convolve8_vert_ssse3, vpx_convolve8_avg_vert_ssse3,
+ vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3, 0);
INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_ssse3),
@@ -1635,10 +1638,10 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
#if HAVE_AVX2 && HAVE_SSSE3
const ConvolveFunctions convolve8_avx2(
- vp9_convolve_copy_c, vp9_convolve_avg_c,
- vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
- vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
- vp9_convolve8_avx2, vp9_convolve8_avg_ssse3, 0);
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
+ vpx_convolve8_horiz_avx2, vpx_convolve8_avg_horiz_ssse3,
+ vpx_convolve8_vert_avx2, vpx_convolve8_avg_vert_ssse3,
+ vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, 0);
INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_avx2),
@@ -1659,16 +1662,16 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
#if HAVE_NEON
#if HAVE_NEON_ASM
const ConvolveFunctions convolve8_neon(
- vp9_convolve_copy_neon, vp9_convolve_avg_neon,
- vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
- vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
- vp9_convolve8_neon, vp9_convolve8_avg_neon, 0);
+ vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+ vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+ vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+ vpx_convolve8_neon, vpx_convolve8_avg_neon, 0);
#else // HAVE_NEON
const ConvolveFunctions convolve8_neon(
- vp9_convolve_copy_neon, vp9_convolve_avg_neon,
- vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
- vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
- vp9_convolve8_neon, vp9_convolve8_avg_neon, 0);
+ vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+ vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+ vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+ vpx_convolve8_neon, vpx_convolve8_avg_neon, 0);
#endif // HAVE_NEON_ASM
INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
@@ -1689,10 +1692,10 @@ INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
#if HAVE_DSPR2
const ConvolveFunctions convolve8_dspr2(
- vp9_convolve_copy_dspr2, vp9_convolve_avg_dspr2,
- vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2,
- vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2,
- vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2, 0);
+ vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
+ vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
+ vpx_convolve8_vert_dspr2, vpx_convolve8_avg_vert_dspr2,
+ vpx_convolve8_dspr2, vpx_convolve8_avg_dspr2, 0);
INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_dspr2),
@@ -1712,10 +1715,10 @@ INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
#if HAVE_MSA
const ConvolveFunctions convolve8_msa(
- vp9_convolve_copy_msa, vp9_convolve_avg_msa,
- vp9_convolve8_horiz_msa, vp9_convolve8_avg_horiz_msa,
- vp9_convolve8_vert_msa, vp9_convolve8_avg_vert_msa,
- vp9_convolve8_msa, vp9_convolve8_avg_msa, 0);
+ vpx_convolve_copy_msa, vpx_convolve_avg_msa,
+ vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
+ vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa,
+ vpx_convolve8_msa, vpx_convolve8_avg_msa, 0);
INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_msa),
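The wrappers renamed throughout this test exist because the high-bitdepth kernels take a trailing bit-depth argument that the common convolve signature lacks; each wrapper pins the bit depth to 8, 10, or 12 so the function fits the shared ConvolveFunctions table. A representative wrapper from the hunks above (the /*bd=*/ annotation is added here for clarity):

void wrap_convolve8_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int filter_x_stride,
                                 const int16_t *filter_y, int filter_y_stride,
                                 int w, int h) {
  /* Bind the extra bit-depth parameter so the highbd kernel matches the
   * 10-argument signature the test harness expects. */
  vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
                                  filter_x, filter_x_stride,
                                  filter_y, filter_y_stride, w, h, /*bd=*/8);
}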
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
deleted file mode 100644
index dd569d348..000000000
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
+++ /dev/null
@@ -1,390 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(
- int16x4_t dsrc0,
- int16x4_t dsrc1,
- int16x4_t dsrc2,
- int16x4_t dsrc3,
- int16x4_t dsrc4,
- int16x4_t dsrc5,
- int16x4_t dsrc6,
- int16x4_t dsrc7,
- int16x8_t q0s16) {
- int32x4_t qdst;
- int16x4_t d0s16, d1s16;
-
- d0s16 = vget_low_s16(q0s16);
- d1s16 = vget_high_s16(q0s16);
-
- qdst = vmull_lane_s16(dsrc0, d0s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
- qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
- return qdst;
-}
-
-void vp9_convolve8_avg_horiz_neon(
- uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x,
- int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w,
- int h) {
- int width;
- uint8_t *s, *d;
- uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
- uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
- uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
- uint16x8x2_t q0x2u16;
- uint8x8x2_t d0x2u8, d1x2u8;
- uint32x2x2_t d0x2u32;
- uint16x4x2_t d0x2u16, d1x2u16;
- uint32x4x2_t q0x2u32;
-
- if (x_step_q4 != 16) {
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- q0s16 = vld1q_s16(filter_x);
-
- src -= 3; // adjust for taps
- for (; h > 0; h -= 4) { // loop_horiz_v
- s = src;
- d24u8 = vld1_u8(s);
- s += src_stride;
- d25u8 = vld1_u8(s);
- s += src_stride;
- d26u8 = vld1_u8(s);
- s += src_stride;
- d27u8 = vld1_u8(s);
-
- q12u8 = vcombine_u8(d24u8, d25u8);
- q13u8 = vcombine_u8(d26u8, d27u8);
-
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
- d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
- d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
- d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
- d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
- d0x2u8 = vtrn_u8(d24u8, d25u8);
- d1x2u8 = vtrn_u8(d26u8, d27u8);
-
- __builtin_prefetch(src + src_stride * 4);
- __builtin_prefetch(src + src_stride * 5);
-
- q8u16 = vmovl_u8(d0x2u8.val[0]);
- q9u16 = vmovl_u8(d0x2u8.val[1]);
- q10u16 = vmovl_u8(d1x2u8.val[0]);
- q11u16 = vmovl_u8(d1x2u8.val[1]);
-
- src += 7;
- d16u16 = vget_low_u16(q8u16);
- d17u16 = vget_high_u16(q8u16);
- d18u16 = vget_low_u16(q9u16);
- d19u16 = vget_high_u16(q9u16);
- q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
- q9u16 = vcombine_u16(d17u16, d19u16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w;
- width > 0;
- width -= 4, src += 4, dst += 4) { // loop_horiz
- s = src;
- d28u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d29u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d31u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d30u32 = vld1_dup_u32((const uint32_t *)s);
-
- __builtin_prefetch(src + 64);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
- vreinterpret_u8_u16(d1x2u16.val[0])); // d29
- d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
- vreinterpret_u8_u16(d1x2u16.val[1])); // d30
-
- __builtin_prefetch(src + 64 + src_stride);
-
- q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
-
- d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
- d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
- q12u16 = vmovl_u8(d28u8);
- q13u16 = vmovl_u8(d29u8);
-
- __builtin_prefetch(src + 64 + src_stride * 2);
-
- d = dst;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
- d += dst_stride;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- __builtin_prefetch(src + 64 + src_stride * 3);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
- d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
- vreinterpret_u32_u16(d0x2u16.val[1]));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
- vreinterpret_u8_u32(d0x2u32.val[1]));
-
- q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
- q1u8 = vrhaddq_u8(q1u8, q3u8);
-
- d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
- d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
- d = dst;
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
- q8u16 = q9u16;
- d20s16 = d23s16;
- q11u16 = q12u16;
- q9u16 = q13u16;
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- }
- src += src_stride * 4 - w - 7;
- dst += dst_stride * 4 - w;
- }
- return;
-}
-
-void vp9_convolve8_avg_vert_neon(
- uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y,
- int y_step_q4,
- int w,
- int h) {
- int height;
- uint8_t *s, *d;
- uint8x8_t d2u8, d3u8;
- uint32x2_t d2u32, d3u32, d6u32, d7u32;
- uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
- uint8x16_t q1u8, q3u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
-
- if (y_step_q4 != 16) {
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- src -= src_stride * 3;
- q0s16 = vld1q_s16(filter_y);
- for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
- s = src;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
- s += src_stride;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
- s += src_stride;
- d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
- s += src_stride;
-
- q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
- q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
- q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
- q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d = dst;
- for (height = h; height > 0; height -= 4) { // loop_vert
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
- s += src_stride;
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
- s += src_stride;
-
- q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
- q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
- d += dst_stride;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
- d -= dst_stride * 3;
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- __builtin_prefetch(s);
- __builtin_prefetch(s + src_stride);
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
- d20s16, d21s16, d22s16, d24s16, q0s16);
- __builtin_prefetch(s + src_stride * 2);
- __builtin_prefetch(s + src_stride * 3);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
- d21s16, d22s16, d24s16, d26s16, q0s16);
- __builtin_prefetch(d);
- __builtin_prefetch(d + dst_stride);
- q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
- d22s16, d24s16, d26s16, d27s16, q0s16);
- __builtin_prefetch(d + dst_stride * 2);
- __builtin_prefetch(d + dst_stride * 3);
- q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- q1u8 = vcombine_u8(d2u8, d3u8);
- q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
- q1u8 = vrhaddq_u8(q1u8, q3u8);
-
- d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
- d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
- d += dst_stride;
-
- q8u16 = q10u16;
- d18s16 = d22s16;
- d19s16 = d24s16;
- q10u16 = q13u16;
- d22s16 = d25s16;
- }
- }
- return;
-}
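For reference, the NEON kernel deleted above computes, per output pixel, an 8-tap dot product rounded by FILTER_BITS (7) and saturated to 8 bits: MULTIPLY_BY_Q0 is the multiply-accumulate, vqrshrun_n_s32(sum, 7) the rounding shift with unsigned saturation, and vqmovn_u16 the final narrowing; the _avg variants then blend with the existing dst via a rounding average (vrhaddq_u8). A scalar sketch of that per-pixel path (not the library's reference code; the helper name is illustrative):

#include <stdint.h>

static uint8_t convolve8_round_sat(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k)
    sum += src[k] * filter[k];     /* 8-tap dot product (MULTIPLY_BY_Q0) */
  sum = (sum + 64) >> 7;           /* round to nearest, FILTER_BITS == 7 */
  if (sum < 0) sum = 0;            /* unsigned saturation, as in        */
  if (sum > 255) sum = 255;        /* vqrshrun_n_s32 + vqmovn_u16       */
  return (uint8_t)sum;
}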
diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.c b/vp9/common/arm/neon/vp9_convolve8_neon.c
deleted file mode 100644
index 5c555c458..000000000
--- a/vp9/common/arm/neon/vp9_convolve8_neon.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(
- int16x4_t dsrc0,
- int16x4_t dsrc1,
- int16x4_t dsrc2,
- int16x4_t dsrc3,
- int16x4_t dsrc4,
- int16x4_t dsrc5,
- int16x4_t dsrc6,
- int16x4_t dsrc7,
- int16x8_t q0s16) {
- int32x4_t qdst;
- int16x4_t d0s16, d1s16;
-
- d0s16 = vget_low_s16(q0s16);
- d1s16 = vget_high_s16(q0s16);
-
- qdst = vmull_lane_s16(dsrc0, d0s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
- qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
- return qdst;
-}
-
-void vp9_convolve8_horiz_neon(
- uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x,
- int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w,
- int h) {
- int width;
- uint8_t *s, *d, *psrc, *pdst;
- uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
- uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
- uint8x16_t q12u8, q13u8, q14u8, q15u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
- uint16x8x2_t q0x2u16;
- uint8x8x2_t d0x2u8, d1x2u8;
- uint32x2x2_t d0x2u32;
- uint16x4x2_t d0x2u16, d1x2u16;
- uint32x4x2_t q0x2u32;
-
- if (x_step_q4 != 16) {
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- q0s16 = vld1q_s16(filter_x);
-
- src -= 3; // adjust for taps
- for (; h > 0; h -= 4,
- src += src_stride * 4,
- dst += dst_stride * 4) { // loop_horiz_v
- s = src;
- d24u8 = vld1_u8(s);
- s += src_stride;
- d25u8 = vld1_u8(s);
- s += src_stride;
- d26u8 = vld1_u8(s);
- s += src_stride;
- d27u8 = vld1_u8(s);
-
- q12u8 = vcombine_u8(d24u8, d25u8);
- q13u8 = vcombine_u8(d26u8, d27u8);
-
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
- d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
- d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
- d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
- d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
- d0x2u8 = vtrn_u8(d24u8, d25u8);
- d1x2u8 = vtrn_u8(d26u8, d27u8);
-
- __builtin_prefetch(src + src_stride * 4);
- __builtin_prefetch(src + src_stride * 5);
- __builtin_prefetch(src + src_stride * 6);
-
- q8u16 = vmovl_u8(d0x2u8.val[0]);
- q9u16 = vmovl_u8(d0x2u8.val[1]);
- q10u16 = vmovl_u8(d1x2u8.val[0]);
- q11u16 = vmovl_u8(d1x2u8.val[1]);
-
- d16u16 = vget_low_u16(q8u16);
- d17u16 = vget_high_u16(q8u16);
- d18u16 = vget_low_u16(q9u16);
- d19u16 = vget_high_u16(q9u16);
- q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
- q9u16 = vcombine_u16(d17u16, d19u16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w, psrc = src + 7, pdst = dst;
- width > 0;
- width -= 4, psrc += 4, pdst += 4) { // loop_horiz
- s = psrc;
- d28u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d29u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d31u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d30u32 = vld1_dup_u32((const uint32_t *)s);
-
- __builtin_prefetch(psrc + 64);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
- vreinterpret_u8_u16(d1x2u16.val[0])); // d29
- d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
- vreinterpret_u8_u16(d1x2u16.val[1])); // d30
-
- __builtin_prefetch(psrc + 64 + src_stride);
-
- q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
-
- d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
- d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
- q12u16 = vmovl_u8(d28u8);
- q13u16 = vmovl_u8(d29u8);
-
- __builtin_prefetch(psrc + 64 + src_stride * 2);
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- __builtin_prefetch(psrc + 60 + src_stride * 3);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
- d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
- vreinterpret_u32_u16(d0x2u16.val[1]));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
- vreinterpret_u8_u32(d0x2u32.val[1]));
-
- d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
- d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
-
- d = pdst;
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
- q8u16 = q9u16;
- d20s16 = d23s16;
- q11u16 = q12u16;
- q9u16 = q13u16;
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- }
- }
- return;
-}
-
-void vp9_convolve8_vert_neon(
- uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y,
- int y_step_q4,
- int w,
- int h) {
- int height;
- uint8_t *s, *d;
- uint32x2_t d2u32, d3u32;
- uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
-
- if (y_step_q4 != 16) {
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- src -= src_stride * 3;
- q0s16 = vld1q_s16(filter_y);
- for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
- s = src;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
- s += src_stride;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
- s += src_stride;
- d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
- s += src_stride;
-
- q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
- q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
- q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
- q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d = dst;
- for (height = h; height > 0; height -= 4) { // loop_vert
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
- s += src_stride;
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
- s += src_stride;
-
- q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
- q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- __builtin_prefetch(d);
- __builtin_prefetch(d + dst_stride);
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
- d20s16, d21s16, d22s16, d24s16, q0s16);
- __builtin_prefetch(d + dst_stride * 2);
- __builtin_prefetch(d + dst_stride * 3);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
- d21s16, d22s16, d24s16, d26s16, q0s16);
- __builtin_prefetch(s);
- __builtin_prefetch(s + src_stride);
- q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
- d22s16, d24s16, d26s16, d27s16, q0s16);
- __builtin_prefetch(s + src_stride * 2);
- __builtin_prefetch(s + src_stride * 3);
- q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
- d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
-
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
- d += dst_stride;
-
- q8u16 = q10u16;
- d18s16 = d22s16;
- d19s16 = d24s16;
- q10u16 = q13u16;
- d22s16 = d25s16;
- }
- }
- return;
-}
diff --git a/vp9/common/arm/neon/vp9_convolve_avg_neon.c b/vp9/common/arm/neon/vp9_convolve_avg_neon.c
deleted file mode 100644
index 3a3db353e..000000000
--- a/vp9/common/arm/neon/vp9_convolve_avg_neon.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-void vp9_convolve_avg_neon(
- const uint8_t *src, // r0
- ptrdiff_t src_stride, // r1
- uint8_t *dst, // r2
- ptrdiff_t dst_stride, // r3
- const int16_t *filter_x,
- int filter_x_stride,
- const int16_t *filter_y,
- int filter_y_stride,
- int w,
- int h) {
- uint8_t *d;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- uint32x2_t d0u32, d2u32;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- d = dst;
- if (w > 32) { // avg64
- for (; h > 0; h -= 1) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- q2u8 = vld1q_u8(src + 32);
- q3u8 = vld1q_u8(src + 48);
- src += src_stride;
- q8u8 = vld1q_u8(d);
- q9u8 = vld1q_u8(d + 16);
- q10u8 = vld1q_u8(d + 32);
- q11u8 = vld1q_u8(d + 48);
- d += dst_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q8u8);
- q1u8 = vrhaddq_u8(q1u8, q9u8);
- q2u8 = vrhaddq_u8(q2u8, q10u8);
- q3u8 = vrhaddq_u8(q3u8, q11u8);
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- vst1q_u8(dst + 32, q2u8);
- vst1q_u8(dst + 48, q3u8);
- dst += dst_stride;
- }
- } else if (w == 32) { // avg32
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- src += src_stride;
- q2u8 = vld1q_u8(src);
- q3u8 = vld1q_u8(src + 16);
- src += src_stride;
- q8u8 = vld1q_u8(d);
- q9u8 = vld1q_u8(d + 16);
- d += dst_stride;
- q10u8 = vld1q_u8(d);
- q11u8 = vld1q_u8(d + 16);
- d += dst_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q8u8);
- q1u8 = vrhaddq_u8(q1u8, q9u8);
- q2u8 = vrhaddq_u8(q2u8, q10u8);
- q3u8 = vrhaddq_u8(q3u8, q11u8);
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- dst += dst_stride;
- vst1q_u8(dst, q2u8);
- vst1q_u8(dst + 16, q3u8);
- dst += dst_stride;
- }
- } else if (w > 8) { // avg16
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- src += src_stride;
- q1u8 = vld1q_u8(src);
- src += src_stride;
- q2u8 = vld1q_u8(d);
- d += dst_stride;
- q3u8 = vld1q_u8(d);
- d += dst_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q2u8);
- q1u8 = vrhaddq_u8(q1u8, q3u8);
-
- vst1q_u8(dst, q0u8);
- dst += dst_stride;
- vst1q_u8(dst, q1u8);
- dst += dst_stride;
- }
- } else if (w == 8) { // avg8
- for (; h > 0; h -= 2) {
- d0u8 = vld1_u8(src);
- src += src_stride;
- d1u8 = vld1_u8(src);
- src += src_stride;
- d2u8 = vld1_u8(d);
- d += dst_stride;
- d3u8 = vld1_u8(d);
- d += dst_stride;
-
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- q0u8 = vrhaddq_u8(q0u8, q1u8);
-
- vst1_u8(dst, vget_low_u8(q0u8));
- dst += dst_stride;
- vst1_u8(dst, vget_high_u8(q0u8));
- dst += dst_stride;
- }
- } else { // avg4
- for (; h > 0; h -= 2) {
- d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
- src += src_stride;
- d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
- src += src_stride;
- d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
- d += dst_stride;
- d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
- d += dst_stride;
-
- d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
- vreinterpret_u8_u32(d2u32));
-
- d0u32 = vreinterpret_u32_u8(d0u8);
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, d0u32, 1);
- dst += dst_stride;
- }
- }
- return;
-}
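The averaging kernel deleted above is simply a rounding average of source and destination: vrhaddq_u8/vrhadd_u8 compute (a + b + 1) >> 1 per byte, and the w > 32 / 32 / 16 / 8 / 4 branches only differ in how many bytes are loaded per iteration. A scalar equivalent (a sketch; the function name is illustrative):

#include <stddef.h>
#include <stdint.h>

static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);  /* vrhadd per byte */
    src += src_stride;
    dst += dst_stride;
  }
}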
diff --git a/vp9/common/arm/neon/vp9_copy_neon.c b/vp9/common/arm/neon/vp9_copy_neon.c
deleted file mode 100644
index f334abe11..000000000
--- a/vp9/common/arm/neon/vp9_copy_neon.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-void vp9_convolve_copy_neon(
- const uint8_t *src, // r0
- ptrdiff_t src_stride, // r1
- uint8_t *dst, // r2
- ptrdiff_t dst_stride, // r3
- const int16_t *filter_x,
- int filter_x_stride,
- const int16_t *filter_y,
- int filter_y_stride,
- int w,
- int h) {
- uint8x8_t d0u8, d2u8;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- if (w > 32) { // copy64
- for (; h > 0; h--) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- q2u8 = vld1q_u8(src + 32);
- q3u8 = vld1q_u8(src + 48);
- src += src_stride;
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- vst1q_u8(dst + 32, q2u8);
- vst1q_u8(dst + 48, q3u8);
- dst += dst_stride;
- }
- } else if (w == 32) { // copy32
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- src += src_stride;
- q2u8 = vld1q_u8(src);
- q3u8 = vld1q_u8(src + 16);
- src += src_stride;
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- dst += dst_stride;
- vst1q_u8(dst, q2u8);
- vst1q_u8(dst + 16, q3u8);
- dst += dst_stride;
- }
- } else if (w > 8) { // copy16
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- src += src_stride;
- q1u8 = vld1q_u8(src);
- src += src_stride;
-
- vst1q_u8(dst, q0u8);
- dst += dst_stride;
- vst1q_u8(dst, q1u8);
- dst += dst_stride;
- }
- } else if (w == 8) { // copy8
- for (; h > 0; h -= 2) {
- d0u8 = vld1_u8(src);
- src += src_stride;
- d2u8 = vld1_u8(src);
- src += src_stride;
-
- vst1_u8(dst, d0u8);
- dst += dst_stride;
- vst1_u8(dst, d2u8);
- dst += dst_stride;
- }
- } else { // copy4
- for (; h > 0; h--) {
- *(uint32_t *)dst = *(const uint32_t *)src;
- src += src_stride;
- dst += dst_stride;
- }
- }
- return;
-}
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 371738aba..0285be155 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -11,9 +11,10 @@
#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
#define VP9_COMMON_VP9_ENTROPYMODE_H_
-#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
extern "C" {
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 40d6a0d6a..efa24bc67 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
@@ -20,13 +21,6 @@
extern "C" {
#endif
-#define FILTER_BITS 7
-
-#define SUBPEL_BITS 4
-#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
-#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
-#define SUBPEL_TAPS 8
-
#define EIGHTTAP 0
#define EIGHTTAP_SMOOTH 1
#define EIGHTTAP_SHARP 2
@@ -36,9 +30,8 @@ extern "C" {
// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
#define SWITCHABLE 4 /* should be the last one */
-typedef uint8_t INTERP_FILTER;
-typedef int16_t InterpKernel[SUBPEL_TAPS];
+typedef uint8_t INTERP_FILTER;
extern const InterpKernel *vp9_filter_kernels[4];
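Annotation: the subpel constants and the InterpKernel typedef removed above relocate, presumably verbatim, into vpx_dsp/vpx_filter.h (the include added at the top of this header). For reference, the relocated definitions are:

    #define FILTER_BITS 7

    #define SUBPEL_BITS 4
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
    #define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
    #define SUBPEL_TAPS 8

    typedef int16_t InterpKernel[SUBPEL_TAPS];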
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 2aa8ee978..1a4ea2f04 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -15,6 +15,9 @@
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 6d38fabd1..db9971da7 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -16,7 +16,6 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 7f63f5a30..9bc62900a 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -11,8 +11,10 @@
#ifndef VP9_COMMON_VP9_RECONINTER_H_
#define VP9_COMMON_VP9_RECONINTER_H_
-#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
extern "C" {
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index d87a0430b..46f08a7bd 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -11,6 +11,9 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_once.h"
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index ee99d7a5e..883733f55 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -54,12 +54,6 @@ if ($opts{arch} eq "x86_64") {
$avx2_x86_64 = 'avx2';
}
-# optimizations which depend on multiple features
-$avx2_ssse3 = '';
-if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
- $avx2_ssse3 = 'avx2';
-}
-
#
# post proc
#
@@ -88,33 +82,6 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
}
#
-# Sub Pixel Filters
-#
-add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
-
-#
# dct
#
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
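Annotation: all of the removed convolve entries share one C signature; the vpx_dsp replacements (presumably registered in vpx_dsp_rtcd_defs.pl, which is not shown in this patch) keep the signature and change only the prefix:

    /* Common prototype of every convolve function removed above,
       under its new vpx_ name. */
    void vpx_convolve8(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *filter_x, int x_step_q4,
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h);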
diff --git a/vp9/common/vp9_scale.c b/vp9/common/vp9_scale.c
index 6db8f9caa..8f5c72e7c 100644
--- a/vp9/common/vp9_scale.c
+++ b/vp9/common/vp9_scale.c
@@ -8,9 +8,10 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_scale.h"
+#include "vpx_dsp/vpx_filter.h"
static INLINE int scaled_x(int val, const struct scale_factors *sf) {
return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT);
@@ -81,85 +82,85 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
if (sf->x_step_q4 == 16) {
if (sf->y_step_q4 == 16) {
// No scaling in either direction.
- sf->predict[0][0][0] = vp9_convolve_copy;
- sf->predict[0][0][1] = vp9_convolve_avg;
- sf->predict[0][1][0] = vp9_convolve8_vert;
- sf->predict[0][1][1] = vp9_convolve8_avg_vert;
- sf->predict[1][0][0] = vp9_convolve8_horiz;
- sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ sf->predict[0][0][0] = vpx_convolve_copy;
+ sf->predict[0][0][1] = vpx_convolve_avg;
+ sf->predict[0][1][0] = vpx_convolve8_vert;
+ sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+ sf->predict[1][0][0] = vpx_convolve8_horiz;
+ sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
- sf->predict[0][0][0] = vp9_convolve8_vert;
- sf->predict[0][0][1] = vp9_convolve8_avg_vert;
- sf->predict[0][1][0] = vp9_convolve8_vert;
- sf->predict[0][1][1] = vp9_convolve8_avg_vert;
- sf->predict[1][0][0] = vp9_convolve8;
- sf->predict[1][0][1] = vp9_convolve8_avg;
+ sf->predict[0][0][0] = vpx_convolve8_vert;
+ sf->predict[0][0][1] = vpx_convolve8_avg_vert;
+ sf->predict[0][1][0] = vpx_convolve8_vert;
+ sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+ sf->predict[1][0][0] = vpx_convolve8;
+ sf->predict[1][0][1] = vpx_convolve8_avg;
}
} else {
if (sf->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
- sf->predict[0][0][0] = vp9_convolve8_horiz;
- sf->predict[0][0][1] = vp9_convolve8_avg_horiz;
- sf->predict[0][1][0] = vp9_convolve8;
- sf->predict[0][1][1] = vp9_convolve8_avg;
- sf->predict[1][0][0] = vp9_convolve8_horiz;
- sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ sf->predict[0][0][0] = vpx_convolve8_horiz;
+ sf->predict[0][0][1] = vpx_convolve8_avg_horiz;
+ sf->predict[0][1][0] = vpx_convolve8;
+ sf->predict[0][1][1] = vpx_convolve8_avg;
+ sf->predict[1][0][0] = vpx_convolve8_horiz;
+ sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
- sf->predict[0][0][0] = vp9_convolve8;
- sf->predict[0][0][1] = vp9_convolve8_avg;
- sf->predict[0][1][0] = vp9_convolve8;
- sf->predict[0][1][1] = vp9_convolve8_avg;
- sf->predict[1][0][0] = vp9_convolve8;
- sf->predict[1][0][1] = vp9_convolve8_avg;
+ sf->predict[0][0][0] = vpx_convolve8;
+ sf->predict[0][0][1] = vpx_convolve8_avg;
+ sf->predict[0][1][0] = vpx_convolve8;
+ sf->predict[0][1][1] = vpx_convolve8_avg;
+ sf->predict[1][0][0] = vpx_convolve8;
+ sf->predict[1][0][1] = vpx_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions
- sf->predict[1][1][0] = vp9_convolve8;
- sf->predict[1][1][1] = vp9_convolve8_avg;
+ sf->predict[1][1][0] = vpx_convolve8;
+ sf->predict[1][1][1] = vpx_convolve8_avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (use_highbd) {
if (sf->x_step_q4 == 16) {
if (sf->y_step_q4 == 16) {
// No scaling in either direction.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve_copy;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve_avg;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_vert;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_vert;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
}
} else {
if (sf->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_horiz;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_horiz;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions.
- sf->highbd_predict[1][1][0] = vp9_highbd_convolve8;
- sf->highbd_predict[1][1][1] = vp9_highbd_convolve8_avg;
+ sf->highbd_predict[1][1][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg;
}
#endif
}
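Annotation: the three predict indices select x-subpel filtering, y-subpel filtering, and averaging, which is why the [1][1][*] slots are assigned unconditionally after the branches. A hypothetical call site, with index names that are illustrative rather than taken from this patch:

    /* predict[have_subpel_x][have_subpel_y][do_avg] -- illustrative names. */
    sf->predict[subpel_x != 0][subpel_y != 0][do_avg](
        src, src_stride, dst, dst_stride,
        kernel[subpel_x], sf->x_step_q4,
        kernel[subpel_y], sf->y_step_q4, w, h);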
diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h
index a1601a72f..5e9104107 100644
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h
@@ -12,7 +12,7 @@
#define VP9_COMMON_VP9_SCALE_H_
#include "vp9/common/vp9_mv.h"
-#include "vp9/common/vp9_convolve.h"
+#include "vpx_dsp/vpx_convolve.h"
#ifdef __cplusplus
extern "C" {
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index c6d3bf19e..ecebe1efb 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -12,6 +12,7 @@
#include <stdlib.h> // qsort()
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_dsp/bitreader_buffer.h"
diff --git a/vp9/encoder/vp9_blockiness.c b/vp9/encoder/vp9_blockiness.c
index b8629bd3b..fc3eac6c7 100644
--- a/vp9/encoder/vp9_blockiness.c
+++ b/vp9/encoder/vp9_blockiness.c
@@ -8,12 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_filter.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
static int horizontal_filter(const uint8_t *s) {
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 08134e152..f1d73790a 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -10,6 +10,7 @@
#include <assert.h>
#include <limits.h>
+#include "./vpx_dsp_rtcd.h"
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
@@ -336,12 +337,12 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
}
if (decision == FILTER_BLOCK) {
- vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
NULL, 0, NULL, 0,
num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
} else { // COPY_BLOCK
- vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
NULL, 0, NULL, 0,
num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
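Annotation: the width/height arguments above convert 4x4-block units to pixels, and the NULL/0 filter arguments are part of the copy kernel's contract (the NEON implementation casts them to (void)). A sketch of the dimension arithmetic:

    /* 4x4-block units -> pixels; e.g. a 16x16 partition has
       num_4x4_blocks_wide_lookup[bs] == 4, so bw == 4 << 2 == 16. */
    const int bw = num_4x4_blocks_wide_lookup[bs] << 2;
    const int bh = num_4x4_blocks_high_lookup[bs] << 2;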
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 53e747c26..8aee22756 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -12,11 +12,12 @@
#include <stdio.h>
#include <limits.h>
-#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
@@ -2580,18 +2581,18 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
#if CONFIG_VP9_HIGHBITDEPTH
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor, bd);
} else {
- vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor);
}
#else
- vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor);
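Annotation: a worked example of the step arithmetic used by all three calls above (a standalone sketch; the dimensions are illustrative):

    #include <stdio.h>

    int main(void) {
      const int src_w = 1280, dst_w = 640;       /* 2:1 downscale */
      const int x_step_q4 = 16 * src_w / dst_w;  /* == 32 */
      /* Each output pixel advances x_step_q4 / 16 = 2 source pixels;
         x_q4 & 0xf selects one of the kernel's 16 subpel phases. */
      printf("x_step_q4 = %d\n", x_step_q4);
      return 0;
    }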
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 34bf8a6a1..00f7dcdf3 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1504,15 +1504,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh, xd->bd);
else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh);
#else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -1577,15 +1577,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh, xd->bd);
else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh);
#else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index f46cad804..e5ae9594c 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -15,6 +15,9 @@
#include <stdlib.h>
#include <string.h>
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_resize.h"
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 78ea63fa1..339bc8b6e 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -13,14 +13,10 @@ VP9_COMMON_SRCS-yes += vp9_iface_common.h
VP9_COMMON_SRCS-yes += common/vp9_ppflags.h
VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
VP9_COMMON_SRCS-yes += common/vp9_blockd.c
-VP9_COMMON_SRCS-yes += common/vp9_convolve.c
-VP9_COMMON_SRCS-yes += common/vp9_convolve.h
VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
VP9_COMMON_SRCS-yes += common/vp9_entropy.c
VP9_COMMON_SRCS-yes += common/vp9_entropymode.c
VP9_COMMON_SRCS-yes += common/vp9_entropymv.c
-VP9_COMMON_SRCS-yes += common/vp9_filter.c
-VP9_COMMON_SRCS-yes += common/vp9_filter.h
VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c
VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h
VP9_COMMON_SRCS-yes += common/vp9_idct.c
@@ -31,6 +27,8 @@ VP9_COMMON_SRCS-yes += common/vp9_entropy.h
VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
VP9_COMMON_SRCS-yes += common/vp9_enums.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.c
VP9_COMMON_SRCS-yes += common/vp9_idct.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
VP9_COMMON_SRCS-yes += common/vp9_thread_common.h
@@ -64,33 +62,16 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h
VP9_COMMON_SRCS-yes += common/vp9_scan.c
VP9_COMMON_SRCS-yes += common/vp9_scan.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
-ifeq ($(CONFIG_USE_X86INC),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
-endif
-
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
-endif
-
# common (c)
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
@@ -113,15 +94,6 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c
endif
# common (msa)
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
@@ -151,11 +123,6 @@ endif
# neon with assembly and intrinsics implementations. If both are available
# prefer assembly.
ifeq ($(HAVE_NEON_ASM), yes)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
@@ -167,11 +134,6 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
else
ifeq ($(HAVE_NEON), yes)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
diff --git a/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/vpx_dsp/arm/vpx_convolve8_avg_neon.c
new file mode 100644
index 000000000..5464250e6
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_neon.c
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
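Annotation: MULTIPLY_BY_Q0 evaluates the 8-tap dot product against the filter held in q0s16, four output pixels per call. Its per-pixel scalar model (a sketch, not part of the patch):

    /* Scalar model of MULTIPLY_BY_Q0 for a single output pixel. */
    static int32_t dot8(const int16_t *s, const int16_t *f) {
      int32_t sum = 0;
      int k;
      for (k = 0; k < 8; ++k)
        sum += (int32_t)s[k] * f[k];
      return sum;
    }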
+
+void vpx_convolve8_avg_horiz_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+  }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ src += 7;
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w;
+ width > 0;
+ width -= 4, src += 4, dst += 4) { // loop_horiz
+ s = src;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(src + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(src + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(src + 64 + src_stride * 2);
+
+ d = dst;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(src + 64 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ d = dst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ src += src_stride * 4 - w - 7;
+ dst += dst_stride * 4 - w;
+ }
+ return;
+}
+
+void vpx_convolve8_avg_vert_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t d2u8, d3u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ uint8x16_t q1u8, q3u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+ d -= dst_stride * 3;
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
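Annotation: both avg functions end with the same store path: vqrshrun_n_s32(..., 7) is a rounded right shift with unsigned saturation, and vrhaddq_u8 folds in the existing destination. A per-pixel scalar model (a sketch; FILTER_BITS == 7, so the rounding constant is 64):

    #include <stdint.h>

    static uint8_t clip_u8(int32_t v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* filtered = round(sum / 128), then rounding-average with dst. */
    static uint8_t avg_filter_px(int32_t sum, uint8_t dst_px) {
      const uint8_t filtered = clip_u8((sum + 64) >> 7);
      return (uint8_t)((filtered + dst_px + 1) >> 1);
    }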
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
index 4d85846f0..a19f97db7 100644
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm
+++ b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
@@ -17,10 +17,10 @@
; VP9_FILTER_WEIGHT == 128
; VP9_FILTER_SHIFT == 7
- EXPORT |vp9_convolve8_avg_horiz_neon|
- EXPORT |vp9_convolve8_avg_vert_neon|
- IMPORT |vp9_convolve8_avg_horiz_c|
- IMPORT |vp9_convolve8_avg_vert_c|
+ EXPORT |vpx_convolve8_avg_horiz_neon|
+ EXPORT |vpx_convolve8_avg_vert_neon|
+ IMPORT |vpx_convolve8_avg_horiz_c|
+ IMPORT |vpx_convolve8_avg_vert_c|
ARM
REQUIRE8
PRESERVE8
@@ -51,10 +51,10 @@
; sp[]int w
; sp[]int h
-|vp9_convolve8_avg_horiz_neon| PROC
+|vpx_convolve8_avg_horiz_neon| PROC
ldr r12, [sp, #4] ; x_step_q4
cmp r12, #16
- bne vp9_convolve8_avg_horiz_c
+ bne vpx_convolve8_avg_horiz_c
push {r4-r10, lr}
@@ -78,7 +78,7 @@
mov r10, r6 ; w loop counter
-vp9_convolve8_avg_loop_horiz_v
+vpx_convolve8_avg_loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
@@ -101,7 +101,7 @@ vp9_convolve8_avg_loop_horiz_v
add r0, r0, #3
-vp9_convolve8_avg_loop_horiz
+vpx_convolve8_avg_loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
@@ -170,23 +170,23 @@ vp9_convolve8_avg_loop_horiz
vmov q9, q13
subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_avg_loop_horiz
+ bgt vpx_convolve8_avg_loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
- bgt vp9_convolve8_avg_loop_horiz_v
+ bgt vpx_convolve8_avg_loop_horiz_v
pop {r4-r10, pc}
ENDP
-|vp9_convolve8_avg_vert_neon| PROC
+|vpx_convolve8_avg_vert_neon| PROC
ldr r12, [sp, #12]
cmp r12, #16
- bne vp9_convolve8_avg_vert_c
+ bne vpx_convolve8_avg_vert_c
push {r4-r8, lr}
@@ -203,7 +203,7 @@ vp9_convolve8_avg_loop_horiz
lsl r1, r1, #1
lsl r3, r3, #1
-vp9_convolve8_avg_loop_vert_h
+vpx_convolve8_avg_loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
@@ -223,7 +223,7 @@ vp9_convolve8_avg_loop_vert_h
vmovl.u8 q10, d20
vmovl.u8 q11, d22
-vp9_convolve8_avg_loop_vert
+vpx_convolve8_avg_loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
@@ -288,13 +288,13 @@ vp9_convolve8_avg_loop_vert
vmov d22, d25
subs r12, r12, #4 ; h -= 4
- bgt vp9_convolve8_avg_loop_vert
+ bgt vpx_convolve8_avg_loop_vert
; outer loop
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_avg_loop_vert_h
+ bgt vpx_convolve8_avg_loop_vert_h
pop {r4-r8, pc}
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c
new file mode 100644
index 000000000..6f634b3c7
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vpx_convolve8_horiz_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ const uint8_t *s, *psrc;
+ uint8_t *d, *pdst;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4,
+ src += src_stride * 4,
+ dst += dst_stride * 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+ __builtin_prefetch(src + src_stride * 6);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w, psrc = src + 7, pdst = dst;
+ width > 0;
+ width -= 4, psrc += 4, pdst += 4) { // loop_horiz
+ s = psrc;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(psrc + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(psrc + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+ d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+ d = pdst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ }
+ return;
+}
+
+void vpx_convolve8_vert_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint32x2_t d2u32, d3u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+ d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
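Annotation: given the src -= src_stride * 3 adjustment above, each output row is the 8-tap product of (adjusted) source rows y..y+7, which the register rotation at the bottom of loop_vert turns into a sliding window. A scalar reference for the unit-step vertical path (a sketch, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    static void convolve8_vert_ref(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter, int w, int h) {
      int x, y, k;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          int32_t sum = 0;
          for (k = 0; k < 8; ++k)
            sum += (int32_t)src[(y + k) * src_stride + x] * filter[k];
          sum = (sum + 64) >> 7;  /* rounded FILTER_BITS shift */
          dst[y * dst_stride + x] =
              (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        }
      }
    }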
diff --git a/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
index 184c3ad67..bc530de41 100644
--- a/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm
+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
@@ -17,10 +17,10 @@
; VP9_FILTER_WEIGHT == 128
; VP9_FILTER_SHIFT == 7
- EXPORT |vp9_convolve8_horiz_neon|
- EXPORT |vp9_convolve8_vert_neon|
- IMPORT |vp9_convolve8_horiz_c|
- IMPORT |vp9_convolve8_vert_c|
+ EXPORT |vpx_convolve8_horiz_neon|
+ EXPORT |vpx_convolve8_vert_neon|
+ IMPORT |vpx_convolve8_horiz_c|
+ IMPORT |vpx_convolve8_vert_c|
ARM
REQUIRE8
PRESERVE8
@@ -51,10 +51,10 @@
; sp[]int w
; sp[]int h
-|vp9_convolve8_horiz_neon| PROC
+|vpx_convolve8_horiz_neon| PROC
ldr r12, [sp, #4] ; x_step_q4
cmp r12, #16
- bne vp9_convolve8_horiz_c
+ bne vpx_convolve8_horiz_c
push {r4-r10, lr}
@@ -78,7 +78,7 @@
mov r10, r6 ; w loop counter
-vp9_convolve8_loop_horiz_v
+vpx_convolve8_loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
@@ -101,7 +101,7 @@ vp9_convolve8_loop_horiz_v
add r0, r0, #3
-vp9_convolve8_loop_horiz
+vpx_convolve8_loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
@@ -159,23 +159,23 @@ vp9_convolve8_loop_horiz
vmov q9, q13
subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_loop_horiz
+ bgt vpx_convolve8_loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
- bgt vp9_convolve8_loop_horiz_v
+ bgt vpx_convolve8_loop_horiz_v
pop {r4-r10, pc}
ENDP
-|vp9_convolve8_vert_neon| PROC
+|vpx_convolve8_vert_neon| PROC
ldr r12, [sp, #12]
cmp r12, #16
- bne vp9_convolve8_vert_c
+ bne vpx_convolve8_vert_c
push {r4-r8, lr}
@@ -192,7 +192,7 @@ vp9_convolve8_loop_horiz
lsl r1, r1, #1
lsl r3, r3, #1
-vp9_convolve8_loop_vert_h
+vpx_convolve8_loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
@@ -212,7 +212,7 @@ vp9_convolve8_loop_vert_h
vmovl.u8 q10, d20
vmovl.u8 q11, d22
-vp9_convolve8_loop_vert
+vpx_convolve8_loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
@@ -266,13 +266,13 @@ vp9_convolve8_loop_vert
vmov d22, d25
subs r12, r12, #4 ; h -= 4
- bgt vp9_convolve8_loop_vert
+ bgt vpx_convolve8_loop_vert
; outer loop
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_loop_vert_h
+ bgt vpx_convolve8_loop_vert_h
pop {r4-r8, pc}
diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon.c b/vpx_dsp/arm/vpx_convolve_avg_neon.c
new file mode 100644
index 000000000..dc58a332f
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_avg_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8_t *d;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint32x2_t d0u32, d2u32;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ d = dst;
+ if (w > 32) { // avg64
+ for (; h > 0; h -= 1) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ q10u8 = vld1q_u8(d + 32);
+ q11u8 = vld1q_u8(d + 48);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // avg32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+ q10u8 = vld1q_u8(d);
+ q11u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // avg16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+ q2u8 = vld1q_u8(d);
+ d += dst_stride;
+ q3u8 = vld1q_u8(d);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // avg8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d1u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(d);
+ d += dst_stride;
+ d3u8 = vld1_u8(d);
+ d += dst_stride;
+
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+ vst1_u8(dst, vget_low_u8(q0u8));
+ dst += dst_stride;
+ vst1_u8(dst, vget_high_u8(q0u8));
+ dst += dst_stride;
+ }
+ } else { // avg4
+ for (; h > 0; h -= 2) {
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+ src += src_stride;
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+ src += src_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+
+ d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+ vreinterpret_u8_u32(d2u32));
+
+ d0u32 = vreinterpret_u32_u8(d0u8);
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+ dst += dst_stride;
+ }
+ }
+ return;
+}
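
The averaging kernel above maps every byte through vrhaddq_u8/vrhadd_u8, NEON's rounding halving add. A plain-C sketch of what the vector code computes, offered only as a reference (function names here are illustrative, not part of libvpx):

    #include <stddef.h>
    #include <stdint.h>

    /* Per-byte operation of vrhaddq_u8: rounding average (a + b + 1) >> 1. */
    static uint8_t avg_round(uint8_t a, uint8_t b) {
      return (uint8_t)(((unsigned)a + b + 1) >> 1);
    }

    /* Width/height-generic equivalent of the whole kernel: each destination
     * byte becomes the rounded average of source and destination. */
    static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 int w, int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) dst[x] = avg_round(src[x], dst[x]);
        src += src_stride;
        dst += dst_stride;
      }
    }
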
diff --git a/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm b/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
index 7d2453021..97e6189fd 100644
--- a/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm
+++ b/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -8,14 +8,14 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_convolve_avg_neon|
+ EXPORT |vpx_convolve_avg_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-|vp9_convolve_avg_neon| PROC
+|vpx_convolve_avg_neon| PROC
push {r4-r6, lr}
ldrd r4, r5, [sp, #32]
mov r6, r2
diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c
new file mode 100644
index 000000000..d8fb97a86
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_copy_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8x8_t d0u8, d2u8;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ if (w > 32) { // copy64
+ for (; h > 0; h--) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // copy32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // copy16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // copy8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(src);
+ src += src_stride;
+
+ vst1_u8(dst, d0u8);
+ dst += dst_stride;
+ vst1_u8(dst, d2u8);
+ dst += dst_stride;
+ }
+ } else { // copy4
+ for (; h > 0; h--) {
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ return;
+}
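
Note that the copy4 branch moves each row through a direct uint32_t load/store, which leans on the target tolerating that access pattern on pixel pointers. A strictly portable sketch of the same branch (illustrative name) would route the row through memcpy, which compilers typically lower to the same single 32-bit move:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Portable equivalent of the copy4 branch: one 4-byte row per iteration,
     * without the alignment/strict-aliasing assumptions of *(uint32_t *). */
    static void copy4_ref(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride, int h) {
      for (; h > 0; --h) {
        memcpy(dst, src, 4);
        src += src_stride;
        dst += dst_stride;
      }
    }
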
diff --git a/vp9/common/arm/neon/vp9_copy_neon_asm.asm b/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
index a0bd04a35..89164ad48 100644
--- a/vp9/common/arm/neon/vp9_copy_neon_asm.asm
+++ b/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -8,14 +8,14 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_convolve_copy_neon|
+ EXPORT |vpx_convolve_copy_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-|vp9_convolve_copy_neon| PROC
+|vpx_convolve_copy_neon| PROC
push {r4-r5, lr}
ldrd r4, r5, [sp, #28]
diff --git a/vp9/common/arm/neon/vp9_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c
index 2e28cb20e..2c0f7e9e7 100644
--- a/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -8,11 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
-void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -26,7 +26,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16) {
- vp9_convolve8_c(src, src_stride,
+ vpx_convolve8_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
@@ -39,19 +39,19 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
 * the temp buffer, which has lots of extra room and is subsequently discarded,
 * this is safe, if somewhat less than ideal.
*/
- vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4,
w, intermediate_height);
/* Step into the temp buffer 3 lines to get the actual frame data */
- vp9_convolve8_vert_neon(temp + 64 * 3, 64,
+ vpx_convolve8_vert_neon(temp + 64 * 3, 64,
dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
}
-void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -60,7 +60,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16) {
- vp9_convolve8_avg_c(src, src_stride,
+ vpx_convolve8_avg_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
@@ -71,11 +71,11 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This implementation has the same issues as above. In addition, we only want
* to average the values after both passes.
*/
- vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4,
w, intermediate_height);
- vp9_convolve8_avg_vert_neon(temp + 64 * 3,
+ vpx_convolve8_avg_vert_neon(temp + 64 * 3,
64, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
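
The two-pass wrappers above run the horizontal filter into a 64-byte-stride scratch buffer sized at h + 7 rows: an 8-tap vertical filter needs 3 extra rows above the first output row and 4 below the last, which is also why the horizontal pass starts at src - src_stride * 3 and the vertical pass reads from temp + 64 * 3. A small sketch of that bookkeeping (names local to the sketch):

    /* An 8-tap vertical filter needs 3 rows above and 4 rows below each
     * output row, so the horizontal pass must produce h + TAPS - 1 rows. */
    enum { TAPS = 8, TEMP_STRIDE = 64 };

    static int intermediate_height(int h) {
      return h + TAPS - 1;            /* == h + 7 in the code above */
    }

    static int second_pass_start(void) {
      return TEMP_STRIDE * 3;         /* skip the 3 priming rows of temp */
    }
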
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index dc8aca5c3..66f4d9576 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdlib.h>
+
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
diff --git a/vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
index 89364cb95..c5ad08cc9 100644
--- a/vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
int32_t src_stride,
@@ -687,7 +687,7 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
}
}
-void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -695,14 +695,14 @@ void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
int8_t cnt, filt_hor[8];
if (16 != x_step_q4) {
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
}
if (((const int32_t *)filter_x)[1] == 0x800000) {
- vp9_convolve_avg(src, src_stride, dst, dst_stride,
+ vpx_convolve_avg(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
@@ -740,7 +740,7 @@ void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
&filt_hor[3], h);
break;
default:
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
@@ -773,7 +773,7 @@ void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
filt_hor, h);
break;
default:
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
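
The 0x800000 comparison above (repeated throughout the MSA files) is a compact test for the identity kernel: viewing taps 2 and 3 of the int16_t filter as one little-endian int32_t yields 0x00800000 exactly when tap 2 is 0 and tap 3 is 128 (unit gain at FILTER_BITS = 7), so filtering can collapse to a plain copy or average. A sketch of the equivalence, assuming the little-endian layout the MSA builds target:

    #include <stdint.h>

    /* ((const int32_t *)filter)[1] packs filter[2] (low half) and filter[3]
     * (high half) on a little-endian machine, so this test ... */
    static int is_identity_kernel(const int16_t *filter) {
      return ((const int32_t *)filter)[1] == 0x800000;
    }

    /* ... is the same as writing the taps out explicitly: */
    static int is_identity_kernel_explicit(const int16_t *filter) {
      return filter[2] == 0 && filter[3] == 128;
    }
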
diff --git a/vp9/common/mips/msa/vp9_convolve8_avg_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_msa.c
index e9f3a9dc3..f46df6782 100644
--- a/vp9/common/mips/msa/vp9_convolve8_avg_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
int32_t src_stride,
@@ -576,7 +576,7 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
}
}
-void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -584,7 +584,7 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
int8_t cnt, filt_hor[8], filt_ver[8];
if (16 != x_step_q4 || 16 != y_step_q4) {
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
@@ -592,7 +592,7 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_x)[1] == 0x800000 &&
((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_avg(src, src_stride, dst, dst_stride,
+ vpx_convolve_avg(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
@@ -632,14 +632,14 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
&filt_hor[3], &filt_ver[3], h);
break;
default:
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
}
} else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) {
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
} else {
@@ -670,7 +670,7 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
filt_hor, filt_ver, h);
break;
default:
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
diff --git a/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
index 85dfd30c7..4d4b3d9e3 100644
--- a/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
int32_t src_stride,
@@ -657,7 +657,7 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
}
}
-void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -665,14 +665,14 @@ void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
int8_t cnt, filt_ver[8];
if (16 != y_step_q4) {
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
}
if (((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_avg(src, src_stride, dst, dst_stride,
+ vpx_convolve_avg(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
@@ -710,7 +710,7 @@ void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
&filt_ver[3], h);
break;
default:
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
@@ -744,7 +744,7 @@ void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
filt_ver, h);
break;
default:
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
diff --git a/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
index f175bf9b6..2bb314d7f 100644
--- a/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
@@ -647,7 +647,7 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
}
}
-void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -655,14 +655,14 @@ void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
int8_t cnt, filt_hor[8];
if (16 != x_step_q4) {
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
}
if (((const int32_t *)filter_x)[1] == 0x800000) {
- vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ vpx_convolve_copy(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
@@ -700,7 +700,7 @@ void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
&filt_hor[3], h);
break;
default:
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
@@ -733,7 +733,7 @@ void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
filt_hor, h);
break;
default:
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
diff --git a/vp9/common/mips/msa/vp9_convolve8_msa.c b/vpx_dsp/mips/vpx_convolve8_msa.c
index b1279d97c..52f2028cd 100644
--- a/vp9/common/mips/msa/vp9_convolve8_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
const uint8_t mc_filt_mask_arr[16 * 3] = {
/* 8 width cases */
@@ -551,7 +551,7 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
}
}
-void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int32_t x_step_q4,
const int16_t *filter_y, int32_t y_step_q4,
@@ -559,7 +559,7 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
int8_t cnt, filt_hor[8], filt_ver[8];
if (16 != x_step_q4 || 16 != y_step_q4) {
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
@@ -567,7 +567,7 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_x)[1] == 0x800000 &&
((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ vpx_convolve_copy(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
@@ -607,14 +607,14 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
&filt_hor[3], &filt_ver[3], (int32_t)h);
break;
default:
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
}
} else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) {
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
} else {
@@ -645,7 +645,7 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
filt_hor, filt_ver, (int32_t)h);
break;
default:
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
diff --git a/vp9/common/mips/msa/vp9_convolve8_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_vert_msa.c
index e9ec2507a..85f175760 100644
--- a/vp9/common/mips/msa/vp9_convolve8_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
@@ -650,7 +650,7 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
}
}
-void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -658,14 +658,14 @@ void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
int8_t cnt, filt_ver[8];
if (16 != y_step_q4) {
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
}
if (((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ vpx_convolve_copy(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
@@ -703,7 +703,7 @@ void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
&filt_ver[3], h);
break;
default:
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
@@ -736,7 +736,7 @@ void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
filt_ver, h);
break;
default:
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
diff --git a/vp9/common/mips/msa/vp9_convolve_avg_msa.c b/vpx_dsp/mips/vpx_convolve_avg_msa.c
index 7c11e4065..4c3d97803 100644
--- a/vp9/common/mips/msa/vp9_convolve_avg_msa.c
+++ b/vpx_dsp/mips/vpx_convolve_avg_msa.c
@@ -186,7 +186,7 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
}
}
-void vp9_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int32_t filter_x_stride,
const int16_t *filter_y, int32_t filter_y_stride,
diff --git a/vp9/common/mips/msa/vp9_convolve_copy_msa.c b/vpx_dsp/mips/vpx_convolve_copy_msa.c
index 39a0b24d5..ba4012281 100644
--- a/vp9/common/mips/msa/vp9_convolve_copy_msa.c
+++ b/vpx_dsp/mips/vpx_convolve_copy_msa.c
@@ -196,7 +196,7 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
}
-void vp9_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int32_t filter_x_stride,
const int16_t *filter_y, int32_t filter_y_stride,
diff --git a/vp9/common/mips/msa/vp9_convolve_msa.h b/vpx_dsp/mips/vpx_convolve_msa.h
index 71c616b67..e0013983a 100644
--- a/vp9/common/mips/msa/vp9_convolve_msa.h
+++ b/vpx_dsp/mips/vpx_convolve_msa.h
@@ -8,11 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
-#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
+#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
-#include "vp9/common/vp9_filter.h"
#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/vpx_filter.h"
extern const uint8_t mc_filt_mask_arr[16 * 3];
@@ -116,4 +116,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
}
-#endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */
+#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
diff --git a/vp9/common/vp9_convolve.c b/vpx_dsp/vpx_convolve.c
index 90e337fd6..f06da3d34 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -9,13 +9,14 @@
*/
#include <assert.h>
+#include <string.h>
#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/vp9_filter.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
@@ -154,7 +155,7 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
return (int)((const InterpKernel *)(intptr_t)f - base);
}
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -169,7 +170,7 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
x0_q4, x_step_q4, w, h);
}
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -184,7 +185,7 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
x0_q4, x_step_q4, w, h);
}
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -199,7 +200,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
y0_q4, y_step_q4, w, h);
}
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -214,7 +215,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
y0_q4, y_step_q4, w, h);
}
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -230,7 +231,7 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
filters_y, y0_q4, y_step_q4, w, h);
}
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -240,12 +241,12 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
assert(w <= 64);
assert(h <= 64);
- vp9_convolve8_c(src, src_stride, temp, 64,
+ vpx_convolve8_c(src, src_stride, temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+ vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
@@ -262,7 +263,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
@@ -423,7 +424,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
}
-void vp9_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -437,7 +438,7 @@ void vp9_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
x0_q4, x_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -451,7 +452,7 @@ void vp9_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
x0_q4, x_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -465,7 +466,7 @@ void vp9_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
y0_q4, y_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -479,7 +480,7 @@ void vp9_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
y0_q4, y_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -495,7 +496,7 @@ void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
filters_y, y0_q4, y_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
@@ -505,13 +506,13 @@ void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
assert(w <= 64);
assert(h <= 64);
- vp9_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+ vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
- vp9_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
NULL, 0, NULL, 0, w, h, bd);
}
-void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
@@ -532,7 +533,7 @@ void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
}
}
-void vp9_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
+void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
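
The filter_x/filter_y arguments to these C routines are not bare tap arrays but pointers into a 16-entry InterpKernel table (16 kernels x 8 taps x 2 bytes = 256 bytes), which is how get_filter_offset above can recover the starting subpel phase by pointer subtraction. A sketch of both halves of the trick, assuming the 256-byte table alignment the code's own comments rely on:

    #include <stdint.h>

    typedef int16_t InterpKernel[8];  /* SUBPEL_TAPS == 8 */

    /* Mask the pointer down to a 256-byte boundary to recover the table
     * base (valid only because the kernel tables are 256-byte aligned). */
    static const InterpKernel *filter_base(const int16_t *filter) {
      return (const InterpKernel *)((intptr_t)filter & ~(intptr_t)0xFF);
    }

    /* Element distance from the base is the initial phase, e.g. x0_q4. */
    static int filter_phase_q4(const int16_t *filter) {
      return (int)((const InterpKernel *)(intptr_t)filter -
                   filter_base(filter));
    }
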
diff --git a/vp9/common/vp9_convolve.h b/vpx_dsp/vpx_convolve.h
index 8b044c897..9ed3f1750 100644
--- a/vp9/common/vp9_convolve.h
+++ b/vpx_dsp/vpx_convolve.h
@@ -7,8 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_VP9_CONVOLVE_H_
-#define VP9_COMMON_VP9_CONVOLVE_H_
+#ifndef VPX_DSP_VPX_CONVOLVE_H_
+#define VPX_DSP_VPX_CONVOLVE_H_
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
@@ -35,4 +35,4 @@ typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
} // extern "C"
#endif
-#endif // VP9_COMMON_VP9_CONVOLVE_H_
+#endif // VPX_DSP_VPX_CONVOLVE_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index ff2bb5236..de4578257 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -54,6 +54,54 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c
endif # CONFIG_VP9
+# interpolation filters
+DSP_SRCS-yes += vpx_convolve.c
+DSP_SRCS-yes += vpx_convolve.h
+DSP_SRCS-yes += vpx_filter.h
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/vpx_subpixel_8t_intrin_avx2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
+endif
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
+endif
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
+
+# common (msa)
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h
+
# loop filters
DSP_SRCS-yes += loopfilter.c
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index 1fd7c153a..ccb818959 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -11,8 +11,6 @@
#ifndef VPX_DSP_COMMON_H_
#define VPX_DSP_COMMON_H_
-#include <stdlib.h>
-
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 85b25229a..6e63271eb 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -34,6 +34,12 @@ if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
}
}
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
+ $avx2_ssse3 = 'avx2';
+}
+
# functions that are 64 bit only.
$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
if ($opts{arch} eq "x86_64") {
@@ -366,6 +372,62 @@ if (vpx_config("CONFIG_VP9") eq "yes") {
} # CONFIG_VP9
#
+# Sub Pixel Filters
+#
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_copy neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_avg neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8 sse2 ssse3 neon msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_horiz sse2 ssse3 neon msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_vert sse2 ssse3 neon msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg sse2 ssse3 neon msa/;
+
+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon msa/;
+
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ #
+ # Sub Pixel Filters
+ #
+ add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve_copy/;
+
+ add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve_avg/;
+
+ add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_vert/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+} # CONFIG_VP9_HIGHBITDEPTH
+
+#
# Loopfilter
#
add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
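
In this file, each add_proto line declares a function family and each specialize line lists the machine-specific versions that exist; the generated vpx_dsp_rtcd.h resolves every name to the best available version once CPU features are probed. A hypothetical C sketch of the shape of that generated dispatch (flag values and the setup name are illustrative, not the generator's literal output):

    #include <stddef.h>
    #include <stdint.h>

    typedef void convolve_fn(const uint8_t *, ptrdiff_t, uint8_t *, ptrdiff_t,
                             const int16_t *, int, const int16_t *, int,
                             int, int);

    /* Implementations named by `specialize qw/vpx_convolve8 sse2 ssse3 .../` */
    extern convolve_fn vpx_convolve8_c, vpx_convolve8_sse2, vpx_convolve8_ssse3;

    convolve_fn *vpx_convolve8 = vpx_convolve8_c;  /* safe default */

    void dsp_rtcd_sketch(int cpu_flags) {
      vpx_convolve8 = vpx_convolve8_c;
      if (cpu_flags & (1 << 0)) vpx_convolve8 = vpx_convolve8_sse2;   /* SSE2 */
      if (cpu_flags & (1 << 1)) vpx_convolve8 = vpx_convolve8_ssse3;  /* SSSE3 */
    }
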
diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h
new file mode 100644
index 000000000..2617febf3
--- /dev/null
+++ b/vpx_dsp/vpx_filter.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VPX_FILTER_H_
+#define VPX_DSP_VPX_FILTER_H_
+
+#include "vpx/vpx_integer.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_DSP_VPX_FILTER_H_
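
These constants define the fixed-point scheme the whole filter path shares: FILTER_BITS = 7 means every kernel's taps sum to 128, and SUBPEL_BITS = 4 tracks positions in 1/16-pel steps, so the low four bits of a q4 position select one of the SUBPEL_SHIFTS = 16 kernels while the high bits select the source pixel. A minimal sketch of one output pixel of a horizontal pass under those conventions (assumes src was pre-adjusted left by SUBPEL_TAPS / 2 - 1, as the C convolve code does):

    #include <stdint.h>

    #define FILTER_BITS 7
    #define SUBPEL_BITS 4
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
    #define SUBPEL_TAPS 8

    typedef int16_t InterpKernel[SUBPEL_TAPS];

    /* x_q4 carries both the integer position (x_q4 >> SUBPEL_BITS) and the
     * kernel index (x_q4 & SUBPEL_MASK); the sum is rounded back down by
     * FILTER_BITS and clipped to 8 bits. */
    static uint8_t filter_pixel(const uint8_t *src, int x_q4,
                                const InterpKernel *kernels) {
      const uint8_t *p = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *k = kernels[x_q4 & SUBPEL_MASK];
      int t, sum = 0;
      for (t = 0; t < SUBPEL_TAPS; ++t) sum += k[t] * p[t];
      sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
      return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
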
diff --git a/vp9/common/x86/convolve.h b/vpx_dsp/x86/convolve.h
index de2df47e5..c0144981b 100644
--- a/vp9/common/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -7,8 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_X86_CONVOLVE_H_
-#define VP9_COMMON_X86_CONVOLVE_H_
+#ifndef VPX_DSP_X86_CONVOLVE_H_
+#define VPX_DSP_X86_CONVOLVE_H_
#include <assert.h>
@@ -26,7 +26,7 @@ typedef void filter8_1dfunction (
);
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
@@ -34,7 +34,7 @@ typedef void filter8_1dfunction (
if (step_q4 == 16 && filter[3] != 128) { \
if (filter[0] || filter[1] || filter[2]) { \
while (w >= 16) { \
- vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
@@ -45,7 +45,7 @@ typedef void filter8_1dfunction (
w -= 16; \
} \
while (w >= 8) { \
- vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
@@ -56,7 +56,7 @@ typedef void filter8_1dfunction (
w -= 8; \
} \
while (w >= 4) { \
- vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
@@ -68,7 +68,7 @@ typedef void filter8_1dfunction (
} \
} else { \
while (w >= 16) { \
- vp9_filter_block1d16_##dir##2_##avg##opt(src, \
+ vpx_filter_block1d16_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
@@ -79,7 +79,7 @@ typedef void filter8_1dfunction (
w -= 16; \
} \
while (w >= 8) { \
- vp9_filter_block1d8_##dir##2_##avg##opt(src, \
+ vpx_filter_block1d8_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
@@ -90,7 +90,7 @@ typedef void filter8_1dfunction (
w -= 8; \
} \
while (w >= 4) { \
- vp9_filter_block1d4_##dir##2_##avg##opt(src, \
+ vpx_filter_block1d4_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
@@ -103,14 +103,14 @@ typedef void filter8_1dfunction (
} \
} \
if (w) { \
- vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ vpx_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h); \
} \
}
#define FUN_CONV_2D(avg, opt) \
-void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
@@ -121,23 +121,23 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
- vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 7); \
- vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} else { \
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
- vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 1); \
- vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} \
} else { \
- vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ vpx_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
} \
}
@@ -155,7 +155,7 @@ typedef void highbd_filter8_1dfunction (
);
#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+ void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
ptrdiff_t src_stride, \
uint8_t *dst8, \
ptrdiff_t dst_stride, \
@@ -169,7 +169,7 @@ typedef void highbd_filter8_1dfunction (
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
if (filter[0] || filter[1] || filter[2]) { \
while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
@@ -181,7 +181,7 @@ typedef void highbd_filter8_1dfunction (
w -= 16; \
} \
while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
@@ -193,7 +193,7 @@ typedef void highbd_filter8_1dfunction (
w -= 8; \
} \
while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
@@ -206,7 +206,7 @@ typedef void highbd_filter8_1dfunction (
} \
} else { \
while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+ vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
@@ -218,7 +218,7 @@ typedef void highbd_filter8_1dfunction (
w -= 16; \
} \
while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+ vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
@@ -230,7 +230,7 @@ typedef void highbd_filter8_1dfunction (
w -= 8; \
} \
while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+ vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
@@ -244,14 +244,14 @@ typedef void highbd_filter8_1dfunction (
} \
} \
if (w) { \
- vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+ vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h, bd); \
} \
}
#define HIGH_FUN_CONV_2D(avg, opt) \
-void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
@@ -262,35 +262,35 @@ void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h + 7, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
+ vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
64, dst, dst_stride, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h, bd); \
} else { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
+ vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h + 1, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+ vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
dst, dst_stride, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h, bd); \
} \
} else { \
- vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, \
h, bd); \
} \
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-#endif // VP9_COMMON_X86_CONVOLVE_H_
+#endif // VPX_DSP_X86_CONVOLVE_H_
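
FUN_CONV_1D and FUN_CONV_2D stamp out one wrapper per ISA: the width is consumed greedily in 16-, 8- and 4-column strips by the vpx_filter_block1d* primitives, and whatever remains (plus the scaling and identity-kernel cases the guard at the top rejects) falls back to the portable C routine. A simplified skeleton of what FUN_CONV_1D(horiz, ...) expands to for sse2 — the 8-tap/2-tap split is collapsed and the strip-filter calls are elided into comments, so this is a shape sketch, not the macro's literal output:

    #include <stddef.h>
    #include <stdint.h>

    extern void vpx_convolve8_horiz_c(const uint8_t *, ptrdiff_t,
                                      uint8_t *, ptrdiff_t,
                                      const int16_t *, int,
                                      const int16_t *, int, int, int);

    /* Fast paths run only with no scaling (x_step_q4 == 16) and a
     * non-identity kernel (filter_x[3] != 128); leftover columns go to C. */
    void convolve8_horiz_sse2_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h) {
      if (x_step_q4 == 16 && filter_x[3] != 128) {
        while (w >= 16) { /* vpx_filter_block1d16_h8_sse2(...) */
          src += 16; dst += 16; w -= 16;
        }
        while (w >= 8) {  /* vpx_filter_block1d8_h8_sse2(...) */
          src += 8; dst += 8; w -= 8;
        }
        while (w >= 4) {  /* vpx_filter_block1d4_h8_sse2(...) */
          src += 4; dst += 4; w -= 4;
        }
      }
      if (w)
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    }
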
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c
index fd55fb8c6..422b0fc42 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vpx_dsp/x86/vpx_asm_stubs.c
@@ -8,53 +8,53 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
#include "./vpx_config.h"
-#include "vp9/common/x86/convolve.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
#if HAVE_SSE2
-filter8_1dfunction vp9_filter_block1d16_v8_sse2;
-filter8_1dfunction vp9_filter_block1d16_h8_sse2;
-filter8_1dfunction vp9_filter_block1d8_v8_sse2;
-filter8_1dfunction vp9_filter_block1d8_h8_sse2;
-filter8_1dfunction vp9_filter_block1d4_v8_sse2;
-filter8_1dfunction vp9_filter_block1d4_h8_sse2;
-filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_v2_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_sse2;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
-// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
@@ -64,12 +64,12 @@ FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
@@ -78,33 +78,33 @@ FUN_CONV_2D(, sse2);
FUN_CONV_2D(avg_ , sse2);
#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
-// void vp9_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
@@ -113,7 +113,7 @@ highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2;
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-// void vp9_highbd_convolve8_vert_sse2(const uint8_t *src,
+// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
@@ -122,7 +122,7 @@ highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2;
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-// void vp9_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
@@ -131,7 +131,7 @@ highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2;
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-// void vp9_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
@@ -146,12 +146,12 @@ HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
sse2);
-// void vp9_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h, int bd);
-// void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
diff --git a/vp9/common/x86/vp9_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index b26383708..6cd620a59 100644
--- a/vp9/common/x86/vp9_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
+%define program_name vpx
+
%include "third_party/x86inc/x86inc.asm"
SECTION .text
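The new program_name define is what retargets the symbol prefix: x86inc.asm mangles the names it emits with this prefix, so overriding it before the include keeps the asm entry points in step with the renamed C-side declarations. A hedged illustration of the intent (the exact mangling macro lives in third_party/x86inc/x86inc.asm):

    %define program_name vpx
    %include "third_party/x86inc/x86inc.asm"
    ; a symbol declared via cglobal, e.g. cglobal convolve_copy, is now
    ; emitted as vpx_convolve_copy rather than the library's old default
    ; vp9_convolve_copy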
diff --git a/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
index 29ec151ed..bfc816f23 100644
--- a/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm
+++ b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
@@ -197,7 +197,7 @@
movdqu [rdi + %2], xmm0
%endm
-;void vp9_filter_block1d4_v8_sse2
+;void vpx_highbd_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
@@ -206,8 +206,8 @@
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d4_v8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v8_sse2):
+global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -267,7 +267,7 @@ sym(vp9_highbd_filter_block1d4_v8_sse2):
pop rbp
ret
-;void vp9_filter_block1d8_v8_sse2
+;void vpx_highbd_filter_block1d8_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
@@ -276,8 +276,8 @@ sym(vp9_highbd_filter_block1d4_v8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d8_v8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v8_sse2):
+global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -326,7 +326,7 @@ sym(vp9_highbd_filter_block1d8_v8_sse2):
pop rbp
ret
-;void vp9_filter_block1d16_v8_sse2
+;void vpx_highbd_filter_block1d16_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
@@ -335,8 +335,8 @@ sym(vp9_highbd_filter_block1d8_v8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d16_v8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v8_sse2):
+global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -389,8 +389,8 @@ sym(vp9_highbd_filter_block1d16_v8_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
+global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -450,8 +450,8 @@ sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
+global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -499,8 +499,8 @@ sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
+global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -552,7 +552,7 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
pop rbp
ret
-;void vp9_filter_block1d4_h8_sse2
+;void vpx_highbd_filter_block1d4_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
@@ -561,8 +561,8 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d4_h8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h8_sse2):
+global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -627,7 +627,7 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
pop rbp
ret
-;void vp9_filter_block1d8_h8_sse2
+;void vpx_highbd_filter_block1d8_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
@@ -636,8 +636,8 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d8_h8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h8_sse2):
+global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -693,7 +693,7 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
pop rbp
ret
-;void vp9_filter_block1d16_h8_sse2
+;void vpx_highbd_filter_block1d16_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
@@ -702,8 +702,8 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d16_h8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h8_sse2):
+global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -770,8 +770,8 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
+global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -836,8 +836,8 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
+global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -893,8 +893,8 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
+global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
diff --git a/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm b/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
index 93784121c..72f2ff71d 100644
--- a/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm
+++ b/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
@@ -171,8 +171,8 @@
%endm
%endif
-global sym(vp9_highbd_filter_block1d4_v2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v2_sse2):
+global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -196,8 +196,8 @@ sym(vp9_highbd_filter_block1d4_v2_sse2):
ret
%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_v2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v2_sse2):
+global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -222,8 +222,8 @@ sym(vp9_highbd_filter_block1d8_v2_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_v2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v2_sse2):
+global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -251,8 +251,8 @@ sym(vp9_highbd_filter_block1d16_v2_sse2):
ret
%endif
-global sym(vp9_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v2_avg_sse2):
+global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -276,8 +276,8 @@ sym(vp9_highbd_filter_block1d4_v2_avg_sse2):
ret
%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v2_avg_sse2):
+global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -302,8 +302,8 @@ sym(vp9_highbd_filter_block1d8_v2_avg_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v2_avg_sse2):
+global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -331,8 +331,8 @@ sym(vp9_highbd_filter_block1d16_v2_avg_sse2):
ret
%endif
-global sym(vp9_highbd_filter_block1d4_h2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h2_sse2):
+global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -357,8 +357,8 @@ sym(vp9_highbd_filter_block1d4_h2_sse2):
ret
%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_h2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h2_sse2):
+global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -383,8 +383,8 @@ sym(vp9_highbd_filter_block1d8_h2_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_h2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h2_sse2):
+global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -412,8 +412,8 @@ sym(vp9_highbd_filter_block1d16_h2_sse2):
ret
%endif
-global sym(vp9_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h2_avg_sse2):
+global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -438,8 +438,8 @@ sym(vp9_highbd_filter_block1d4_h2_avg_sse2):
ret
%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h2_avg_sse2):
+global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -464,8 +464,8 @@ sym(vp9_highbd_filter_block1d8_h2_avg_sse2):
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h2_avg_sse2):
+global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index cee8d1e76..29ede19f7 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -11,11 +11,11 @@
// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// immintrin.h.
-#include "./vp9_rtcd.h"
#include <immintrin.h>
-#include "vp9/common/x86/convolve.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
#include "vpx_ports/mem.h"
// filters for 16_h8 and 16_v8
@@ -60,7 +60,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // __clang__
-static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
+static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
@@ -304,7 +304,7 @@ static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
}
}
-static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
ptrdiff_t src_pitch,
uint8_t *output_ptr,
ptrdiff_t out_pitch,
@@ -551,41 +551,41 @@ static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
}
#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif // ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
-#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
-#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
-#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
-#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
-#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
-// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
+#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
+#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
+#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
+#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
+#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
+#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
@@ -593,7 +593,7 @@ filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 5fd2857e1..01771dec9 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -11,11 +11,11 @@
// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// tmmintrin.h.
-#include "./vp9_rtcd.h"
#include <tmmintrin.h>
-#include "vp9/common/x86/convolve.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
@@ -46,11 +46,11 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
};
// These are reused by the avx2 intrinsics.
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
+void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
@@ -121,7 +121,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
}
}
-void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
+void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
@@ -201,7 +201,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
}
}
-static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
+static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
@@ -318,7 +318,7 @@ static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
}
}
-void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
+void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pitch,
uint8_t *output_ptr,
ptrdiff_t out_pitch,
@@ -406,7 +406,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
}
}
-static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
+static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pitch,
uint8_t *output_ptr,
ptrdiff_t out_pitch,
@@ -522,61 +522,61 @@ static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
}
#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
-#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
-#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
+#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
+#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
+#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
#endif // ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
-
-// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+
+// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
@@ -587,12 +587,12 @@ FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
ssse3);
-// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
diff --git a/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
index 9dc8d0abb..08f3d6a6c 100644
--- a/vp9/common/x86/vp9_subpixel_8t_sse2.asm
+++ b/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
@@ -176,7 +176,7 @@
movq [rdi + %2], xmm0
%endm
-;void vp9_filter_block1d4_v8_sse2
+;void vpx_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
@@ -185,8 +185,8 @@
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
-sym(vp9_filter_block1d4_v8_sse2):
+global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -243,7 +243,7 @@ sym(vp9_filter_block1d4_v8_sse2):
pop rbp
ret
-;void vp9_filter_block1d8_v8_sse2
+;void vpx_filter_block1d8_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
@@ -252,8 +252,8 @@ sym(vp9_filter_block1d4_v8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
-sym(vp9_filter_block1d8_v8_sse2):
+global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -302,7 +302,7 @@ sym(vp9_filter_block1d8_v8_sse2):
pop rbp
ret
-;void vp9_filter_block1d16_v8_sse2
+;void vpx_filter_block1d16_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
@@ -311,8 +311,8 @@ sym(vp9_filter_block1d8_v8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
-sym(vp9_filter_block1d16_v8_sse2):
+global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -365,8 +365,8 @@ sym(vp9_filter_block1d16_v8_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_v8_avg_sse2):
+global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -423,8 +423,8 @@ sym(vp9_filter_block1d4_v8_avg_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_v8_avg_sse2):
+global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -472,8 +472,8 @@ sym(vp9_filter_block1d8_v8_avg_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_v8_avg_sse2):
+global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -525,7 +525,7 @@ sym(vp9_filter_block1d16_v8_avg_sse2):
pop rbp
ret
-;void vp9_filter_block1d4_h8_sse2
+;void vpx_filter_block1d4_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
@@ -534,8 +534,8 @@ sym(vp9_filter_block1d16_v8_avg_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
-sym(vp9_filter_block1d4_h8_sse2):
+global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -599,7 +599,7 @@ sym(vp9_filter_block1d4_h8_sse2):
pop rbp
ret
-;void vp9_filter_block1d8_h8_sse2
+;void vpx_filter_block1d8_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
@@ -608,8 +608,8 @@ sym(vp9_filter_block1d4_h8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
-sym(vp9_filter_block1d8_h8_sse2):
+global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -674,7 +674,7 @@ sym(vp9_filter_block1d8_h8_sse2):
pop rbp
ret
-;void vp9_filter_block1d16_h8_sse2
+;void vpx_filter_block1d16_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
@@ -683,8 +683,8 @@ sym(vp9_filter_block1d8_h8_sse2):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
-sym(vp9_filter_block1d16_h8_sse2):
+global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -769,8 +769,8 @@ sym(vp9_filter_block1d16_h8_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_h8_avg_sse2):
+global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -834,8 +834,8 @@ sym(vp9_filter_block1d4_h8_avg_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_h8_avg_sse2):
+global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -900,8 +900,8 @@ sym(vp9_filter_block1d8_h8_avg_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_h8_avg_sse2):
+global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index 4a5bf1b60..68acc03ce 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -310,7 +310,7 @@
jnz .loop
%endm
-;void vp9_filter_block1d8_v8_ssse3
+;void vpx_filter_block1d4_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
@@ -319,8 +319,8 @@
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v8_ssse3):
+global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -351,7 +351,7 @@ sym(vp9_filter_block1d4_v8_ssse3):
pop rbp
ret
-;void vp9_filter_block1d8_v8_ssse3
+;void vpx_filter_block1d8_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
@@ -360,8 +360,8 @@ sym(vp9_filter_block1d4_v8_ssse3):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_ssse3):
+global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -392,7 +392,7 @@ sym(vp9_filter_block1d8_v8_ssse3):
pop rbp
ret
-;void vp9_filter_block1d16_v8_ssse3
+;void vpx_filter_block1d16_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
@@ -401,8 +401,8 @@ sym(vp9_filter_block1d8_v8_ssse3):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_ssse3):
+global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -436,8 +436,8 @@ sym(vp9_filter_block1d16_v8_ssse3):
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v8_avg_ssse3):
+global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -468,8 +468,8 @@ sym(vp9_filter_block1d4_v8_avg_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_avg_ssse3):
+global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -500,8 +500,8 @@ sym(vp9_filter_block1d8_v8_avg_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_avg_ssse3):
+global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -838,7 +838,7 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
jnz .loop
%endm
-;void vp9_filter_block1d4_h8_ssse3
+;void vpx_filter_block1d4_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
@@ -847,8 +847,8 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h8_ssse3):
+global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -877,7 +877,7 @@ sym(vp9_filter_block1d4_h8_ssse3):
pop rbp
ret
-;void vp9_filter_block1d8_h8_ssse3
+;void vpx_filter_block1d8_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
@@ -886,8 +886,8 @@ sym(vp9_filter_block1d4_h8_ssse3):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_ssse3):
+global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -919,7 +919,7 @@ sym(vp9_filter_block1d8_h8_ssse3):
pop rbp
ret
-;void vp9_filter_block1d16_h8_ssse3
+;void vpx_filter_block1d16_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
@@ -928,8 +928,8 @@ sym(vp9_filter_block1d8_h8_ssse3):
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_ssse3):
+global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -961,8 +961,8 @@ sym(vp9_filter_block1d16_h8_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h8_avg_ssse3):
+global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -991,8 +991,8 @@ sym(vp9_filter_block1d4_h8_avg_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_avg_ssse3):
+global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -1024,8 +1024,8 @@ sym(vp9_filter_block1d8_h8_avg_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_avg_ssse3):
+global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
diff --git a/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm b/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
index d94ccf2e9..a378dd040 100644
--- a/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
+++ b/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
@@ -131,8 +131,8 @@
dec rcx
%endm
-global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
-sym(vp9_filter_block1d4_v2_sse2):
+global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -155,8 +155,8 @@ sym(vp9_filter_block1d4_v2_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
-sym(vp9_filter_block1d8_v2_sse2):
+global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -181,8 +181,8 @@ sym(vp9_filter_block1d8_v2_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
-sym(vp9_filter_block1d16_v2_sse2):
+global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -209,8 +209,8 @@ sym(vp9_filter_block1d16_v2_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_v2_avg_sse2):
+global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -233,8 +233,8 @@ sym(vp9_filter_block1d4_v2_avg_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_v2_avg_sse2):
+global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -259,8 +259,8 @@ sym(vp9_filter_block1d8_v2_avg_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_v2_avg_sse2):
+global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -287,8 +287,8 @@ sym(vp9_filter_block1d16_v2_avg_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
-sym(vp9_filter_block1d4_h2_sse2):
+global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -312,8 +312,8 @@ sym(vp9_filter_block1d4_h2_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
-sym(vp9_filter_block1d8_h2_sse2):
+global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -339,8 +339,8 @@ sym(vp9_filter_block1d8_h2_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
-sym(vp9_filter_block1d16_h2_sse2):
+global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -367,8 +367,8 @@ sym(vp9_filter_block1d16_h2_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_h2_avg_sse2):
+global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -392,8 +392,8 @@ sym(vp9_filter_block1d4_h2_avg_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_h2_avg_sse2):
+global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -419,8 +419,8 @@ sym(vp9_filter_block1d8_h2_avg_sse2):
pop rbp
ret
-global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_h2_avg_sse2):
+global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
diff --git a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
index b5e18fe6d..3c8cfd225 100644
--- a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -109,8 +109,8 @@
dec rcx
%endm
-global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v2_ssse3):
+global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -133,8 +133,8 @@ sym(vp9_filter_block1d4_v2_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v2_ssse3):
+global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -159,8 +159,8 @@ sym(vp9_filter_block1d8_v2_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v2_ssse3):
+global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -186,8 +186,8 @@ sym(vp9_filter_block1d16_v2_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v2_avg_ssse3):
+global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -210,8 +210,8 @@ sym(vp9_filter_block1d4_v2_avg_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v2_avg_ssse3):
+global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -236,8 +236,8 @@ sym(vp9_filter_block1d8_v2_avg_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v2_avg_ssse3):
+global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -263,8 +263,8 @@ sym(vp9_filter_block1d16_v2_avg_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h2_ssse3):
+global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -288,8 +288,8 @@ sym(vp9_filter_block1d4_h2_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h2_ssse3):
+global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -315,8 +315,8 @@ sym(vp9_filter_block1d8_h2_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h2_ssse3):
+global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -342,8 +342,8 @@ sym(vp9_filter_block1d16_h2_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h2_avg_ssse3):
+global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -367,8 +367,8 @@ sym(vp9_filter_block1d4_h2_avg_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h2_avg_ssse3):
+global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -394,8 +394,8 @@ sym(vp9_filter_block1d8_h2_avg_ssse3):
pop rbp
ret
-global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h2_avg_ssse3):
+global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6