39 files changed, 1157 insertions, 287 deletions
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index ebdabc9b2..05c72df3f 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -247,35 +247,35 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf);
 void vp8_remove_compressor(struct VP8_COMP **comp);
 
 void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf);
-void vp8_change_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf);
+void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf);
 
-int vp8_receive_raw_frame(struct VP8_COMP *comp, unsigned int frame_flags,
+int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                          int64_t end_time_stamp);
-int vp8_get_compressed_data(struct VP8_COMP *comp, unsigned int *frame_flags,
+                          int64_t end_time);
+int vp8_get_compressed_data(struct VP8_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, unsigned char *dest,
                             unsigned char *dest_end, int64_t *time_stamp,
                             int64_t *time_end, int flush);
-int vp8_get_preview_raw_frame(struct VP8_COMP *comp, YV12_BUFFER_CONFIG *dest,
+int vp8_get_preview_raw_frame(struct VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest,
                               vp8_ppflags_t *flags);
 
-int vp8_use_as_reference(struct VP8_COMP *comp, int ref_frame_flags);
-int vp8_update_reference(struct VP8_COMP *comp, int ref_frame_flags);
-int vp8_get_reference(struct VP8_COMP *comp,
+int vp8_use_as_reference(struct VP8_COMP *cpi, int ref_frame_flags);
+int vp8_update_reference(struct VP8_COMP *cpi, int ref_frame_flags);
+int vp8_get_reference(struct VP8_COMP *cpi,
                       enum vpx_ref_frame_type ref_frame_flag,
                       YV12_BUFFER_CONFIG *sd);
-int vp8_set_reference(struct VP8_COMP *comp,
+int vp8_set_reference(struct VP8_COMP *cpi,
                       enum vpx_ref_frame_type ref_frame_flag,
                       YV12_BUFFER_CONFIG *sd);
-int vp8_update_entropy(struct VP8_COMP *comp, int update);
-int vp8_set_roimap(struct VP8_COMP *comp, unsigned char *map, unsigned int rows,
+int vp8_update_entropy(struct VP8_COMP *cpi, int update);
+int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows,
                    unsigned int cols, int delta_q[4], int delta_lf[4],
                    unsigned int threshold[4]);
-int vp8_set_active_map(struct VP8_COMP *comp, unsigned char *map,
+int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map,
                        unsigned int rows, unsigned int cols);
-int vp8_set_internal_size(struct VP8_COMP *comp, VPX_SCALING horiz_mode,
+int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode,
                           VPX_SCALING vert_mode);
-int vp8_get_quantizer(struct VP8_COMP *c);
+int vp8_get_quantizer(struct VP8_COMP *cpi);
 
 #ifdef __cplusplus
 }
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 24636b91c..235c77e38 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -223,7 +223,7 @@ specialize qw/vp8_full_search_sad sse3 sse4_1/;
 $vp8_full_search_sad_sse3=vp8_full_search_sadx3;
 $vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
 
-add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
 specialize qw/vp8_refining_search_sad sse2 msa/;
 $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4;
 $vp8_refining_search_sad_msa=vp8_refining_search_sadx4;
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
index 2cf62def1..ba37cc01d 100644
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -35,11 +35,11 @@ typedef struct {
   struct vpx_internal_error_info *error;
 } BOOL_CODER;
 
-extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer,
-                             unsigned char *buffer_end);
+void vp8_start_encode(BOOL_CODER *br, unsigned char *source,
+                      unsigned char *source_end);
 
-extern void vp8_encode_value(BOOL_CODER *br, int data, int bits);
-extern void vp8_stop_encode(BOOL_CODER *bc);
+void vp8_encode_value(BOOL_CODER *br, int data, int bits);
+void vp8_stop_encode(BOOL_CODER *br);
 extern const unsigned int vp8_prob_cost[256];
 
 DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index eb963b97e..e54d1e9f4 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -213,13 +213,12 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
   return FILTER_BLOCK;
 }
 
-int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
-                             int mc_avg_uv_stride,
-                             unsigned char *running_avg_uv, int avg_uv_stride,
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride,
+                             unsigned char *running_avg, int avg_stride,
                              unsigned char *sig, int sig_stride,
                              unsigned int motion_magnitude,
                              int increase_denoising) {
-  unsigned char *running_avg_uv_start = running_avg_uv;
+  unsigned char *running_avg_start = running_avg;
   unsigned char *sig_start = sig;
   int sum_diff_thresh;
   int r, c;
@@ -259,13 +258,13 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
       int adjustment = 0;
       int absdiff = 0;
 
-      diff = mc_running_avg_uv[c] - sig[c];
+      diff = mc_running_avg[c] - sig[c];
       absdiff = abs(diff);
 
       // When |diff| <= |3 + shift_inc1|, use pixel value from
       // last denoised raw.
       if (absdiff <= 3 + shift_inc1) {
-        running_avg_uv[c] = mc_running_avg_uv[c];
+        running_avg[c] = mc_running_avg[c];
         sum_diff += diff;
       } else {
         if (absdiff >= 4 && absdiff <= 7) {
@@ -277,16 +276,16 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
         }
         if (diff > 0) {
           if ((sig[c] + adjustment) > 255) {
-            running_avg_uv[c] = 255;
+            running_avg[c] = 255;
           } else {
-            running_avg_uv[c] = sig[c] + adjustment;
+            running_avg[c] = sig[c] + adjustment;
           }
           sum_diff += adjustment;
         } else {
           if ((sig[c] - adjustment) < 0) {
-            running_avg_uv[c] = 0;
+            running_avg[c] = 0;
           } else {
-            running_avg_uv[c] = sig[c] - adjustment;
+            running_avg[c] = sig[c] - adjustment;
           }
           sum_diff -= adjustment;
         }
@@ -294,8 +293,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     }
     /* Update pointers for next iteration. */
     sig += sig_stride;
-    mc_running_avg_uv += mc_avg_uv_stride;
-    running_avg_uv += avg_uv_stride;
+    mc_running_avg += mc_avg_stride;
+    running_avg += avg_stride;
   }
 
   sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
@@ -314,27 +313,27 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     // Only apply the adjustment for max delta up to 3.
     if (delta < 4) {
       sig -= sig_stride * 8;
-      mc_running_avg_uv -= mc_avg_uv_stride * 8;
-      running_avg_uv -= avg_uv_stride * 8;
+      mc_running_avg -= mc_avg_stride * 8;
+      running_avg -= avg_stride * 8;
       for (r = 0; r < 8; ++r) {
         for (c = 0; c < 8; ++c) {
-          int diff = mc_running_avg_uv[c] - sig[c];
+          int diff = mc_running_avg[c] - sig[c];
           int adjustment = abs(diff);
           if (adjustment > delta) adjustment = delta;
           if (diff > 0) {
             // Bring denoised signal down.
-            if (running_avg_uv[c] - adjustment < 0) {
-              running_avg_uv[c] = 0;
+            if (running_avg[c] - adjustment < 0) {
+              running_avg[c] = 0;
             } else {
-              running_avg_uv[c] = running_avg_uv[c] - adjustment;
+              running_avg[c] = running_avg[c] - adjustment;
             }
             sum_diff -= adjustment;
           } else if (diff < 0) {
             // Bring denoised signal up.
-            if (running_avg_uv[c] + adjustment > 255) {
-              running_avg_uv[c] = 255;
+            if (running_avg[c] + adjustment > 255) {
+              running_avg[c] = 255;
             } else {
-              running_avg_uv[c] = running_avg_uv[c] + adjustment;
+              running_avg[c] = running_avg[c] + adjustment;
             }
             sum_diff += adjustment;
           }
@@ -342,8 +341,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
         // TODO(marpan): Check here if abs(sum_diff) has gone below the
         // threshold sum_diff_thresh, and if so, we can exit the row loop.
         sig += sig_stride;
-        mc_running_avg_uv += mc_avg_uv_stride;
-        running_avg_uv += avg_uv_stride;
+        mc_running_avg += mc_avg_stride;
+        running_avg += avg_stride;
       }
       if (abs(sum_diff) > sum_diff_thresh) return COPY_BLOCK;
     } else {
@@ -351,7 +350,7 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     }
   }
 
-  vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start, sig_stride);
+  vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
   return FILTER_BLOCK;
 }
 
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 490b0b872..397f872e2 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -19,8 +19,8 @@ extern "C" {
 #endif
 
 #ifdef VP8_ENTROPY_STATS
-extern void init_mv_ref_counts();
-extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+void init_mv_ref_counts();
+void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
 #endif
 
 /* The maximum number of steps in a step search given the largest allowed
@@ -34,15 +34,15 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
 /* Maximum size of the first step in full pel units */
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
 
-extern void print_mode_context(void);
-extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight);
-extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
-extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride);
+void print_mode_context(void);
+int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight);
+void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+void vp8_init3smotion_compensation(MACROBLOCK *x, int stride);
 
-extern int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int_mv *best_mv, int search_param, int error_per_bit,
-                          const vp8_variance_fn_ptr_t *vf, int *mvsadcost[2],
-                          int *mvcost[2], int_mv *center_mv);
+int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                   int_mv *best_mv, int search_param, int sad_per_bit,
+                   const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2],
+                   int *mvcost[2], int_mv *center_mv);
 
 typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                    int_mv *bestmv, int_mv *ref_mv,
@@ -51,10 +51,10 @@ typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                    int *mvcost[2], int *distortion,
                                    unsigned int *sse);
 
-extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
-extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
-extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
-extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
+fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
+fractional_mv_step_fp vp8_find_best_sub_pixel_step;
+fractional_mv_step_fp vp8_find_best_half_pixel_step;
+fractional_mv_step_fp vp8_skip_fractional_mv_step;
 
 typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                     int_mv *ref_mv, int sad_per_bit,
diff --git a/vp8/encoder/modecosts.h b/vp8/encoder/modecosts.h
index 422a79b36..09ee2b552 100644
--- a/vp8/encoder/modecosts.h
+++ b/vp8/encoder/modecosts.h
@@ -17,7 +17,7 @@ extern "C" {
 
 struct VP8_COMP;
 
-void vp8_init_mode_costs(struct VP8_COMP *x);
+void vp8_init_mode_costs(struct VP8_COMP *c);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 2da940199..ddd588294 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2106,8 +2106,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
   return cpi;
 }
 
-void vp8_remove_compressor(VP8_COMP **ptr) {
-  VP8_COMP *cpi = *ptr;
+void vp8_remove_compressor(VP8_COMP **comp) {
+  VP8_COMP *cpi = *comp;
 
   if (!cpi) return;
 
@@ -2326,7 +2326,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
 
   vp8_remove_common(&cpi->common);
   vpx_free(cpi);
-  *ptr = 0;
+  *comp = 0;
 
 #ifdef OUTPUT_YUV_SRC
   fclose(yuv_file);
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 1bb54fc2b..6bb3cacc5 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -173,9 +173,8 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b) {
 
 static int pick_intra4x4block(MACROBLOCK *x, int ib,
                               B_PREDICTION_MODE *best_mode,
-                              const int *mode_costs,
-
-                              int *bestrate, int *bestdistortion) {
+                              const int *mode_costs, int *bestrate,
+                              int *bestdistortion) {
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
   int dst_stride = x->e_mbd.dst.y_stride;
@@ -1303,9 +1302,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   update_mvcount(x, &best_ref_mv);
 }
 
-void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) {
+void vp8_pick_intra_mode(MACROBLOCK *x, int *rate) {
   int error4x4, error16x16 = INT_MAX;
-  int rate, best_rate = 0, distortion, best_sse;
+  int rate_, best_rate = 0, distortion, best_sse;
   MB_PREDICTION_MODE mode, best_mode = DC_PRED;
   int this_rd;
   unsigned int sse;
@@ -1323,23 +1322,23 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) {
                                      xd->predictor, 16);
     distortion = vpx_variance16x16(*(b->base_src), b->src_stride, xd->predictor,
                                    16, &sse);
-    rate = x->mbmode_cost[xd->frame_type][mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+    rate_ = x->mbmode_cost[xd->frame_type][mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate_, distortion);
 
     if (error16x16 > this_rd) {
       error16x16 = this_rd;
       best_mode = mode;
       best_sse = sse;
-      best_rate = rate;
+      best_rate = rate_;
     }
   }
   xd->mode_info_context->mbmi.mode = best_mode;
 
-  error4x4 = pick_intra4x4mby_modes(x, &rate, &best_sse);
+  error4x4 = pick_intra4x4mby_modes(x, &rate_, &best_sse);
   if (error4x4 < error16x16) {
     xd->mode_info_context->mbmi.mode = B_PRED;
-    best_rate = rate;
+    best_rate = rate_;
   }
 
-  *rate_ = best_rate;
+  *rate = best_rate;
 }
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index b4182c5cd..679d66bbf 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2358,11 +2358,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   rd_update_mvcount(x, &best_ref_mv);
 }
 
-void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) {
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate) {
   int error4x4, error16x16;
   int rate4x4, rate16x16 = 0, rateuv;
   int dist4x4, dist16x16, distuv;
-  int rate;
+  int rate_;
   int rate4x4_tokenonly = 0;
   int rate16x16_tokenonly = 0;
   int rateuv_tokenonly = 0;
@@ -2370,7 +2370,7 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) {
   x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
   rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv);
-  rate = rateuv;
+  rate_ = rateuv;
 
   error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly,
                                           &dist16x16);
@@ -2380,10 +2380,10 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) {
 
   if (error4x4 < error16x16) {
     x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
-    rate += rate4x4;
+    rate_ += rate4x4;
   } else {
-    rate += rate16x16;
+    rate_ += rate16x16;
   }
 
-  *rate_ = rate;
+  *rate = rate_;
 }
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index e22b58b8a..cc3db8197 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -63,12 +63,12 @@ static INLINE void insertsortsad(int arr[], int idx[], int len) {
   }
 }
 
-extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue);
-extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x,
-                                   int recon_yoffset, int recon_uvoffset,
-                                   int *returnrate, int *returndistortion,
-                                   int *returnintra, int mb_row, int mb_col);
-extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);
+void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue);
+void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+                            int recon_uvoffset, int *returnrate,
+                            int *returndistortion, int *returnintra, int mb_row,
+                            int mb_col);
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);
 
 static INLINE void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
                                       unsigned char *plane[3],
@@ -110,9 +110,9 @@ static INLINE void get_reference_search_order(const VP8_COMP *cpi,
   for (; i < 4; ++i) ref_frame_map[i] = -1;
 }
 
-extern void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
-                        int_mv *mvp, int refframe, int *ref_frame_sign_bias,
-                        int *sr, int near_sadidx[]);
+void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
+                 int_mv *mvp, int refframe, int *ref_frame_sign_bias, int *sr,
+                 int near_sadidx[]);
 void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x,
                  int recon_yoffset, int near_sadidx[]);
 int VP8_UVSSE(MACROBLOCK *x);
diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h
index 0d7b06e56..c02683a58 100644
--- a/vp8/encoder/treewriter.h
+++ b/vp8/encoder/treewriter.h
@@ -91,9 +91,9 @@ static INLINE int vp8_cost_token(vp8_tree t, const vp8_prob *const p,
 
 /* Fill array of costs for all possible token values. */
 
-void vp8_cost_tokens(int *Costs, const vp8_prob *, vp8_tree);
+void vp8_cost_tokens(int *c, const vp8_prob *, vp8_tree);
 
-void vp8_cost_tokens2(int *Costs, const vp8_prob *, vp8_tree, int);
+void vp8_cost_tokens2(int *c, const vp8_prob *, vp8_tree, int);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index dcc8e2998..ee9d37973 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -25,7 +25,7 @@ struct VP9Common;
 
 void vp9_init_mv_probs(struct VP9Common *cm);
 
-void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
+void vp9_adapt_mv_probs(struct VP9Common *cm, int allow_hp);
 
 static INLINE int use_mv_hp(const MV *ref) {
   const int kMvRefThresh = 64;  // threshold for use of high-precision 1/8 mv
@@ -127,7 +127,7 @@ typedef struct {
   nmv_component_counts comps[2];
 } nmv_context_counts;
 
-void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);
+void vp9_inc_mv(const MV *mv, nmv_context_counts *counts);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index da9180b71..04a93f3b9 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -880,12 +880,12 @@ void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
 void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
-                    MODE_INFO **mi, const int mode_info_stride,
+                    MODE_INFO **mi8x8, const int mode_info_stride,
                     LOOP_FILTER_MASK *lfm) {
   int idx_32, idx_16, idx_8;
   const loop_filter_info_n *const lfi_n = &cm->lf_info;
-  MODE_INFO **mip = mi;
-  MODE_INFO **mip2 = mi;
+  MODE_INFO **mip = mi8x8;
+  MODE_INFO **mip2 = mi8x8;
 
   // These are offsets to the next mi in the 64x64 block. It is what gets
   // added to the mi ptr as we go through each loop. It helps us to avoid
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index daf3b9131..39648a72c 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -97,7 +97,7 @@ struct VP9LfSyncData;
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
 void vp9_setup_mask(struct VP9Common *const cm, const int mi_row,
-                    const int mi_col, MODE_INFO **mi_8x8,
+                    const int mi_col, MODE_INFO **mi8x8,
                     const int mode_info_stride, LOOP_FILTER_MASK *lfm);
 
 void vp9_filter_block_plane_ss00(struct VP9Common *const cm,
@@ -120,7 +120,7 @@ void vp9_loop_filter_init(struct VP9Common *cm);
 void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl);
 
 void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm,
-                           struct macroblockd *mbd, int filter_level,
+                           struct macroblockd *xd, int frame_filter_level,
                            int y_only, int partial_frame);
 
 // Get the superblock lfm for a given mi_row, mi_col.
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index 0aafa72ca..67efc1b4e 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -38,7 +38,7 @@ struct VP9Common;
 #define MFQE_PRECISION 4
 
 int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
-                        vp9_ppflags_t *flags, int unscaled_width);
+                        vp9_ppflags_t *ppflags, int unscaled_width);
 
 void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
                  uint8_t *limits);
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 2c6d6695a..992e30c34 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -61,15 +61,15 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize);
 
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, const MV *mv_q3,
+                               int dst_stride, const MV *src_mv,
                                const struct scale_factors *sf, int w, int h,
-                               int do_avg, const InterpKernel *kernel,
+                               int ref, const InterpKernel *kernel,
                                enum mv_precision precision, int x, int y);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_build_inter_predictor(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
-    const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
+    const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
     const InterpKernel *kernel, enum mv_precision precision, int x, int y,
     int bd);
 #endif
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 6d7f95260..d7ad2b693 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -62,7 +62,7 @@ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, i
 
 add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
 
-add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
 
 if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
   # Note that there are more specializations appended when
@@ -100,7 +100,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
   add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
 
-  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
+  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
 
   if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
     specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h
index 53c6eef72..aaafdf867 100644
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h
@@ -42,7 +42,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
                                        int other_h, int this_w, int this_h,
-                                       int use_high);
+                                       int use_highbd);
 #else
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
                                        int other_h, int this_w, int this_h);
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 5354105f8..9a582fffb 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -93,7 +93,7 @@ typedef struct VP9Decoder {
 } VP9Decoder;
 
 int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size,
-                                const uint8_t **dest);
+                                const uint8_t **psource);
 
 int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
                       vp9_ppflags_t *flags);
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index 513718e7c..f8dd0a6f7 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -23,13 +23,13 @@ void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
                             int skip_block, const int16_t *round_ptr,
                             const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                             tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr, const int16_t *scan_ptr,
-                            const int16_t *iscan_ptr) {
+                            uint16_t *eob_ptr, const int16_t *scan,
+                            const int16_t *iscan) {
   tran_low_t temp_buffer[64];
   (void)coeff_ptr;
 
   vpx_fdct8x8_neon(input, temp_buffer, stride);
   vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, round_ptr, quant_ptr,
-                       qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan_ptr,
-                       iscan_ptr);
+                       qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                       iscan);
 }
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 97a09bdff..2cec8bd03 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -122,7 +122,7 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
                                 const int16_t *quant_ptr,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan_ptr) {
+                                const int16_t *scan, const int16_t *iscan) {
   const int16x8_t one = vdupq_n_s16(1);
   const int16x8_t neg_one = vdupq_n_s16(-1);
 
@@ -134,8 +134,8 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
   const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
 
   // Process dc and the first seven ac coeffs.
-  const uint16x8_t iscan =
-      vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+  const uint16x8_t v_iscan =
+      vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
   const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
   const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
   const int16x8_t coeff_abs = vabsq_s16(coeff);
@@ -169,12 +169,12 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
 
   dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
 
-  eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+  eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
 
   store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
   store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
 
-  iscan_ptr += 8;
+  iscan += 8;
   coeff_ptr += 8;
   qcoeff_ptr += 8;
   dqcoeff_ptr += 8;
@@ -188,8 +188,8 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
 
     // Process the rest of the ac coeffs.
     for (i = 8; i < 32 * 32; i += 8) {
-      const uint16x8_t iscan =
-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+      const uint16x8_t v_iscan =
+          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
       const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
       const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
       const int16x8_t coeff_abs = vabsq_s16(coeff);
@@ -215,12 +215,12 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
           vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
 
       eob_max =
-          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
 
       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
 
-      iscan_ptr += 8;
+      iscan += 8;
       coeff_ptr += 8;
       qcoeff_ptr += 8;
       dqcoeff_ptr += 8;
diff --git a/vp9/encoder/ppc/vp9_quantize_vsx.c b/vp9/encoder/ppc/vp9_quantize_vsx.c
index 3720b0876..4f88b8fff 100644
--- a/vp9/encoder/ppc/vp9_quantize_vsx.c
+++ b/vp9/encoder/ppc/vp9_quantize_vsx.c
@@ -42,8 +42,8 @@ void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *round_ptr,
                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan_ptr,
-                         const int16_t *iscan_ptr) {
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan) {
   int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
   bool16x8_t zero_coeff0, zero_coeff1;
 
@@ -52,10 +52,10 @@ void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
   int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
   int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
-  int16x8_t scan0 = vec_vsx_ld(0, iscan_ptr);
-  int16x8_t scan1 = vec_vsx_ld(16, iscan_ptr);
+  int16x8_t scan0 = vec_vsx_ld(0, iscan);
+  int16x8_t scan1 = vec_vsx_ld(16, iscan);
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
@@ -103,9 +103,9 @@ void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       coeff0 = vec_vsx_ld(off0, coeff_ptr);
       coeff1 = vec_vsx_ld(off1, coeff_ptr);
       coeff2 = vec_vsx_ld(off2, coeff_ptr);
-      scan0 = vec_vsx_ld(off0, iscan_ptr);
-      scan1 = vec_vsx_ld(off1, iscan_ptr);
-      scan2 = vec_vsx_ld(off2, iscan_ptr);
+      scan0 = vec_vsx_ld(off0, iscan);
+      scan1 = vec_vsx_ld(off1, iscan);
+      scan2 = vec_vsx_ld(off2, iscan);
 
       qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant);
       zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
@@ -169,8 +169,7 @@ void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                                tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                               const int16_t *scan_ptr,
-                               const int16_t *iscan_ptr) {
+                               const int16_t *scan, const int16_t *iscan) {
   // In stage 1, we quantize 16 coeffs (DC + 15 AC)
   // In stage 2, we loop 42 times and quantize 24 coeffs per iteration
   // (32 * 32 - 16) / 24 = 42
@@ -188,13 +187,13 @@ void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
   int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
   int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
-  int16x8_t scan0 = vec_vsx_ld(0, iscan_ptr);
-  int16x8_t scan1 = vec_vsx_ld(16, iscan_ptr);
+  int16x8_t scan0 = vec_vsx_ld(0, iscan);
+  int16x8_t scan1 = vec_vsx_ld(16, iscan);
   int16x8_t thres = vec_sra(dequant, vec_splats((uint16_t)2));
   int16x8_t abs_coeff0 = vec_abs(coeff0);
   int16x8_t abs_coeff1 = vec_abs(coeff1);
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   (void)n_coeffs;
   assert(!skip_block);
@@ -238,9 +237,9 @@ void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     coeff0 = vec_vsx_ld(off0, coeff_ptr);
     coeff1 = vec_vsx_ld(off1, coeff_ptr);
     coeff2 = vec_vsx_ld(off2, coeff_ptr);
-    scan0 = vec_vsx_ld(off0, iscan_ptr);
-    scan1 = vec_vsx_ld(off1, iscan_ptr);
-    scan2 = vec_vsx_ld(off2, iscan_ptr);
+    scan0 = vec_vsx_ld(off0, iscan);
+    scan1 = vec_vsx_ld(off1, iscan);
+    scan2 = vec_vsx_ld(off2, iscan);
 
     abs_coeff0 = vec_abs(coeff0);
     abs_coeff1 = vec_abs(coeff1);
diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h
index 8bbf85787..2f1be4b23 100644
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -27,7 +27,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref,
                    unsigned int *const max_mv_magnitude);
 
 void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
-                              const nmv_context *mvctx, int usehp);
+                              const nmv_context *ctx, int usehp);
 
 void vp9_update_mv_count(ThreadData *td);
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 07289588c..02814599d 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -811,7 +811,7 @@ void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
 // frame is made and not just a copy of the pointer..
 int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                          int64_t end_time_stamp);
+                          int64_t end_time);
 
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest, int64_t *time_stamp,
@@ -832,9 +832,11 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
 
 int vp9_update_entropy(VP9_COMP *cpi, int update);
 
-int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols);
 
-int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols);
 
 int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
                           VPX_SCALING vert_mode);
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 6bd85a152..558ecbcd8 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -59,7 +59,7 @@ struct SPEED_FEATURES;
 int vp9_init_search_range(int size);
 
 int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv,
-                            int sad_per_bit, int distance,
+                            int error_per_bit, int search_range,
                             const struct vp9_variance_vtable *fn_ptr,
                             const struct mv *center_mv);
 
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 3b441bf1f..a9f75555e 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -194,7 +194,7 @@ struct VP9EncoderConfig;
 void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
                  RATE_CONTROL *rc);
 
-int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
                            double correction_factor, vpx_bit_depth_t bit_depth);
 
 double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
@@ -205,9 +205,9 @@ void vp9_rc_init_minq_luts(void);
 
 int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate);
 // Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to
-// be passed in to ensure that the max_gf_interval returned is at least as bis
+// be passed in to ensure that the max_gf_interval returned is at least as big
 // as that.
-int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
 
 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
@@ -253,7 +253,7 @@ int vp9_rc_drop_frame(struct VP9_COMP *cpi);
 
 // Computes frame size bounds.
 void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi,
-                                      int this_frame_target,
+                                      int frame_target,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit);
 
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index f2fc776a4..a1a98bd91 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -145,7 +145,7 @@ void vp9_initialize_rd_consts(struct VP9_COMP *cpi);
 
 void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
 
-void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                   unsigned int qstep, int *rate, int64_t *dist);
 
 void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
@@ -176,8 +176,8 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
 
 void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
 
-void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize,
-                               int best_mode_index);
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index);
 
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                       const int *const thresh_fact) {
diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index 293cdcd67..0cecd6540 100644
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -185,8 +185,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
                             int skip_block, const int16_t *round_ptr,
                             const int16_t *quant_ptr, int16_t *qcoeff_ptr,
                             int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr, const int16_t *scan_ptr,
-                            const int16_t *iscan_ptr) {
+                            uint16_t *eob_ptr, const int16_t *scan,
+                            const int16_t *iscan) {
   __m128i zero;
   int pass;
 
@@ -215,7 +215,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
   __m128i *in[8];
   int index = 0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)coeff_ptr;
 
   // Pre-condition input (shift by two)
@@ -449,7 +449,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
     in7 = _mm_srai_epi16(in7, 1);
   }
 
-  iscan_ptr += n_coeffs;
+  iscan += n_coeffs;
   qcoeff_ptr += n_coeffs;
   dqcoeff_ptr += n_coeffs;
   n_coeffs = -n_coeffs;
@@ -518,8 +518,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+        iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
         // Add one to convert from indices to counts
         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
@@ -582,8 +582,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+        iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
         // Add one to convert from indices to counts
         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index bf874a09e..99c193894 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -18,11 +18,13 @@
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
-void vp9_fdct8x8_quant_ssse3(
-    const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs,
-    int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-    uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
+                             tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *round_ptr,
+                             const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                             tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan) {
   __m128i zero;
   int pass;
 
@@ -52,7 +54,7 @@ void vp9_fdct8x8_quant_ssse3(
   __m128i *in[8];
   int index = 0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)coeff_ptr;
 
   // Pre-condition input (shift by two)
@@ -280,7 +282,7 @@ void vp9_fdct8x8_quant_ssse3(
     in7 = _mm_srai_epi16(in7, 1);
   }
 
-  iscan_ptr += n_coeffs;
+  iscan += n_coeffs;
   qcoeff_ptr += n_coeffs;
   dqcoeff_ptr += n_coeffs;
   n_coeffs = -n_coeffs;
@@ -350,8 +352,8 @@ void vp9_fdct8x8_quant_ssse3(
         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+        iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
         // Add one to convert from indices to counts
         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
@@ -427,8 +429,8 @@ void vp9_fdct8x8_quant_ssse3(
         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+        iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
         // Add one to convert from indices to counts
         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c
index 4bebc34d6..556a9fbaa 100644
--- a/vp9/encoder/x86/vp9_quantize_avx2.c
+++ b/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -50,18 +50,18 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           int skip_block, const int16_t *round_ptr,
                           const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                           tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan_ptr,
-                          const int16_t *iscan_ptr) {
+                          uint16_t *eob_ptr, const int16_t *scan,
+                          const int16_t *iscan) {
   __m128i eob;
   __m256i round256, quant256, dequant256;
   __m256i eob256, thr256;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
   coeff_ptr += n_coeffs;
-  iscan_ptr += n_coeffs;
+  iscan += n_coeffs;
   qcoeff_ptr += n_coeffs;
   dqcoeff_ptr += n_coeffs;
   n_coeffs = -n_coeffs;
@@ -97,7 +97,7 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
     }
 
-    eob256 = scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256);
+    eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256);
     n_coeffs += 8 * 2;
   }
 
@@ -124,8 +124,7 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
       store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
       eob256 = _mm256_max_epi16(
-          eob256,
-          scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256));
+          eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256));
     } else {
       store_zero_tran_low(qcoeff_ptr + n_coeffs);
       store_zero_tran_low(dqcoeff_ptr + n_coeffs);
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index ca0ad4407..885220a71 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -21,20 +21,20 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           int skip_block, const int16_t *round_ptr,
                           const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                           tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan_ptr,
-                          const int16_t *iscan_ptr) {
+                          uint16_t *eob_ptr, const int16_t *scan,
+                          const int16_t *iscan) {
   __m128i zero;
   __m128i thr;
   int16_t nzflag;
   __m128i eob;
   __m128i round, quant, dequant;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
   coeff_ptr += n_coeffs;
-  iscan_ptr += n_coeffs;
+  iscan += n_coeffs;
   qcoeff_ptr += n_coeffs;
   dqcoeff_ptr += n_coeffs;
   n_coeffs = -n_coeffs;
@@ -100,8 +100,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
       nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
       nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+      iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
+      iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
       // Add one to convert from indices to counts
       iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
       iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
@@ -175,8 +175,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
       nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
       nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+      iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
+      iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
       // Add one to convert from indices to counts
       iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
       iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index f47cce4d2..aa60f44f7 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -128,7 +128,7 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
       int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
     const int16_t *filter = filter_kernel[offset];                            \
     if (step_q4 == 16 && filter[3] != 128) {                                  \
-      if (filter[0] | filter[1] | filter[2]) {                                \
+      if (filter[0] | filter[1] | filter[6] | filter[7]) {                    \
         while (w >= 16) {                                                     \
           vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
               src_start, src_stride, dst, dst_stride, h, filter, bd);         \
@@ -150,6 +150,28 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
           dst += 4;                                                           \
           w -= 4;                                                             \
         }                                                                     \
+      } else if (filter[2] | filter[5]) {                                     \
+        while (w >= 16) {                                                     \
+          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
+          src += 16;                                                          \
+          dst += 16;                                                          \
+          w -= 16;                                                            \
+        }                                                                     \
+        while (w >= 8) {                                                      \
+          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
+          src += 8;                                                           \
+          dst += 8;                                                           \
+          w -= 8;                                                             \
+        }                                                                     \
+        while (w >= 4) {                                                      \
+          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
+          src += 4;                                                           \
+          dst += 4;                                                           \
+          w -= 4;                                                             \
+        }                                                                     \
       } else {                                                                \
         while (w >= 16) {                                                     \
           vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h
index e9fc9c06a..99bc9637f 100644
--- a/vpx_dsp/x86/convolve_avx2.h
+++ b/vpx_dsp/x86/convolve_avx2.h
@@ -134,6 +134,13 @@ static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
       _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
 }
 
+static INLINE __m256i mm256_round_epi32(const __m256i *const src,
+                                        const __m256i *const half_depth,
+                                        const int depth) {
+  const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth);
+  return _mm256_srai_epi32(nearest_src, depth);
+}
+
 static INLINE __m256i mm256_round_epi16(const __m256i *const src,
                                         const __m256i *const half_depth,
                                         const int depth) {
@@ -141,6 +148,15 @@ static INLINE __m256i mm256_round_epi16(const __m256i *const src,
   return _mm256_srai_epi16(nearest_src, depth);
 }
 
+static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0,
+                                           const __m256i *const src_1,
+                                           const __m256i *const ker_0,
+                                           const __m256i *const ker_1) {
+  const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0);
+  const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1);
+  return _mm256_add_epi32(tmp_0, tmp_1);
+}
+
 #undef MM256_BROADCASTSI128_SI256
 
 #endif  // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/vpx_dsp/x86/convolve_sse2.h b/vpx_dsp/x86/convolve_sse2.h
index 81fae2951..844354639 100644
--- a/vpx_dsp/x86/convolve_sse2.h
+++ b/vpx_dsp/x86/convolve_sse2.h
@@ -32,10 +32,10 @@ static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
 // Interprets src as 8-bit words, zero extends to form 16-bit words, then
 // multiplies with ker and add the adjacent results to form 32-bit words.
 // Finally adds the result from 1 and 2 together.
-static INLINE __m128i multiply_add_epi8_sse2(const __m128i *const src_1,
-                                             const __m128i *const src_2,
-                                             const __m128i *const ker_1,
-                                             const __m128i *const ker_2) {
+static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1,
+                                            const __m128i *const src_2,
+                                            const __m128i *const ker_1,
+                                            const __m128i *const ker_2) {
   const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128());
   const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128());
   const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1);
@@ -43,25 +43,44 @@ static INLINE __m128i multiply_add_epi8_sse2(const __m128i *const src_1,
   return _mm_add_epi32(madd_1, madd_2);
 }
 
-static INLINE __m128i multiply_add_packs_epi16_sse2(const __m128i *const src_0,
-                                                    const __m128i *const src_1,
-                                                    const __m128i *const ker) {
+// Interprets src as 16-bit words, then multiplies with ker and add the
+// adjacent results to form 32-bit words. Finally adds the result from 1 and 2
+// together.
+static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1,
+                                             const __m128i *const src_2,
+                                             const __m128i *const ker_1,
+                                             const __m128i *const ker_2) {
+  const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1);
+  const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2);
+  return _mm_add_epi32(madd_1, madd_2);
+}
+
+static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0,
+                                               const __m128i *const src_1,
+                                               const __m128i *const ker) {
   const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker);
   const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker);
   return _mm_packs_epi32(madd_1, madd_2);
 }
 
 // Interleaves src_1 and src_2
-static INLINE __m128i combine_epi32_sse2(const __m128i *const src_1,
-                                         const __m128i *const src_2) {
+static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1,
+                                        const __m128i *const src_2) {
   const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2);
   const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2);
   return _mm_packs_epi32(tmp_1, tmp_2);
 }
 
-static INLINE __m128i round_epi16_sse2(const __m128i *const src,
-                                       const __m128i *const half_depth,
-                                       const int depth) {
+static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src,
+                                          const __m128i *const half_depth,
+                                          const int depth) {
+  const __m128i nearest_src = _mm_add_epi32(*src, *half_depth);
+  return _mm_srai_epi32(nearest_src, depth);
+}
+
+static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src,
+                                          const __m128i *const half_depth,
+                                          const int depth) {
   const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);
   return _mm_srai_epi16(nearest_src, depth);
 }
diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c
index ef94522a3..ff5ef5f85 100644
--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -9,9 +9,9 @@
  */
 
 #include <immintrin.h>
-
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_avx2.h"
 
 // -----------------------------------------------------------------------------
 // Copy and average
@@ -209,6 +209,7 @@ static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
 static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
 
 #define CONV8_ROUNDING_BITS (7)
+#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
 
 // -----------------------------------------------------------------------------
 // Horizontal Filtering
@@ -923,6 +924,200 @@ static void vpx_highbd_filter_block1d16_h8_avg_avx2(
   } while (height > 0);
 }
 
+static void vpx_highbd_filter_block1d4_h4_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We extract the middle four elements of the kernel into two registers in
+  // the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add on the two
+  // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we
+  // can do this two rows at a time.
+
+  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m256i res_reg;
+  __m256i idx_shift_0 =
+      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
+                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
+  __m256i idx_shift_2 =
+      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
+                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg, kernel_reg_23,
+      kernel_reg_45;  // Segments of the kernel used
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t unrolled_src_stride = src_stride << 1;
+  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  for (h = height; h >= 2; h -= 2) {
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Get the output
+    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    // Round the result
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Finally combine to get the final dst
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                        &res_reg);
+
+    src_ptr += unrolled_src_stride;
+    dst_ptr += unrolled_dst_stride;
+  }
+
+  // Repeat for the last row if needed
+  if (h > 0) {
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Get the output
+    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    // Round the result
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Finally combine to get the final dst
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+  }
+}
+
+void vpx_highbd_filter_block1d8_h4_avx2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will extract the middle four elements of the kernel into two registers
+  // in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum of the first half.
+  // Calling add gives us first half of the output. Repat again to get the whole
+  // output. Since avx2 allows us to use 256-bit buffer, we can do this two rows
+  // at a time.
+
+  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m256i res_reg, res_first, res_last;
+  __m256i idx_shift_0 =
+      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
+                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
+  __m256i idx_shift_2 =
+      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
+                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg, kernel_reg_23,
+      kernel_reg_45;  // Segments of the kernel used
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t unrolled_src_stride = src_stride << 1;
+  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  for (h = height; h >= 2; h -= 2) {
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Result for first half
+    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                     &kernel_reg_23, &kernel_reg_45);
+
+    // Do again to get the second half of dst
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Result for second half
+    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                    &kernel_reg_23, &kernel_reg_45);
+
+    // Round each result
+    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Finally combine to get the final dst
+    res_reg = _mm256_packus_epi32(res_first, res_last);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                       &res_reg);
+
+    src_ptr += unrolled_src_stride;
+    dst_ptr += unrolled_dst_stride;
+  }
+
+  // Repeat for the last row if needed
+  if (h > 0) {
+    src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
+    // Reorder into 2 1 1 2
+    src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
+
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    res_reg = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_permute4x64_epi64(res_reg, 0x8);
+
+    _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+  }
+}
+
+static void vpx_highbd_filter_block1d16_h4_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+
 static void vpx_highbd_filter_block1d8_v8_avg_avx2(
     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
@@ -1058,39 +1253,239 @@ static void vpx_highbd_filter_block1d8_v2_avg_avx2(
   } while (height > 0);
 }
 
-void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
+void vpx_highbd_filter_block1d4_v4_avx2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will load two rows of pixels and rearrange them into the form
+  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel partial output. Then
+  // we can call add with another row to get the output.
+
+  // Register for source s[-1:3, :]
+  __m256i src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+  __m256i src_reg_m1001, src_reg_1223;
+
+  // Result after multiply and add
+  __m256i res_reg;
+
+  __m128i kernel_reg_128;                            // Kernel
+  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used
+
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // We only need to go num_taps/2 - 1 row above the souce, so we move
+  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+  src_ptr += src_stride_unrolled;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  // Row -1 to row 0
+  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
+                                   (const __m128i *)(src_ptr + src_stride));
+
+  // Row 0 to row 1
+  src_reg_1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+  // First three rows
+  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+                                         _mm256_castsi256_si128(src_reg_2), 1);
+
+    src_reg_3 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+                                         _mm256_castsi256_si128(src_reg_3), 1);
+
+    // Last three rows
+    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+
+    // Output
+    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    // Round the words
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine to get the result
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+    // Save the result
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                        &res_reg);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001 = src_reg_1223;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+void vpx_highbd_filter_block1d8_v4_avx2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will load two rows of pixels and rearrange them into the form
+  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel partial output. Then
+  // we can call add with another row to get the output.
+
+  // Register for source s[-1:3, :]
+  __m256i src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
+
+  __m128i kernel_reg_128;                            // Kernel
+  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel
+
+  // Result after multiply and add
+  __m256i res_reg, res_reg_lo, res_reg_hi;
+
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // We only need to go num_taps/2 - 1 row above the souce, so we move
+  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+  src_ptr += src_stride_unrolled;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  // Row -1 to row 0
+  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+                                   (const __m128i *)(src_ptr + src_stride));
+
+  // Row 0 to row 1
+  src_reg_1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+  // First three rows
+  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
+
+    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+                                         _mm256_castsi256_si128(src_reg_2), 1);
+
+    src_reg_3 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
+
+    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+                                         _mm256_castsi256_si128(src_reg_3), 1);
+
+    // Last three rows
+    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
+
+    // Output from first half
+    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
+                                      &kernel_reg_23, &kernel_reg_45);
+
+    // Output from second half
+    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
+                                      &kernel_reg_23, &kernel_reg_45);
+
+    // Round the words
+    res_reg_lo =
+        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_hi =
+        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine to get the result
+    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+    // Save the result
+    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                       &res_reg);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001_lo = src_reg_1223_lo;
+    src_reg_m1001_hi = src_reg_1223_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+void vpx_highbd_filter_block1d16_v4_avx2(const uint16_t *src_ptr,
+                                         ptrdiff_t src_stride,
+                                         uint16_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+
 #define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
 #define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
 #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
 
+#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
+  vpx_highbd_filter_block1d16_v8_avg_avx2
+#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
+  vpx_highbd_filter_block1d16_h8_avg_avx2
+#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
+  vpx_highbd_filter_block1d8_v8_avg_avx2
+#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
+  vpx_highbd_filter_block1d8_h8_avg_avx2
+#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
+  vpx_highbd_filter_block1d4_v8_avg_avx2
+#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
+  vpx_highbd_filter_block1d4_h8_avg_avx2
+
 HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
 HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
 HIGH_FUN_CONV_2D(, avx2);
 
-void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+
 #define vpx_highbd_filter_block1d4_h8_avg_avx2 \
   vpx_highbd_filter_block1d4_h8_avg_sse2
 #define vpx_highbd_filter_block1d4_h2_avg_avx2 \
diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c
index 80c7654d5..9d6f83787 100644
--- a/vpx_dsp/x86/vpx_asm_stubs.c
+++ b/vpx_dsp/x86/vpx_asm_stubs.c
@@ -104,6 +104,25 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
 
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v4_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h4_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v4_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h4_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v4_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h4_sse2;
+#define vpx_highbd_filter_block1d16_v4_avg_sse2 \
+  vpx_highbd_filter_block1d16_v8_avg_sse2
+#define vpx_highbd_filter_block1d16_h4_avg_sse2 \
+  vpx_highbd_filter_block1d16_h8_avg_sse2
+#define vpx_highbd_filter_block1d8_v4_avg_sse2 \
+  vpx_highbd_filter_block1d8_v8_avg_sse2
+#define vpx_highbd_filter_block1d8_h4_avg_sse2 \
+  vpx_highbd_filter_block1d8_h8_avg_sse2
+#define vpx_highbd_filter_block1d4_v4_avg_sse2 \
+  vpx_highbd_filter_block1d4_v8_avg_sse2
+#define vpx_highbd_filter_block1d4_h4_avg_sse2 \
+  vpx_highbd_filter_block1d4_h8_avg_sse2
+
 highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
 highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
 highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index fa223aed0..0be2c0fef 100644
--- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -16,6 +16,9 @@
 #include "vpx_dsp/x86/convolve_sse2.h"
 #include "vpx_ports/mem.h"
 
+#define CONV8_ROUNDING_BITS (7)
+#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
+
 void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
                                   uint8_t *dst_ptr, ptrdiff_t dst_stride,
                                   uint32_t height, const int16_t *kernel) {
@@ -54,15 +57,15 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
 
     // Output 6 4 2 0
-    even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
-                                  &kernel_reg_45);
+    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                 &kernel_reg_45);
 
     // Output 7 5 3 1
-    odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
-                                 &kernel_reg_23, &kernel_reg_45);
+    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                &kernel_reg_23, &kernel_reg_45);
 
     // Combine to get the first half of the dst
-    dst_first = combine_epi32_sse2(&even, &odd);
+    dst_first = mm_zip_epi32_sse2(&even, &odd);
 
     // Do again to get the second half of dst
     src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
@@ -71,19 +74,19 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
 
     // Output 14 12 10 8
-    even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
-                                  &kernel_reg_45);
+    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                 &kernel_reg_45);
 
     // Output 15 13 11 9
-    odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
-                                 &kernel_reg_23, &kernel_reg_45);
+    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                &kernel_reg_23, &kernel_reg_45);
 
     // Combine to get the second half of the dst
-    dst_second = combine_epi32_sse2(&even, &odd);
+    dst_second = mm_zip_epi32_sse2(&even, &odd);
 
     // Round each result
-    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
-    dst_second = round_epi16_sse2(&dst_second, &reg_32, 6);
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
 
     // Finally combine to get the final dst
     dst_first = _mm_packus_epi16(dst_first, dst_second);
@@ -181,21 +184,21 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
 
     // Partial output from first half
-    res_reg_m10_lo = multiply_add_packs_epi16_sse2(
+    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
         &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
 
-    res_reg_01_lo = multiply_add_packs_epi16_sse2(
-        &src_reg_01_lo_1, &src_reg_01_lo_2, &kernel_reg_23);
+    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
+                                             &kernel_reg_23);
 
     src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
     src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
-    res_reg_12_lo = multiply_add_packs_epi16_sse2(
-        &src_reg_12_lo_1, &src_reg_12_lo_2, &kernel_reg_45);
+    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
+                                             &kernel_reg_45);
 
     src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
     src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
-    res_reg_23_lo = multiply_add_packs_epi16_sse2(
-        &src_reg_23_lo_1, &src_reg_23_lo_2, &kernel_reg_45);
+    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
+                                             &kernel_reg_45);
 
     // Add to get first half of the results
     res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
@@ -203,31 +206,31 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
 
     // Now repeat everything again for the second half
     // Partial output for second half
-    res_reg_m10_hi = multiply_add_packs_epi16_sse2(
+    res_reg_m10_hi = mm_madd_packs_epi16_sse2(
         &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23);
 
-    res_reg_01_hi = multiply_add_packs_epi16_sse2(
-        &src_reg_01_hi_1, &src_reg_01_hi_2, &kernel_reg_23);
+    res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2,
+                                             &kernel_reg_23);
 
     src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128());
     src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128());
-    res_reg_12_hi = multiply_add_packs_epi16_sse2(
-        &src_reg_12_hi_1, &src_reg_12_hi_2, &kernel_reg_45);
+    res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2,
+                                             &kernel_reg_45);
 
     src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128());
     src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128());
-    res_reg_23_hi = multiply_add_packs_epi16_sse2(
-        &src_reg_23_hi_1, &src_reg_23_hi_2, &kernel_reg_45);
+    res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, &src_reg_23_hi_2,
+                                             &kernel_reg_45);
 
     // Second half of the results
     res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
     res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
 
     // Round the words
-    res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
-    res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
-    res_reg_m1012_hi = round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
-    res_reg_0123_hi = round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+    res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+    res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
 
     // Combine to get the result
     res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
@@ -288,16 +291,16 @@ void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
 
     // Output 6 4 2 0
-    even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
-                                  &kernel_reg_45);
+    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                 &kernel_reg_45);
 
     // Output 7 5 3 1
-    odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
-                                 &kernel_reg_23, &kernel_reg_45);
+    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                &kernel_reg_23, &kernel_reg_45);
 
     // Combine to get the first half of the dst
-    dst_first = combine_epi32_sse2(&even, &odd);
-    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_first = mm_zip_epi32_sse2(&even, &odd);
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
 
     // Saturate and convert to 8-bit words
     dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
@@ -383,29 +386,29 @@ void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
 
     // Partial output
-    res_reg_m10_lo = multiply_add_packs_epi16_sse2(
+    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
         &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
 
-    res_reg_01_lo = multiply_add_packs_epi16_sse2(
-        &src_reg_01_lo_1, &src_reg_01_lo_2, &kernel_reg_23);
+    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
+                                             &kernel_reg_23);
 
     src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
     src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
-    res_reg_12_lo = multiply_add_packs_epi16_sse2(
-        &src_reg_12_lo_1, &src_reg_12_lo_2, &kernel_reg_45);
+    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
+                                             &kernel_reg_45);
 
     src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
     src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
-    res_reg_23_lo = multiply_add_packs_epi16_sse2(
-        &src_reg_23_lo_1, &src_reg_23_lo_2, &kernel_reg_45);
+    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
+                                             &kernel_reg_45);
 
     // Add to get results
     res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
     res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
 
     // Round the words
-    res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
-    res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
 
     // Convert to 8-bit words
     res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128());
@@ -480,7 +483,7 @@ void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     dst_first = _mm_add_epi32(tmp_0, tmp_1);
     dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128());
 
-    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
 
     // Saturate and convert to 8-bit words
     dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
@@ -565,27 +568,27 @@ void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
 
     // Partial output
-    res_reg_m10_lo = multiply_add_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero,
-                                                   &kernel_reg_23);
+    res_reg_m10_lo =
+        mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero, &kernel_reg_23);
 
-    res_reg_01_lo = multiply_add_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero,
-                                                  &kernel_reg_23);
+    res_reg_01_lo =
+        mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero, &kernel_reg_23);
 
     src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
-    res_reg_12_lo = multiply_add_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero,
-                                                  &kernel_reg_45);
+    res_reg_12_lo =
+        mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero, &kernel_reg_45);
 
     src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
-    res_reg_23_lo = multiply_add_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero,
-                                                  &kernel_reg_45);
+    res_reg_23_lo =
+        mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero, &kernel_reg_45);
 
     // Add to get results
     res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
     res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
 
     // Round the words
-    res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
-    res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
 
     // Convert to 8-bit words
     res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero);
@@ -604,3 +607,399 @@ void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_1 = src_reg_3;
   }
 }
+
+void vpx_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will load multiple shifted versions of the row and shuffle them into
+  // 16-bit words of the form
+  // ... s[2] s[1] s[0] s[-1]
+  // ... s[4] s[3] s[2] s[1]
+  // Then we call multiply and add to get partial results
+  // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2]
+  // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4]
+  // The two results are then added together to get the even output
+
+  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+  __m128i res_reg;
+  __m128i even, odd;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);
+
+    // Output 2 0
+    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                  &kernel_reg_45);
+
+    // Output 3 1
+    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                 &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    res_reg = _mm_unpacklo_epi32(even, odd);
+    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = _mm_packs_epi32(res_reg, reg_zero);
+
+    // Saturate the result and save
+    res_reg = _mm_min_epi16(res_reg, reg_max);
+    res_reg = _mm_max_epi16(res_reg, reg_zero);
+    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+void vpx_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will load two rows of pixels as 16-bit words, and shuffle them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10, src_reg_01;
+  __m128i src_reg_12, src_reg_23;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
+  __m128i res_reg_m1012, res_reg_0123;
+
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // We only need to go num_taps/2 - 1 row above the source, so we move
+  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+  src_ptr += src_stride_unrolled;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+
+    // Partial output
+    res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23);
+    res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23);
+    res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45);
+    res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12);
+    res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23);
+
+    // Round the words
+    res_reg_m1012 =
+        mm_round_epi32_sse2(&res_reg_m1012, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123 =
+        mm_round_epi32_sse2(&res_reg_0123, &reg_round, CONV8_ROUNDING_BITS);
+
+    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero);
+    res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero);
+
+    // Saturate according to bit depth
+    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+    // Save only half of the register (8 words)
+    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10 = src_reg_12;
+    src_reg_01 = src_reg_23;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+void vpx_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will load multiple shifted versions of the row and shuffle them into
+  // 16-bit words of the form
+  // ... s[2] s[1] s[0] s[-1]
+  // ... s[4] s[3] s[2] s[1]
+  // Then we call multiply and add to get partial results
+  // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2]
+  // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4]
+  // The two results are then added together for the first half of even
+  // output.
+  // Repeat multiple times to get the whole outoput
+
+  __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2,
+      src_reg_shift_3;
+  __m128i res_reg;
+  __m128i even, odd;
+  __m128i tmp_0, tmp_1;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    // We will put first half in the first half of the reg, and second half in
+    // second half
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+    // Output 6 4 2 0
+    tmp_0 = _mm_srli_si128(src_reg, 4);
+    tmp_1 = _mm_srli_si128(src_reg_next, 2);
+    src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                  &kernel_reg_45);
+
+    // Output 7 5 3 1
+    tmp_0 = _mm_srli_si128(src_reg, 2);
+    tmp_1 = src_reg_next;
+    src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+    tmp_0 = _mm_srli_si128(src_reg, 6);
+    tmp_1 = _mm_srli_si128(src_reg_next, 4);
+    src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                 &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    even = mm_round_epi32_sse2(&even, &reg_round, CONV8_ROUNDING_BITS);
+    odd = mm_round_epi32_sse2(&odd, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = mm_zip_epi32_sse2(&even, &odd);
+
+    // Saturate the result and save
+    res_reg = _mm_min_epi16(res_reg, reg_max);
+    res_reg = _mm_max_epi16(res_reg, reg_zero);
+
+    _mm_store_si128((__m128i *)dst_ptr, res_reg);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+void vpx_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will load two rows of pixels as 16-bit words, and shuffle them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi;
+  __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi;
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo;
+  __m128i res_reg_m1012_hi, res_reg_0123_hi;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // We only need to go num_taps/2 - 1 row above the source, so we move
+  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+  src_ptr += src_stride_unrolled;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+  src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+  src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+    src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+    src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3);
+
+    // Partial output for first half
+    res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23);
+    res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23);
+    res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45);
+    res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo);
+    res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo);
+
+    // Round the words
+    res_reg_m1012_lo =
+        mm_round_epi32_sse2(&res_reg_m1012_lo, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123_lo =
+        mm_round_epi32_sse2(&res_reg_0123_lo, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Partial output for first half
+    res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23);
+    res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23);
+    res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45);
+    res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi);
+    res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi);
+
+    // Round the words
+    res_reg_m1012_hi =
+        mm_round_epi32_sse2(&res_reg_m1012_hi, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123_hi =
+        mm_round_epi32_sse2(&res_reg_0123_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine the two halfs
+    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi);
+    res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi);
+
+    // Saturate according to bit depth
+    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+    // Save only half of the register (8 words)
+    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10_lo = src_reg_12_lo;
+    src_reg_m10_hi = src_reg_12_hi;
+    src_reg_01_lo = src_reg_23_lo;
+    src_reg_01_hi = src_reg_23_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+void vpx_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
+                                         ptrdiff_t src_stride,
+                                         uint16_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+
+void vpx_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
+                                         ptrdiff_t src_stride,
+                                         uint16_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 0ccf89694..b55b7e57a 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -623,7 +623,7 @@ void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride,
                               _mm256_castsi256_si128(kernel_reg_45));
     dst_reg = _mm_adds_epi16(tmp_0, tmp_1);
 
-    dst_reg = round_epi16_sse2(&dst_reg, &reg_32, 6);
+    dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32, 6);
 
     dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128());
 
@@ -797,7 +797,7 @@ void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     dst = _mm_hadds_epi16(dst, _mm_setzero_si128());
 
     // Round result
-    dst = round_epi16_sse2(&dst, &reg_32, 6);
+    dst = mm_round_epi16_sse2(&dst, &reg_32, 6);
 
     // Pack to 8-bits
     dst = _mm_packus_epi16(dst, _mm_setzero_si128());
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 9e5b73047..b5f6ca57d 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -246,8 +246,8 @@ void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
     dst_second = _mm_adds_epi16(tmp_0, tmp_1);
 
     // Round each result
-    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
-    dst_second = round_epi16_sse2(&dst_second, &reg_32, 6);
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
 
     // Finally combine to get the final dst
     dst_first = _mm_packus_epi16(dst_first, dst_second);
@@ -348,10 +348,10 @@ void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
     res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
 
     // Round the words
-    res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
-    res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
-    res_reg_m1012_hi = round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
-    res_reg_0123_hi = round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+    res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+    res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
 
     // Combine to get the result
     res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
@@ -421,7 +421,7 @@ void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
     dst_first = _mm_adds_epi16(tmp_0, tmp_1);
 
     // Round round result
-    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
 
     // Pack to 8-bits
     dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
@@ -504,8 +504,8 @@ void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
     res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23);
 
     // Round the words
-    res_reg_m1012 = round_epi16_sse2(&res_reg_m1012, &reg_32, 6);
-    res_reg_0123 = round_epi16_sse2(&res_reg_0123, &reg_32, 6);
+    res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, &reg_32, 6);
+    res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, &reg_32, 6);
 
     // Pack from 16-bit to 8-bit
     res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128());
@@ -563,7 +563,7 @@ void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
     dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128());
 
     // Round result
-    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
 
     // Pack to 8-bits
     dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
@@ -648,8 +648,8 @@ void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
     reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128());
 
     // Round the words
-    reg_0 = round_epi16_sse2(&reg_0, &reg_32, 6);
-    reg_1 = round_epi16_sse2(&reg_1, &reg_32, 6);
+    reg_0 = mm_round_epi16_sse2(&reg_0, &reg_32, 6);
+    reg_1 = mm_round_epi16_sse2(&reg_1, &reg_32, 6);
 
     // Pack from 16-bit to 8-bit and put them in the right order
     reg_0 = _mm_packus_epi16(reg_0, reg_0);