14 files changed, 352 insertions, 268 deletions
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
index e559272cd..751bc74bc 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
@@ -112,27 +112,27 @@
     vabd.u8     q4, q10, q9                 ; m6 = abs(q3 - q2)
 
     ; only compare the largest value to limit
-    vmax.u8     q11, q11, q12               ; m1 = max(m1, m2)
-    vmax.u8     q12, q13, q14               ; m2 = max(m3, m4)
+    vmax.u8     q11, q11, q12               ; m7 = max(m1, m2)
+    vmax.u8     q12, q13, q14               ; m8 = max(m3, m4)
 
     vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
 
-    vmax.u8     q3, q3, q4                  ; m3 = max(m5, m6)
+    vmax.u8     q3, q3, q4                  ; m9 = max(m5, m6)
 
     vmov.u8     q10, #0x80
 
-    vmax.u8     q15, q11, q12               ; m1 = max(m1, m2)
+    vmax.u8     q15, q11, q12               ; m10 = max(m7, m8)
 
     vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
     vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     q15, q15, q3                ; m1 = max(m1, m3)
+    vmax.u8     q15, q15, q3                ; m11 = max(m10, m9)
 
     vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
     vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
 
     veor        q7, q7, q10                 ; qs0
 
-    vcge.u8     q15, q1, q15                ; abs(m1) > limit
+    vcge.u8     q15, q1, q15                ; abs(m11) > limit
 
     vshr.u8     q2, q2, #1                  ; a = a / 2
     veor        q6, q6, q10                 ; ps0
@@ -142,7 +142,7 @@
 
     veor        q8, q8, q10                 ; qs1
 
-    vmov.u8     q4, #3
+    vmov.u16    q4, #3
 
     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
     vsubl.s8    q11, d15, d13
@@ -150,13 +150,15 @@
     vcge.u8     q9, q0, q9                  ; a > blimit
 
     vqsub.s8    q1, q5, q8                  ; filter = clamp(ps1-qs1)
-    vorr        q14, q13, q14               ; hevmask
+    vorr        q14, q13, q14               ; hev
 
     vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
     vmul.i16    q11, q11, q4
 
     vand        q1, q1, q14                 ; filter &= hev
-    vand        q15, q15, q9                ; filter_mask
+    vand        q15, q15, q9                ; mask
+
+    vmov.u8     q4, #3
 
     vaddw.s8    q2, q2, d2                  ; filter + 3 * (qs0 - ps0)
     vaddw.s8    q11, q11, d3
@@ -180,15 +182,14 @@
     ; outer tap adjustments
     vrshr.s8    q1, q1, #1                  ; filter = ++filter1 >> 1
 
-    veor        q6, q11, q10                ; *op0 = u^0x80
+    veor        q7, q0,  q10                ; *oq0 = u^0x80
 
     vbic        q1, q1, q14                 ; filter &= ~hev
 
     vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + filter)
     vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - filter)
 
-
-    veor        q7, q0,  q10                ; *oq0 = u^0x80
+    veor        q6, q11, q10                ; *op0 = u^0x80
     veor        q5, q13, q10                ; *op1 = u^0x80
     veor        q8, q12, q10                ; *oq1 = u^0x80
 
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 2f022dc1d..b97e7aa4a 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -10,17 +10,6 @@
 
 #include "./vp9_rtcd.h"
 
-void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
-                                             const uint8_t *blimit0,
-                                             const uint8_t *limit0,
-                                             const uint8_t *thresh0,
-                                             const uint8_t *blimit1,
-                                             const uint8_t *limit1,
-                                             const uint8_t *thresh1) {
-  vp9_loop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
-  vp9_loop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
-}
-
 void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
                                                const uint8_t *blimit0,
                                                const uint8_t *limit0,
@@ -31,3 +20,34 @@ void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
   vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
   vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
 }
+
+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+                                           const uint8_t *blimit0,
+                                           const uint8_t *limit0,
+                                           const uint8_t *thresh0,
+                                           const uint8_t *blimit1,
+                                           const uint8_t *limit1,
+                                           const uint8_t *thresh1) {
+  vp9_loop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+                                             const uint8_t *blimit0,
+                                             const uint8_t *limit0,
+                                             const uint8_t *thresh0,
+                                             const uint8_t *blimit1,
+                                             const uint8_t *limit1,
+                                             const uint8_t *thresh1) {
+  vp9_mbloop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1,
+                                       1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int p,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh) {
+  vp9_mb_lpf_vertical_edge_w_neon(s, p, blimit, limit, thresh);
+  vp9_mb_lpf_vertical_edge_w_neon(s + 8 * p, p, blimit, limit, thresh);
+}
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
index 36cfc83c4..0c0f155ae 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
@@ -306,4 +306,59 @@ void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s,
     }
   }
 }
+
+void vp9_loop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+                                              const uint8_t *blimit0,
+                                              const uint8_t *limit0,
+                                              const uint8_t *thresh0,
+                                              const uint8_t *blimit1,
+                                              const uint8_t *limit1,
+                                              const uint8_t *thresh1) {
+  vp9_loop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+                                                const uint8_t *blimit0,
+                                                const uint8_t *limit0,
+                                                const uint8_t *thresh0,
+                                                const uint8_t *blimit1,
+                                                const uint8_t *limit1,
+                                                const uint8_t *thresh1) {
+  vp9_mbloop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1,
+                                          1);
+}
+
+void vp9_loop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,
+                                            const uint8_t *blimit0,
+                                            const uint8_t *limit0,
+                                            const uint8_t *thresh0,
+                                            const uint8_t *blimit1,
+                                            const uint8_t *limit1,
+                                            const uint8_t *thresh1) {
+  vp9_loop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+                                      1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,
+                                              const uint8_t *blimit0,
+                                              const uint8_t *limit0,
+                                              const uint8_t *thresh0,
+                                              const uint8_t *blimit1,
+                                              const uint8_t *limit1,
+                                              const uint8_t *thresh1) {
+  vp9_mbloop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+                                       1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_dspr2(uint8_t *s, int p,
+                                         const uint8_t *blimit,
+                                         const uint8_t *limit,
+                                         const uint8_t *thresh) {
+  vp9_mb_lpf_vertical_edge_w_dspr2(s, p, blimit, limit, thresh);
+  vp9_mb_lpf_vertical_edge_w_dspr2(s + 8 * p, p, blimit, limit, thresh);
+}
 #endif  // #if HAVE_DSPR2
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 6e12638e3..28671c38c 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -211,8 +211,6 @@ void vp9_remove_common(VP9_COMMON *cm) {
 
 void vp9_initialize_common() {
   vp9_init_neighbors();
-  vp9_coef_tree_initialize();
-  vp9_entropy_mode_init();
 }
 
 void vp9_update_frame_size(VP9_COMMON *cm) {
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 37da92bd3..993ee7935 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -198,7 +198,6 @@ struct buf_2d {
 };
 
 struct macroblockd_plane {
-  int16_t *qcoeff;
   int16_t *dqcoeff;
   uint16_t *eobs;
   PLANE_TYPE plane_type;
@@ -387,19 +386,6 @@ static INLINE void foreach_transformed_block_uv(
     foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 }
 
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
-                               int raster_block, int stride) {
-  const int bw = b_width_log2(plane_bsize);
-  const int y = 4 * (raster_block >> bw);
-  const int x = 4 * (raster_block & ((1 << bw) - 1));
-  return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
-                                          int raster_block, int16_t *base) {
-  const int stride = 4 << b_width_log2(plane_bsize);
-  return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
 static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                      TX_SIZE tx_size, int block,
                                      int *x, int *y) {
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 0f978cc95..b35c43fcd 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -113,49 +113,6 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-// Array indices are identical to previously-existing CONTEXT_NODE indices
-const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
-  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
-  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
-  -ONE_TOKEN, 6,                              /* 2 = ONE */
-  8, 12,                                      /* 3 = LOW_VAL */
-  -TWO_TOKEN, 10,                            /* 4 = TWO */
-  -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
-  14, 16,                                   /* 6 = HIGH_LOW */
-  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
-  18, 20,                                   /* 8 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 9 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 10 = CAT_FIVE */
-};
-
-// Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
-  2, 6,                                     /* 0 = LOW_VAL */
-  -TWO_TOKEN, 4,                            /* 1 = TWO */
-  -THREE_TOKEN, -FOUR_TOKEN,                /* 2 = THREE */
-  8, 10,                                    /* 3 = HIGH_LOW */
-  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 4 = CAT_ONE */
-  12, 14,                                   /* 5 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 6 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 7 = CAT_FIVE */
-};
-
-
-
-struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-/* Trees for extra bits.  Probabilities are constant and
-   do not depend on previously encoded bits */
-
-static const vp9_prob Pcat1[] = { 159};
-static const vp9_prob Pcat2[] = { 165, 145};
-static const vp9_prob Pcat3[] = { 173, 148, 140};
-static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
-static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const vp9_prob Pcat6[] = {
-  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
-};
-
 const vp9_tree_index vp9_coefmodel_tree[6] = {
   -DCT_EOB_MODEL_TOKEN, 2,                      /* 0 = EOB */
   -ZERO_TOKEN, 4,                               /* 1 = ZERO */
@@ -446,43 +403,6 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
   extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
-
-static void init_bit_tree(vp9_tree_index *p, int n) {
-  int i = 0;
-
-  while (++i < n) {
-    p[0] = p[1] = i << 1;
-    p += 2;
-  }
-
-  p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
-  init_bit_tree(cat1, 1);
-  init_bit_tree(cat2, 2);
-  init_bit_tree(cat3, 3);
-  init_bit_tree(cat4, 4);
-  init_bit_tree(cat5, 5);
-  init_bit_tree(cat6, 14);
-}
-
-const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
-  {0, 0, 0, 0},           // ZERO_TOKEN
-  {0, 0, 0, 1},           // ONE_TOKEN
-  {0, 0, 0, 2},           // TWO_TOKEN
-  {0, 0, 0, 3},           // THREE_TOKEN
-  {0, 0, 0, 4},           // FOUR_TOKEN
-  {cat1, Pcat1, 1, 5},    // DCT_VAL_CATEGORY1
-  {cat2, Pcat2, 2, 7},    // DCT_VAL_CATEGORY2
-  {cat3, Pcat3, 3, 11},   // DCT_VAL_CATEGORY3
-  {cat4, Pcat4, 4, 19},   // DCT_VAL_CATEGORY4
-  {cat5, Pcat5, 5, 35},   // DCT_VAL_CATEGORY5
-  {cat6, Pcat6, 14, 67},  // DCT_VAL_CATEGORY6
-  {0, 0, 0, 0}            // DCT_EOB_TOKEN
-};
-
 #include "vp9/common/vp9_default_coef_probs.h"
 
 void vp9_default_coef_probs(VP9_COMMON *cm) {
@@ -492,11 +412,6 @@ void vp9_default_coef_probs(VP9_COMMON *cm) {
   vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
 }
 
-void vp9_coef_tree_initialize() {
-  init_bit_trees();
-  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
 #define COEF_COUNT_SAT 24
 #define COEF_MAX_UPDATE_FACTOR 112
 #define COEF_COUNT_SAT_KEY 24
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 92a6c592a..941b251c3 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -44,15 +44,9 @@
 extern DECLARE_ALIGNED(16, const uint8_t,
                        vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
-extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)];
-
-extern const vp9_tree_index vp9_coef_con_tree[];
-
 #define DCT_EOB_MODEL_TOKEN     3      /* EOB       Extra Bits 0+0 */
 extern const vp9_tree_index vp9_coefmodel_tree[];
 
-extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
 typedef struct {
   const vp9_tree_index *tree;
   const vp9_prob *prob;
@@ -105,8 +99,6 @@ typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
 
 struct VP9Common;
 void vp9_default_coef_probs(struct VP9Common *cm);
-
-void vp9_coef_tree_initialize();
 void vp9_adapt_coef_probs(struct VP9Common *cm);
 
 static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 5d74c6967..265242129 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -232,21 +232,18 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
   -D63_PRED, 16,                    /* 7 = D63_NODE */
   -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
 
 const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
   -INTER_OFFSET(ZEROMV), 2,
   -INTER_OFFSET(NEARESTMV), 4,
   -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
 };
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
 
 const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2,
   -PARTITION_HORZ, 4,
   -PARTITION_VERT, -PARTITION_SPLIT
 };
-struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
 static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
   9, 102, 187, 225
@@ -337,15 +334,6 @@ const vp9_tree_index vp9_switchable_interp_tree
   -EIGHTTAP, 2,
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
 };
-struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init() {
-  vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
-  vp9_tokens_from_tree(vp9_switchable_interp_encodings,
-                       vp9_switchable_interp_tree);
-  vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
-  vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree);
-}
 
 #define COUNT_SAT 20
 #define MAX_UPDATE_FACTOR 128
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 38b419948..df58bea3c 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -37,24 +37,13 @@ struct tx_counts {
 extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
 extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
-
 extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                             [PARTITION_TYPES - 1];
-
 extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
-extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-
 extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
-extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
 extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
-extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
-
 extern const vp9_tree_index vp9_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
-extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init();
 
 void vp9_setup_past_independence(struct VP9Common *cm);
 
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 9e4117e17..34411a34f 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -52,20 +52,22 @@ typedef enum PARTITION_TYPE {
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
 #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
 
+// block transform size
 typedef enum {
-  TX_4X4 = 0,                      // 4x4 dct transform
-  TX_8X8 = 1,                      // 8x8 dct transform
-  TX_16X16 = 2,                    // 16x16 dct transform
-  TX_32X32 = 3,                    // 32x32 dct transform
+  TX_4X4 = 0,                      // 4x4 transform
+  TX_8X8 = 1,                      // 8x8 transform
+  TX_16X16 = 2,                    // 16x16 transform
+  TX_32X32 = 3,                    // 32x32 transform
   TX_SIZES
 } TX_SIZE;
 
+// frame transform mode
 typedef enum {
-  ONLY_4X4            = 0,
-  ALLOW_8X8           = 1,
-  ALLOW_16X16         = 2,
-  ALLOW_32X32         = 3,
-  TX_MODE_SELECT      = 4,
+  ONLY_4X4            = 0,        // only 4x4 transform used
+  ALLOW_8X8           = 1,        // allow block transform size up to 8x8
+  ALLOW_16X16         = 2,        // allow block transform size up to 16x16
+  ALLOW_32X32         = 3,        // allow block transform size up to 32x32
+  TX_MODE_SELECT      = 4,        // transform specified for each block
   TX_MODES            = 5,
 } TX_MODE;
 
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 0b48de2cb..ff2bc45e4 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -353,29 +353,17 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
 
     // TODO(yunqingwang): count in loopfilter functions should be removed.
     if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
-          vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr);
-          vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr);
-        } else if (mask_16x16_0 & 1) {
-          vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr);
-        } else {
-          vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr);
-        }
+      if (mask_16x16_0 & 1) {
+        // if (mask_16x16_0 & 1) is 1, then (mask_16x16_1 & 1) is 1.
+        vp9_mb_lpf_vertical_edge_w_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                      lfi0->hev_thr);
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
-          vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
-                                          lfi0->hev_thr, 1);
-          vp9_mbloop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,
-                                          lfi1->lim, lfi1->hev_thr, 1);
+          vp9_mbloop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                          lfi0->hev_thr, lfi1->mblim,
+                                          lfi1->lim, lfi1->hev_thr);
         } else if (mask_8x8_0 & 1) {
           vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
                                           lfi0->hev_thr, 1);
@@ -387,11 +375,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
-          vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
-                                        lfi0->hev_thr, 1);
-          vp9_loop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,
-                                        lfi1->lim, lfi1->hev_thr, 1);
+          vp9_loop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                        lfi0->hev_thr, lfi1->mblim,
+                                        lfi1->lim, lfi1->hev_thr);
         } else if (mask_4x4_0 & 1) {
           vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
                                         lfi0->hev_thr, 1);
@@ -403,11 +389,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
 
       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
-          vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                        lfi0->hev_thr, 1);
-          vp9_loop_filter_vertical_edge(s + 8 *pitch + 4, pitch, lfi1->mblim,
-                                        lfi1->lim, lfi1->hev_thr, 1);
+          vp9_loop_filter_vertical_edge_16(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                        lfi0->hev_thr, lfi1->mblim,
+                                        lfi1->lim, lfi1->hev_thr);
         } else if (mask_4x4_int_0 & 1) {
           vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                         lfi0->hev_thr, 1);
@@ -448,14 +432,10 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
     count = 1;
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        if ((mask_16x16 & 3) == 3) {
-          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 2);
-          count = 2;
-        } else {
-          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 1);
-        }
+        // If (mask_16x16 & 1) is 1, then (mask_16x16 & 3) is 3.
+        vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+                                     lfi->hev_thr, 2);
+        count = 2;
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index 9edf8701f..ef8de2010 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -169,6 +169,34 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
   }
 }
 
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch,
+                                        const uint8_t *blimit0,
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1) {
+  int i, j;
+  const uint8_t *blimit = blimit0;
+  const uint8_t *limit = limit0;
+  const uint8_t *thresh = thresh0;
+
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 8; ++j) {
+      const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+      const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+      const int8_t mask = filter_mask(*limit, *blimit,
+                                      p3, p2, p1, p0, q0, q1, q2, q3);
+      const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+      filter4(mask, hev, s - 2, s - 1, s, s + 1);
+      s += pitch;
+    }
+    blimit = blimit1;
+    limit = limit1;
+    thresh = thresh1;
+  }
+}
+
 static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat,
                            uint8_t *op3, uint8_t *op2,
                            uint8_t *op1, uint8_t *op0,
@@ -264,6 +292,36 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
   }
 }
 
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch,
+                                          const uint8_t *blimit0,
+                                          const uint8_t *limit0,
+                                          const uint8_t *thresh0,
+                                          const uint8_t *blimit1,
+                                          const uint8_t *limit1,
+                                          const uint8_t *thresh1) {
+  int i, j;
+  const uint8_t *blimit = blimit0;
+  const uint8_t *limit = limit0;
+  const uint8_t *thresh = thresh0;
+
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 8; ++j) {
+      const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+      const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+      const int8_t mask = filter_mask(*limit, *blimit,
+                                      p3, p2, p1, p0, q0, q1, q2, q3);
+      const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1);
+      const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+      filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
+                               s,     s + 1, s + 2, s + 3);
+      s += pitch;
+    }
+    blimit = blimit1;
+    limit = limit1;
+    thresh = thresh1;
+  }
+}
+
 static INLINE void filter16(int8_t mask, uint8_t hev,
                             uint8_t flat, uint8_t flat2,
                             uint8_t *op7, uint8_t *op6,
@@ -366,3 +424,26 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
     s += p;
   }
 }
+
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int p,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh) {
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                    q0, s[4], s[5], s[6], s[7]);
+
+    filter16(mask, hev, flat, flat2,
+             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
+    s += p;
+  }
+}
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index e18e757c1..627ea31ed 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -193,12 +193,21 @@ specialize vp9_dc_128_predictor_32x32
 prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
 specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2
 
+prototype void vp9_mb_lpf_vertical_edge_w_16 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
+specialize vp9_mb_lpf_vertical_edge_w_16 sse2 neon dspr2
+
 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2
 
+prototype void vp9_mbloop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_mbloop_filter_vertical_edge_16 sse2 neon dspr2
+
 prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_loop_filter_vertical_edge mmx neon dspr2
 
+prototype void vp9_loop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_loop_filter_vertical_edge_16 sse2 neon dspr2
+
 prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2
 
@@ -206,13 +215,13 @@ prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const u
 specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2
 
 prototype void vp9_mbloop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon
+specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon dspr2
 
 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_loop_filter_horizontal_edge mmx neon dspr2
 
 prototype void vp9_loop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_loop_filter_horizontal_edge_16 sse2 neon
+specialize vp9_loop_filter_horizontal_edge_16 sse2 neon dspr2
 
 #
 # post proc
@@ -698,31 +707,31 @@ fi
 
 # fdct functions
 prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht4x4 sse2
+specialize vp9_short_fht4x4 sse2 avx2
 
 prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht8x8 sse2
+specialize vp9_short_fht8x8 sse2 avx2
 
 prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht16x16 sse2
+specialize vp9_short_fht16x16 sse2 avx2
 
 prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
 specialize vp9_fwht4x4
 
 prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct4x4 sse2
+specialize vp9_fdct4x4 sse2 avx2
 
 prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct8x8 sse2
+specialize vp9_fdct8x8 sse2 avx2
 
 prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct16x16 sse2
+specialize vp9_fdct16x16 sse2 avx2
 
 prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32 sse2
+specialize vp9_fdct32x32 sse2 avx2
 
 prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32_rd sse2
+specialize vp9_fdct32x32_rd sse2 avx2
 
 #
 # Motion search
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 925f74d19..3ca55cfc3 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <emmintrin.h>  /* SSE2 */
+#include <emmintrin.h>  // SSE2
 #include "vp9/common/vp9_loopfilter.h"
 #include "vpx_ports/emmintrin_compat.h"
 
@@ -99,7 +99,7 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -110,11 +110,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
     filter2 = _mm_unpacklo_epi8(zero, filter2);
     filter2 = _mm_srai_epi16(filter2, 0xB);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
     qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi16(filter1, t1);
     filt = _mm_srai_epi16(filt, 1);
     filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
@@ -473,13 +473,13 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter1);
     filter1 = _mm_srli_epi16(filter1, 3);
     work_a = _mm_and_si128(work_a, te0);
@@ -487,7 +487,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     filter1 = _mm_or_si128(filter1, work_a);
     qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
     filter2 = _mm_srli_epi16(filter2, 3);
     work_a = _mm_and_si128(work_a, te0);
@@ -495,7 +495,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     filter2 = _mm_or_si128(filter2, work_a);
     ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     work_a = _mm_cmpgt_epi8(zero, filt);
     filt = _mm_srli_epi16(filt, 1);
@@ -1014,23 +1014,23 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     filter1 = _mm_unpacklo_epi8(zero, filter1);
     filter1 = _mm_srai_epi16(filter1, 11);
     filter1 = _mm_packs_epi16(filter1, filter1);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     filter2 = _mm_unpacklo_epi8(zero, filter2);
     filter2 = _mm_srai_epi16(filter2, 11);
     filter2 = _mm_packs_epi16(filter2, zero);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     filt = _mm_unpacklo_epi8(zero, filt);
     filt = _mm_srai_epi16(filt, 9);
@@ -1083,7 +1083,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
   }
 }
 
-void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */,
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p,
                                                const uint8_t *_blimit0,
                                                const uint8_t *_limit0,
                                                const uint8_t *_thresh0,
@@ -1255,27 +1255,27 @@ void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter1);
     filter1 = _mm_srli_epi16(filter1, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter1 = _mm_and_si128(filter1, t1f);
     filter1 = _mm_or_si128(filter1, work_a);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
     filter2 = _mm_srli_epi16(filter2, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter2 = _mm_and_si128(filter2, t1f);
     filter2 = _mm_or_si128(filter2, work_a);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     work_a = _mm_cmpgt_epi8(zero, filt);
     filt = _mm_srli_epi16(filt, 1);
@@ -1427,27 +1427,27 @@ void vp9_loop_filter_horizontal_edge_16_sse2(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter1);
     filter1 = _mm_srli_epi16(filter1, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter1 = _mm_and_si128(filter1, t1f);
     filter1 = _mm_or_si128(filter1, work_a);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
     filter2 = _mm_srli_epi16(filter2, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter2 = _mm_and_si128(filter2, t1f);
     filter2 = _mm_or_si128(filter2, work_a);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     work_a = _mm_cmpgt_epi8(zero, filt);
     filt = _mm_srli_epi16(filt, 1);
@@ -1474,7 +1474,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
 
-  /* Read in 16 lines */
+  // Read in 16 lines
   x0 = _mm_loadl_epi64((__m128i *)in0);
   x8 = _mm_loadl_epi64((__m128i *)in1);
   x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
@@ -1512,7 +1512,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
   x14 = _mm_unpacklo_epi32(x12, x13);
   x15 = _mm_unpackhi_epi32(x12, x13);
 
-  /* Store first 4-line result */
+  // Store first 4-line result
   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1528,7 +1528,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
   x14 = _mm_unpacklo_epi32(x12, x13);
   x15 = _mm_unpackhi_epi32(x12, x13);
 
-  /* Store second 4-line result */
+  // Store second 4-line result
   _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1598,61 +1598,129 @@ static INLINE void transpose(unsigned char *src[], int in_p,
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
-                                          int p,
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int p,
+                                           const uint8_t *blimit0,
+                                           const uint8_t *limit0,
+                                           const uint8_t *thresh0,
+                                           const uint8_t *blimit1,
+                                           const uint8_t *limit1,
+                                           const uint8_t *thresh1) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
+  unsigned char *src[2];
+  unsigned char *dst[2];
+
+  // Transpose 8x16
+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vp9_loop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                          thresh0, blimit1, limit1, thresh1);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  transpose(src, 16, dst, p, 2);
+}
+
+void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, int p,
                                           const unsigned char *blimit,
                                           const unsigned char *limit,
                                           const unsigned char *thresh,
                                           int count) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
+  unsigned char *src[1];
+  unsigned char *dst[1];
+  (void)count;
+
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 4 * 8, 8, blimit, limit,
+                                         thresh, 1);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  transpose(src, 8, dst, p, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int p,
+                                             const uint8_t *blimit0,
+                                             const uint8_t *limit0,
+                                             const uint8_t *thresh0,
+                                             const uint8_t *blimit1,
+                                             const uint8_t *limit1,
+                                             const uint8_t *thresh1) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
   unsigned char *src[2];
   unsigned char *dst[2];
 
-  (void)count;
-  /* Transpose 16x16 */
-  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
-  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
+  // Transpose 8x16
+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
-  /* Loop filtering */
-  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                         thresh, 1);
-  src[0] = t_dst + 3 * 16;
-  src[1] = t_dst + 3 * 16 + 8;
+  // Loop filtering
+  vp9_mbloop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                            thresh0, blimit1, limit1, thresh1);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
 
-  dst[0] = s - 5;
-  dst[1] = s - 5 + p * 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
 
-  /* Transpose 16x8 */
+  // Transpose back
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
-                                     int p,
+void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, int p,
                                      const unsigned char *blimit,
                                      const unsigned char *limit,
                                      const unsigned char *thresh) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
-  unsigned char *src[4];
-  unsigned char *dst[4];
-
-  dst[0] = t_dst;
-  dst[1] = t_dst + 8 * 16;
+  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
+  unsigned char *src[2];
+  unsigned char *dst[2];
 
   src[0] = s - 8;
-  src[1] = s - 8 + 8;
+  src[1] = s;
+  dst[0] = t_dst;
+  dst[1] = t_dst + 8 * 8;
 
-  /* Transpose 16x16 */
-  transpose(src, p, dst, 16, 2);
+  // Transpose 16x8
+  transpose(src, p, dst, 8, 2);
 
-  /* Loop filtering */
-  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                    thresh, 1);
+  // Loop filtering
+  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
 
   src[0] = t_dst;
-  src[1] = t_dst + 8 * 16;
-
+  src[1] = t_dst + 8 * 8;
   dst[0] = s - 8;
-  dst[1] = s - 8 + 8;
+  dst[1] = s;
 
-  transpose(src, 16, dst, p, 2);
+  // Transpose back
+  transpose(src, 8, dst, p, 2);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_sse2(unsigned char *s, int p,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+
+  // Transpose 16x16
+  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+  // Loop filtering
+  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+                                   thresh);
+
+  // Transpose back
+  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
 }