22 files changed, 704 insertions, 1981 deletions
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 3ab4cc3a9..cb546e74b 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -86,9 +86,7 @@ typedef enum
   BILINEAR = 1,
   EIGHTTAP = 2,
   EIGHTTAP_SHARP = 3,
-#if CONFIG_SWITCHABLE_INTERP
   SWITCHABLE  /* should be the last one */
-#endif
 } INTERPOLATIONFILTERTYPE;
 
 typedef enum
@@ -135,14 +133,12 @@ typedef enum {
   TX_SIZE_MAX                  // Number of different transforms available
 } TX_SIZE;
 
-#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
 typedef enum {
   DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
-  ADST_DCT  = 1,                      // ADST in horizontal, DCT in vertical
-  DCT_ADST  = 2,                      // DCT  in horizontal, ADST in vertical
+  ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
+  DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
   ADST_ADST = 3                       // ADST in both directions
 } TX_TYPE;
-#endif
 
 #define VP8_YMODES  (B_PRED + 1)
 #define VP8_UV_MODES (TM_PRED + 1)
@@ -177,6 +173,14 @@ typedef enum {
 #define VP8_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
 #define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
 
+typedef enum {                /* partitioning modes; names presumably give the sub-block size (WxH) - confirm */
+  PARTITIONING_16X8 = 0,      /* the numeric values are what vp8_mbsplit_tree stores (negated) as leaves */
+  PARTITIONING_8X16,          /* = 1 */
+  PARTITIONING_8X8,           /* = 2 */
+  PARTITIONING_4X4,           /* = 3 */
+  NB_PARTITIONINGS,           /* = 4: count of the modes above; must remain the last enumerator */
+} SPLITMV_PARTITIONING_TYPE;  /* used as the type of the `partitioning` struct member in this header */
+
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
@@ -184,9 +188,7 @@ typedef enum {
 union b_mode_info {
   struct {
     B_PREDICTION_MODE first;
-#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
     TX_TYPE           tx_type;
-#endif
 
 #if CONFIG_COMP_INTRA_PRED
     B_PREDICTION_MODE second;
@@ -220,7 +222,7 @@ typedef struct {
   int mv_ref_index[MAX_REF_FRAMES];
 #endif
 
-  unsigned char partitioning;
+  SPLITMV_PARTITIONING_TYPE partitioning;
   unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
   unsigned char need_to_clamp_mvs;
   unsigned char need_to_clamp_secondmv;
@@ -239,9 +241,7 @@ typedef struct {
   // Flag to turn prediction signal filter on(1)/off(0 ) at the MB level
   unsigned int pred_filter_enabled;
 #endif
-#if CONFIG_SWITCHABLE_INTERP
     INTERPOLATIONFILTERTYPE interp_filter;
-#endif
 
 #if CONFIG_SUPERBLOCKS
   // FIXME need a SB array of 4 MB_MODE_INFOs that
@@ -388,17 +388,11 @@ typedef struct MacroBlockD {
 
 } MACROBLOCKD;
 
-#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
 #define ACTIVE_HT 110                // quantization stepsize threshold
-#endif
 
-#if CONFIG_HYBRIDTRANSFORM8X8
 #define ACTIVE_HT8 300
-#endif
 
-#if CONFIG_HYBRIDTRANSFORM16X16
 #define ACTIVE_HT16 300
-#endif
 
 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE
 static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
@@ -442,7 +436,6 @@ static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
   return b_mode;
 }
 
-#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16
 // transform mapping
 static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
   // map transform type
@@ -470,9 +463,7 @@ static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
   }
   return tx_type;
 }
-#endif
 
-#if CONFIG_HYBRIDTRANSFORM
 static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
   TX_TYPE tx_type = DCT_DCT;
   if (xd->mode_info_context->mbmi.mode == B_PRED &&
@@ -481,9 +472,7 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
   }
   return tx_type;
 }
-#endif
 
-#if CONFIG_HYBRIDTRANSFORM8X8
 static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
   TX_TYPE tx_type = DCT_DCT;
   if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
@@ -492,9 +481,7 @@ static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
   }
   return tx_type;
 }
-#endif
 
-#if CONFIG_HYBRIDTRANSFORM16X16
 static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
   TX_TYPE tx_type = DCT_DCT;
   if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
@@ -503,34 +490,24 @@ static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
   }
   return tx_type;
 }
-#endif
 
-#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || \
-    CONFIG_HYBRIDTRANSFORM16X16
 static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
   TX_TYPE tx_type = DCT_DCT;
   int ib = (b - xd->block);
   if (ib >= 16)
     return tx_type;
-#if CONFIG_HYBRIDTRANSFORM16X16
   if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
     tx_type = get_tx_type_16x16(xd, b);
   }
-#endif
-#if CONFIG_HYBRIDTRANSFORM8X8
   if (xd->mode_info_context->mbmi.txfm_size  == TX_8X8) {
     ib = (ib & 8) + ((ib & 4) >> 1);
     tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
   }
-#endif
-#if CONFIG_HYBRIDTRANSFORM
   if (xd->mode_info_context->mbmi.txfm_size  == TX_4X4) {
     tx_type = get_tx_type_4x4(xd, b);
   }
-#endif
   return tx_type;
 }
-#endif
 
 extern void vp8_build_block_doffsets(MACROBLOCKD *xd);
 extern void vp8_setup_block_dptrs(MACROBLOCKD *xd);
diff --git a/vp8/common/default_coef_probs.h b/vp8/common/default_coef_probs.h
index 5e21195ee..bd1f795d0 100644
--- a/vp8/common/default_coef_probs.h
+++ b/vp8/common/default_coef_probs.h
@@ -13,9 +13,9 @@
 
 
 static const vp8_prob default_coef_probs [BLOCK_TYPES]
-[COEF_BANDS]
-[PREV_COEF_CONTEXTS]
-[ENTROPY_NODES] = {
+                                         [COEF_BANDS]
+                                         [PREV_COEF_CONTEXTS]
+                                         [ENTROPY_NODES] = {
   {
     /* Block Type ( 0 ) */
     {
@@ -254,11 +254,10 @@ static const vp8_prob default_coef_probs [BLOCK_TYPES]
   }
 };
 
-#if CONFIG_HYBRIDTRANSFORM
 static const vp8_prob default_hybrid_coef_probs [BLOCK_TYPES]
-[COEF_BANDS]
-[PREV_COEF_CONTEXTS]
-[ENTROPY_NODES] = {
+                                                [COEF_BANDS]
+                                                [PREV_COEF_CONTEXTS]
+                                                [ENTROPY_NODES] = {
   {
     /* Block Type ( 0 ) */
     {
@@ -496,7 +495,6 @@ static const vp8_prob default_hybrid_coef_probs [BLOCK_TYPES]
     }
   }
 };
-#endif
 
 static const vp8_prob
 default_coef_probs_8x8[BLOCK_TYPES_8X8]
@@ -731,12 +729,11 @@ default_coef_probs_8x8[BLOCK_TYPES_8X8]
   }
 };
 
-#if CONFIG_HYBRIDTRANSFORM8X8
 static const vp8_prob
 default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
-                                 [COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS]
-                                 [ENTROPY_NODES] = {
+                             [COEF_BANDS]
+                             [PREV_COEF_CONTEXTS]
+                             [ENTROPY_NODES] = {
   {
     /* block Type 0 */
     {
@@ -964,7 +961,6 @@ default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
     }
   }
 };
-#endif
 
 static const vp8_prob
   default_coef_probs_16x16[BLOCK_TYPES_16X16]
@@ -1173,7 +1169,6 @@ static const vp8_prob
   }
 };
 
-#if CONFIG_HYBRIDTRANSFORM16X16
 static const vp8_prob
   default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
                                  [COEF_BANDS]
@@ -1380,4 +1375,3 @@ static const vp8_prob
     }
   }
 };
-#endif
diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c
index 90f7a52c2..a3f731a3c 100644
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -64,8 +64,6 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = {
   7, 11, 14, 15,
 };
 
-
-#if CONFIG_HYBRIDTRANSFORM
 DECLARE_ALIGNED(16, const int, vp8_col_scan[16]) = {
   0, 4,  8, 12,
   1, 5,  9, 13,
@@ -78,7 +76,6 @@ DECLARE_ALIGNED(16, const int, vp8_row_scan[16]) = {
   8,   9, 10, 11,
   12, 13, 14, 15
 };
-#endif
 
 
 DECLARE_ALIGNED(64, const int, vp8_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5,
@@ -208,25 +205,19 @@ vp8_extra_bit_struct vp8_extra_bits[12] = {
 void vp8_default_coef_probs(VP8_COMMON *pc) {
   vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
              sizeof(pc->fc.coef_probs));
-#if CONFIG_HYBRIDTRANSFORM
   vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs,
              sizeof(pc->fc.hybrid_coef_probs));
-#endif
 
   vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
              sizeof(pc->fc.coef_probs_8x8));
-#if CONFIG_HYBRIDTRANSFORM8X8
   vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8,
              sizeof(pc->fc.hybrid_coef_probs_8x8));
-#endif
 
   vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
              sizeof(pc->fc.coef_probs_16x16));
-#if CONFIG_HYBRIDTRANSFORM16X16
   vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
              default_hybrid_coef_probs_16x16,
              sizeof(pc->fc.hybrid_coef_probs_16x16));
-#endif
 }
 
 void vp8_coef_tree_initialize() {
@@ -344,7 +335,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) {
         }
       }
 
-#if CONFIG_HYBRIDTRANSFORM
   for (i = 0; i < BLOCK_TYPES; ++i)
     for (j = 0; j < COEF_BANDS; ++j)
       for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
@@ -366,7 +356,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) {
           else cm->fc.hybrid_coef_probs[i][j][k][t] = prob;
         }
       }
-#endif
 
   for (i = 0; i < BLOCK_TYPES_8X8; ++i)
     for (j = 0; j < COEF_BANDS; ++j)
@@ -390,7 +379,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) {
         }
       }
 
-#if CONFIG_HYBRIDTRANSFORM8X8
   for (i = 0; i < BLOCK_TYPES_8X8; ++i)
     for (j = 0; j < COEF_BANDS; ++j)
       for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
@@ -413,7 +401,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) {
           else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob;
         }
       }
-#endif
 
   for (i = 0; i < BLOCK_TYPES_16X16; ++i)
     for (j = 0; j < COEF_BANDS; ++j)
@@ -437,7 +424,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) {
         }
       }
 
-#if CONFIG_HYBRIDTRANSFORM16X16
   for (i = 0; i < BLOCK_TYPES_16X16; ++i)
     for (j = 0; j < COEF_BANDS; ++j)
       for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
@@ -458,5 +444,4 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) {
           else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
         }
       }
-#endif
 }
diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h
index b9dfb344f..48a100ac6 100644
--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -104,10 +104,8 @@ struct VP8Common;
 void vp8_default_coef_probs(struct VP8Common *);
 extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
 
-#if CONFIG_HYBRIDTRANSFORM
 extern DECLARE_ALIGNED(16, const int, vp8_col_scan[16]);
 extern DECLARE_ALIGNED(16, const int, vp8_row_scan[16]);
-#endif
 
 extern short vp8_default_zig_zag_mask[16];
 extern DECLARE_ALIGNED(64, const int, vp8_default_zig_zag1d_8x8[64]);
diff --git a/vp8/common/entropymode.c b/vp8/common/entropymode.c
index 5627aa43a..bcd9f3707 100644
--- a/vp8/common/entropymode.c
+++ b/vp8/common/entropymode.c
@@ -215,9 +215,9 @@ const vp8_tree_index vp8_uv_mode_tree[VP8_UV_MODES * 2 - 2] = {
 };
 
 const vp8_tree_index vp8_mbsplit_tree[6] = {
-  -3, 2,
-  -2, 4,
-  -0, -1
+  -PARTITIONING_4X4,   2,                  /* PARTITIONING_4X4 == 3, so this is the old -3 */
+  -PARTITIONING_8X8,   4,                  /* PARTITIONING_8X8 == 2, so this is the old -2 */
+  -PARTITIONING_16X8, -PARTITIONING_8X16,  /* old -0, -1; PARTITIONING_16X8 == 0, so its entry is plain 0 */
 };
 
 const vp8_tree_index vp8_mv_ref_tree[8] = {
@@ -301,11 +301,8 @@ void vp8_init_mbmode_probs(VP8_COMMON *x) {
 
   vpx_memcpy(x->fc.sub_mv_ref_prob, vp8_sub_mv_ref_prob2, sizeof(vp8_sub_mv_ref_prob2));
   vpx_memcpy(x->fc.mbsplit_prob, vp8_mbsplit_probs, sizeof(vp8_mbsplit_probs));
-#if CONFIG_SWITCHABLE_INTERP
   vpx_memcpy(x->fc.switchable_interp_prob, vp8_switchable_interp_prob,
              sizeof(vp8_switchable_interp_prob));
-#endif
-
 }
 
 
@@ -338,7 +335,6 @@ void vp8_kf_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES] [VP8_BINTRAMODES] [
   } while (++i < VP8_BINTRAMODES);
 }
 
-#if CONFIG_SWITCHABLE_INTERP
 #if VP8_SWITCHABLE_FILTERS == 3
 const vp8_tree_index vp8_switchable_interp_tree[VP8_SWITCHABLE_FILTERS*2-2] = {
   -0, 2,
@@ -363,19 +359,10 @@ const vp8_prob vp8_switchable_interp_prob [VP8_SWITCHABLE_FILTERS+1]
   { 64},
   {192},
 };
-//#define SWITCHABLE_86
-#ifdef SWITCHABLE_86
-const INTERPOLATIONFILTERTYPE vp8_switchable_interp[VP8_SWITCHABLE_FILTERS] = {
-  EIGHTTAP, SIXTAP};
-const int vp8_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, -1, -1}; //8, 6
-#else
 const INTERPOLATIONFILTERTYPE vp8_switchable_interp[VP8_SWITCHABLE_FILTERS] = {
   EIGHTTAP, EIGHTTAP_SHARP};
 const int vp8_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; //8, 8s
 #endif
-#endif
-#endif
-
 
 void vp8_entropy_mode_init() {
   vp8_tokens_from_tree(vp8_bmode_encodings,   vp8_bmode_tree);
@@ -387,10 +374,8 @@ void vp8_entropy_mode_init() {
   vp8_tokens_from_tree(vp8_uv_mode_encodings,  vp8_uv_mode_tree);
   vp8_tokens_from_tree(vp8_i8x8_mode_encodings,  vp8_i8x8_mode_tree);
   vp8_tokens_from_tree(vp8_mbsplit_encodings, vp8_mbsplit_tree);
-#if CONFIG_SWITCHABLE_INTERP
   vp8_tokens_from_tree(vp8_switchable_interp_encodings,
                        vp8_switchable_interp_tree);
-#endif
 
   vp8_tokens_from_tree_offset(vp8_mv_ref_encoding_array,
                               vp8_mv_ref_tree, NEARESTMV);
diff --git a/vp8/common/entropymode.h b/vp8/common/entropymode.h
index 430c949a6..debb5659e 100644
--- a/vp8/common/entropymode.h
+++ b/vp8/common/entropymode.h
@@ -76,16 +76,14 @@ void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES
 
 void vp8_adapt_mode_probs(struct VP8Common *);
 
-#if CONFIG_SWITCHABLE_INTERP
 #define VP8_SWITCHABLE_FILTERS 2 /* number of switchable filters */
 extern const  INTERPOLATIONFILTERTYPE vp8_switchable_interp
                   [VP8_SWITCHABLE_FILTERS];
-extern const  int vp8_switchable_interp_map[SWITCHABLE+1];
+extern const  int vp8_switchable_interp_map[SWITCHABLE + 1];
 extern const  vp8_tree_index vp8_switchable_interp_tree
-                  [2*(VP8_SWITCHABLE_FILTERS-1)];
+                  [2*(VP8_SWITCHABLE_FILTERS - 1)];
 extern struct vp8_token_struct vp8_switchable_interp_encodings
                   [VP8_SWITCHABLE_FILTERS];
 extern const  vp8_prob vp8_switchable_interp_prob
-                  [VP8_SWITCHABLE_FILTERS+1][VP8_SWITCHABLE_FILTERS-1];
-#endif
+                  [VP8_SWITCHABLE_FILTERS + 1][VP8_SWITCHABLE_FILTERS - 1];
 #endif
diff --git a/vp8/common/entropymv.c b/vp8/common/entropymv.c
index 6c31236ec..a442a2438 100644
--- a/vp8/common/entropymv.c
+++ b/vp8/common/entropymv.c
@@ -14,8 +14,6 @@
 
 //#define MV_COUNT_TESTING
 
-#if CONFIG_NEWMVENTROPY
-
 #define MV_COUNT_SAT 16
 #define MV_MAX_UPDATE_FACTOR 160
 
@@ -450,413 +448,13 @@ void vp8_adapt_nmv_probs(VP8_COMMON *cm, int usehp) {
   }
 }
 
-#else   /* CONFIG_NEWMVENTROPY */
-
-#define MV_COUNT_SAT 16
-#define MV_MAX_UPDATE_FACTOR 128
-
-const MV_CONTEXT_HP vp8_mv_update_probs_hp[2] = {
-  {{
-      237,
-      246,
-      253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-      254, 254, 254, 254, 254, 250, 250, 252, 254, 254, 254
-    }
-  },
-  {{
-      231,
-      243,
-      245, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-      254, 254, 254, 254, 254, 251, 251, 254, 254, 254, 254
-    }
-  }
-};
-const MV_CONTEXT_HP vp8_default_mv_context_hp[2] = {
-  {{
-      /* row */
-      162,                                        /* is short */
-      128,                                        /* sign */
-      220, 204, 180, 192, 192, 119, 192, 192, 180, 140, 192, 192, 224, 224, 224, /* short tree */
-      128, 129, 132,  75, 145, 178, 206, 239, 254, 254, 254 /* long bits */
-    }
-  },
-  {{
-      /* same for column */
-      164,                                        /* is short */
-      128,
-      220, 204, 180, 192, 192, 119, 192, 192, 180, 140, 192, 192, 224, 224, 224, /* short tree */
-      128, 130, 130,  74, 148, 180, 203, 236, 254, 254, 254 /* long bits */
-    }
-  }
-};
-
-const MV_CONTEXT vp8_mv_update_probs[2] = {
-  {{
-      237,
-      246,
-      253, 253, 254, 254, 254, 254, 254,
-      254, 254, 254, 254, 254, 250, 250, 252, 254, 254
-    }
-  },
-  {{
-      231,
-      243,
-      245, 253, 254, 254, 254, 254, 254,
-      254, 254, 254, 254, 254, 251, 251, 254, 254, 254
-    }
-  }
-};
-const MV_CONTEXT vp8_default_mv_context[2] = {
-  {{
-      /* row */
-      162,                                        /* is short */
-      128,                                        /* sign */
-      225, 146, 172, 147, 214,  39, 156,          /* short tree */
-      128, 129, 132,  75, 145, 178, 206, 239, 254, 254 /* long bits */
-    }
-  },
-  {{
-      /* same for column */
-      164,                                        /* is short */
-      128,
-      204, 170, 119, 235, 140, 230, 228,
-      128, 130, 130,  74, 148, 180, 203, 236, 254, 254 /* long bits */
-    }
-  }
-};
-
-const vp8_tree_index vp8_small_mvtree_hp [30] = {
-  2,  16,
-  4,  10,
-  6,   8,
-  -0,  -1,
-  -2,  -3,
-  12,  14,
-  -4,  -5,
-  -6,  -7,
-  18,  24,
-  20,  22,
-  -8,  -9,
-  -10, -11,
-  26,  28,
-  -12, -13,
-  -14, -15
-};
-struct vp8_token_struct vp8_small_mvencodings_hp [16];
-
-const vp8_tree_index vp8_small_mvtree [14] = {
-  2, 8,
-  4, 6,
-  -0, -1,
-  -2, -3,
-  10, 12,
-  -4, -5,
-  -6, -7
-};
-struct vp8_token_struct vp8_small_mvencodings [8];
-
-__inline static void calc_prob(vp8_prob *p, const unsigned int ct[2], int pbits) {
-  const unsigned int tot = ct[0] + ct[1];
-  if (tot) {
-    const vp8_prob x = ((ct[0] * 255) / tot) & -(1 << (8 - pbits));
-    *p = x ? x : 1;
-  } else {
-    *p = 128;
-  }
-}
-
-static void compute_component_probs(
-  const unsigned int events [MVvals],
-  vp8_prob Pnew [MVPcount],
-  unsigned int is_short_ct[2],
-  unsigned int sign_ct[2],
-  unsigned int bit_ct [mvlong_width] [2],
-  unsigned int short_ct  [mvnum_short],
-  unsigned int short_bct [mvnum_short - 1] [2]
-) {
-  is_short_ct[0] = is_short_ct[1] = 0;
-  sign_ct[0] = sign_ct[1] = 0;
-  vpx_memset(bit_ct, 0, sizeof(unsigned int)*mvlong_width * 2);
-  vpx_memset(short_ct, 0, sizeof(unsigned int)*mvnum_short);
-  vpx_memset(short_bct, 0, sizeof(unsigned int) * (mvnum_short - 1) * 2);
-
-  {
-    const int c = events [mv_max];
-    is_short_ct [0] += c;    // Short vector
-    short_ct [0] += c;       // Magnitude distribution
-  }
-  {
-    int j = 1;
-    do {
-      const int c1 = events [mv_max + j];  // positive
-      const int c2 = events [mv_max - j];  // negative
-      const int c  = c1 + c2;
-      int a = j;
-
-      sign_ct [0] += c1;
-      sign_ct [1] += c2;
-
-      if (a < mvnum_short) {
-        is_short_ct [0] += c;     // Short vector
-        short_ct [a] += c;       // Magnitude distribution
-      } else {
-        int k = mvlong_width - 1;
-        is_short_ct [1] += c;     // Long vector
-
-        do
-          bit_ct [k] [(a >> k) & 1] += c;
-
-        while (--k >= 0);
-      }
-    } while (++j <= mv_max);
-  }
-  calc_prob(Pnew + mvpis_short, is_short_ct, 8);
-
-  calc_prob(Pnew + MVPsign, sign_ct, 8);
-
-  {
-    vp8_prob p [mvnum_short - 1];    /* actually only need branch ct */
-    int j = 0;
-
-    vp8_tree_probs_from_distribution(
-      mvnum_short, vp8_small_mvencodings, vp8_small_mvtree,
-      p, short_bct, short_ct,
-      256, 1
-    );
-
-    do
-      calc_prob(Pnew + MVPshort + j, short_bct[j], 8);
-    while (++j < mvnum_short - 1);
-  }
-
-  {
-    int j = 0;
-    do
-      calc_prob(Pnew + MVPbits + j, bit_ct[j], 8);
-    while (++j < mvlong_width);
-  }
-}
-
-static void compute_component_probs_hp(
-  const unsigned int events [MVvals_hp],
-  vp8_prob Pnew [MVPcount_hp],
-  unsigned int is_short_ct[2],
-  unsigned int sign_ct[2],
-  unsigned int bit_ct [mvlong_width_hp] [2],
-  unsigned int short_ct  [mvnum_short_hp],
-  unsigned int short_bct [mvnum_short_hp - 1] [2]
-) {
-  is_short_ct[0] = is_short_ct[1] = 0;
-  sign_ct[0] = sign_ct[1] = 0;
-  vpx_memset(bit_ct, 0, sizeof(unsigned int)*mvlong_width_hp * 2);
-  vpx_memset(short_ct, 0, sizeof(unsigned int)*mvnum_short_hp);
-  vpx_memset(short_bct, 0, sizeof(unsigned int) * (mvnum_short_hp - 1) * 2);
-
-  {
-    const int c = events [mv_max_hp];
-    is_short_ct [0] += c;    // Short vector
-    short_ct [0] += c;       // Magnitude distribution
-  }
-  {
-    int j = 1;
-    do {
-      const int c1 = events [mv_max_hp + j];  // positive
-      const int c2 = events [mv_max_hp - j];  // negative
-      const int c  = c1 + c2;
-      int a = j;
-
-      sign_ct [0] += c1;
-      sign_ct [1] += c2;
-
-      if (a < mvnum_short_hp) {
-        is_short_ct [0] += c;     // Short vector
-        short_ct [a] += c;       // Magnitude distribution
-      } else {
-        int k = mvlong_width_hp - 1;
-        is_short_ct [1] += c;     // Long vector
-
-        do
-          bit_ct [k] [(a >> k) & 1] += c;
-
-        while (--k >= 0);
-      }
-    } while (++j <= mv_max_hp);
-  }
-  calc_prob(Pnew + mvpis_short_hp, is_short_ct, 8);
-
-  calc_prob(Pnew + MVPsign_hp, sign_ct, 8);
-
-  {
-    vp8_prob p [mvnum_short_hp - 1];    /* actually only need branch ct */
-    int j = 0;
-
-    vp8_tree_probs_from_distribution(
-      mvnum_short_hp, vp8_small_mvencodings_hp, vp8_small_mvtree_hp,
-      p, short_bct, short_ct,
-      256, 1
-    );
-
-    do
-      calc_prob(Pnew + MVPshort_hp + j, short_bct[j], 8);
-    while (++j < mvnum_short_hp - 1);
-  }
-
-  {
-    int j = 0;
-    do
-      calc_prob(Pnew + MVPbits_hp + j, bit_ct[j], 8);
-    while (++j < mvlong_width_hp);
-  }
-}
-
-void vp8_adapt_mv_probs(VP8_COMMON *cm) {
-  int i, t, count, factor;
-#ifdef MV_COUNT_TESTING
-  printf("static const unsigned int\nMVcount[2][MVvals]={\n");
-  for (i = 0; i < 2; ++i) {
-    printf("  { ");
-    for (t = 0; t < MVvals; t++) {
-      printf("%d, ", cm->fc.MVcount[i][t]);
-      if (t % 16 == 15 && t != MVvals - 1) printf("\n    ");
-    }
-    printf("},\n");
-  }
-  printf("};\n");
-  printf("static const unsigned int\nMVcount_hp[2][MVvals_hp]={\n");
-  for (i = 0; i < 2; ++i) {
-    printf("  { ");
-    for (t = 0; t < MVvals_hp; t++) {
-      printf("%d, ", cm->fc.MVcount_hp[i][t]);
-      if (t % 16 == 15 && t != MVvals_hp - 1) printf("\n    ");
-    }
-    printf("},\n");
-  }
-  printf("};\n");
-#endif  /* MV_COUNT_TESTING */
-
-  for (i = 0; i < 2; ++i) {
-    int prob;
-    unsigned int is_short_ct[2];
-    unsigned int sign_ct[2];
-    unsigned int bit_ct [mvlong_width] [2];
-    unsigned int short_ct  [mvnum_short];
-    unsigned int short_bct [mvnum_short - 1] [2];
-    vp8_prob Pnew [MVPcount];
-    compute_component_probs(cm->fc.MVcount[i], Pnew,
-                            is_short_ct, sign_ct,
-                            bit_ct, short_ct, short_bct);
-    count = is_short_ct[0] + is_short_ct[1];
-    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-    factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-    prob = ((int)cm->fc.pre_mvc[i].prob[mvpis_short] * (256 - factor) +
-            (int)Pnew[mvpis_short] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.mvc[i].prob[mvpis_short] = 1;
-    else if (prob > 255) cm->fc.mvc[i].prob[mvpis_short] = 255;
-    else cm->fc.mvc[i].prob[mvpis_short] = prob;
-
-    count = sign_ct[0] + sign_ct[1];
-    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-    factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-    prob = ((int)cm->fc.pre_mvc[i].prob[MVPsign] * (256 - factor) +
-            (int)Pnew[MVPsign] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.mvc[i].prob[MVPsign] = 1;
-    else if (prob > 255) cm->fc.mvc[i].prob[MVPsign] = 255;
-    else cm->fc.mvc[i].prob[MVPsign] = prob;
-
-    for (t = 0; t < mvnum_short - 1; ++t) {
-      count = short_bct[t][0] + short_bct[t][1];
-      count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-      factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-      prob = ((int)cm->fc.pre_mvc[i].prob[MVPshort + t] * (256 - factor) +
-              (int)Pnew[MVPshort + t] * factor + 128) >> 8;
-      if (prob <= 0) cm->fc.mvc[i].prob[MVPshort + t] = 1;
-      else if (prob > 255) cm->fc.mvc[i].prob[MVPshort + t] = 255;
-      else cm->fc.mvc[i].prob[MVPshort + t] = prob;
-    }
-    for (t = 0; t < mvlong_width; ++t) {
-      count = bit_ct[t][0] + bit_ct[t][1];
-      count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-      factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-      prob = ((int)cm->fc.pre_mvc[i].prob[MVPbits + t] * (256 - factor) +
-              (int)Pnew[MVPbits + t] * factor + 128) >> 8;
-      if (prob <= 0) cm->fc.mvc[i].prob[MVPbits + t] = 1;
-      else if (prob > 255) cm->fc.mvc[i].prob[MVPbits + t] = 255;
-      else cm->fc.mvc[i].prob[MVPbits + t] = prob;
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    int prob;
-    unsigned int is_short_ct[2];
-    unsigned int sign_ct[2];
-    unsigned int bit_ct [mvlong_width_hp] [2];
-    unsigned int short_ct  [mvnum_short_hp];
-    unsigned int short_bct [mvnum_short_hp - 1] [2];
-    vp8_prob Pnew [MVPcount_hp];
-    compute_component_probs_hp(cm->fc.MVcount_hp[i], Pnew,
-                               is_short_ct, sign_ct,
-                               bit_ct, short_ct, short_bct);
-    count = is_short_ct[0] + is_short_ct[1];
-    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-    factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-    prob = ((int)cm->fc.pre_mvc_hp[i].prob[mvpis_short_hp] * (256 - factor) +
-            (int)Pnew[mvpis_short_hp] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.mvc_hp[i].prob[mvpis_short_hp] = 1;
-    else if (prob > 255) cm->fc.mvc_hp[i].prob[mvpis_short_hp] = 255;
-    else cm->fc.mvc_hp[i].prob[mvpis_short_hp] = prob;
-
-    count = sign_ct[0] + sign_ct[1];
-    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-    factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-    prob = ((int)cm->fc.pre_mvc_hp[i].prob[MVPsign_hp] * (256 - factor) +
-            (int)Pnew[MVPsign_hp] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.mvc_hp[i].prob[MVPsign_hp] = 1;
-    else if (prob > 255) cm->fc.mvc_hp[i].prob[MVPsign_hp] = 255;
-    else cm->fc.mvc_hp[i].prob[MVPsign_hp] = prob;
-
-    for (t = 0; t < mvnum_short_hp - 1; ++t) {
-      count = short_bct[t][0] + short_bct[t][1];
-      count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-      factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-      prob = ((int)cm->fc.pre_mvc_hp[i].prob[MVPshort_hp + t] * (256 - factor) +
-              (int)Pnew[MVPshort_hp + t] * factor + 128) >> 8;
-      if (prob <= 0) cm->fc.mvc_hp[i].prob[MVPshort_hp + t] = 1;
-      else if (prob > 255) cm->fc.mvc_hp[i].prob[MVPshort_hp + t] = 255;
-      else cm->fc.mvc_hp[i].prob[MVPshort_hp + t] = prob;
-    }
-    for (t = 0; t < mvlong_width_hp; ++t) {
-      count = bit_ct[t][0] + bit_ct[t][1];
-      count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-      factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-      prob = ((int)cm->fc.pre_mvc_hp[i].prob[MVPbits_hp + t] * (256 - factor) +
-              (int)Pnew[MVPbits_hp + t] * factor + 128) >> 8;
-      if (prob <= 0) cm->fc.mvc_hp[i].prob[MVPbits_hp + t] = 1;
-      else if (prob > 255) cm->fc.mvc_hp[i].prob[MVPbits_hp + t] = 255;
-      else cm->fc.mvc_hp[i].prob[MVPbits_hp + t] = prob;
-    }
-  }
-}
-
-#endif  /* CONFIG_NEWMVENTROPY */
-
 void vp8_entropy_mv_init() {
-#if CONFIG_NEWMVENTROPY
   vp8_tokens_from_tree(vp8_mv_joint_encodings, vp8_mv_joint_tree);
   vp8_tokens_from_tree(vp8_mv_class_encodings, vp8_mv_class_tree);
   vp8_tokens_from_tree(vp8_mv_class0_encodings, vp8_mv_class0_tree);
   vp8_tokens_from_tree(vp8_mv_fp_encodings, vp8_mv_fp_tree);
-#else
-  vp8_tokens_from_tree(vp8_small_mvencodings, vp8_small_mvtree);
-  vp8_tokens_from_tree(vp8_small_mvencodings_hp, vp8_small_mvtree_hp);
-#endif
 }
 
 void vp8_init_mv_probs(VP8_COMMON *cm) {
-#if CONFIG_NEWMVENTROPY
   vpx_memcpy(&cm->fc.nmvc, &vp8_default_nmv_context, sizeof(nmv_context));
-#else
-  vpx_memcpy(cm->fc.mvc,
-             vp8_default_mv_context, sizeof(vp8_default_mv_context));
-  vpx_memcpy(cm->fc.mvc_hp,
-             vp8_default_mv_context_hp, sizeof(vp8_default_mv_context_hp));
-#endif
 }
diff --git a/vp8/common/entropymv.h b/vp8/common/entropymv.h
index 1a193b172..80540a54c 100644
--- a/vp8/common/entropymv.h
+++ b/vp8/common/entropymv.h
@@ -22,7 +22,6 @@ void vp8_entropy_mv_init();
 void vp8_init_mv_probs(struct VP8Common *cm);
 void vp8_adapt_mv_probs(struct VP8Common *cm);
 
-#if CONFIG_NEWMVENTROPY
 void vp8_adapt_nmv_probs(struct VP8Common *cm, int usehp);
 void vp8_lower_mv_precision(MV *mv);
 int vp8_use_nmv_hp(const MV *ref);
@@ -129,65 +128,4 @@ void vp8_counts_to_nmv_context(
     unsigned int (*branch_ct_class0_hp)[2],
     unsigned int (*branch_ct_hp)[2]);
 
-#else  /* CONFIG_NEWMVENTROPY */
-
-enum {
-  mv_max  = 1023,              /* max absolute value of a MV component */
-  MVvals = (2 * mv_max) + 1,   /* # possible values "" */
-  mvlong_width = 10,       /* Large MVs have 9 bit magnitudes */
-  mvnum_short = 8,         /* magnitudes 0 through 7 */
-  mvnum_short_bits = 3,         /* number of bits for short mvs */
-
-  mvfp_max  = 255,              /* max absolute value of a full pixel MV component */
-  MVfpvals = (2 * mvfp_max) + 1, /* # possible full pixel MV values */
-
-  /* probability offsets for coding each MV component */
-
-  mvpis_short = 0,         /* short (<= 7) vs long (>= 8) */
-  MVPsign,                /* sign for non-zero */
-  MVPshort,               /* 8 short values = 7-position tree */
-
-  MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
-  MVPcount = MVPbits + mvlong_width    /* (with independent probabilities) */
-};
-
-typedef struct mv_context {
-  vp8_prob prob[MVPcount];  /* often come in row, col pairs */
-} MV_CONTEXT;
-
-extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
-
-enum {
-  mv_max_hp  = 2047,              /* max absolute value of a MV component */
-  MVvals_hp = (2 * mv_max_hp) + 1,   /* # possible values "" */
-  mvlong_width_hp = 11,       /* Large MVs have 9 bit magnitudes */
-  mvnum_short_hp = 16,         /* magnitudes 0 through 15 */
-  mvnum_short_bits_hp = 4,         /* number of bits for short mvs */
-
-  mvfp_max_hp  = 255,              /* max absolute value of a full pixel MV component */
-  MVfpvals_hp = (2 * mvfp_max_hp) + 1, /* # possible full pixel MV values */
-
-  /* probability offsets for coding each MV component */
-
-  mvpis_short_hp = 0,         /* short (<= 7) vs long (>= 8) */
-  MVPsign_hp,                /* sign for non-zero */
-  MVPshort_hp,               /* 8 short values = 7-position tree */
-
-  MVPbits_hp = MVPshort_hp + mvnum_short_hp - 1, /* mvlong_width long value bits */
-  MVPcount_hp = MVPbits_hp + mvlong_width_hp    /* (with independent probabilities) */
-};
-
-typedef struct mv_context_hp {
-  vp8_prob prob[MVPcount_hp];  /* often come in row, col pairs */
-} MV_CONTEXT_HP;
-
-extern const MV_CONTEXT_HP vp8_mv_update_probs_hp[2], vp8_default_mv_context_hp[2];
-
-extern const vp8_tree_index vp8_small_mvtree[];
-extern struct vp8_token_struct vp8_small_mvencodings [8];
-extern const vp8_tree_index vp8_small_mvtree_hp[];
-extern struct vp8_token_struct vp8_small_mvencodings_hp [16];
-
-#endif  /* CONFIG_NEWMVENTROPY */
-
 #endif
diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c
index 7c9ea1066..5fc135090 100644
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -10,7 +10,7 @@
 
 
 #include "findnearmv.h"
-#include "vp8/encoder/variance.h"
+#include "vp8/common/sadmxn.h"
 #include <limits.h>
 
 const unsigned char vp8_mbsplit_offset[4][16] = {
@@ -22,11 +22,7 @@ const unsigned char vp8_mbsplit_offset[4][16] = {
 
 static void lower_mv_precision(int_mv *mv, int usehp)
 {
-#if CONFIG_NEWMVENTROPY
   if (!usehp || !vp8_use_nmv_hp(&mv->as_mv)) {
-#else
-  if (!usehp) {
-#endif
     if (mv->as_mv.row & 1)
       mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
     if (mv->as_mv.col & 1)
@@ -199,6 +195,23 @@ vp8_prob *vp8_mv_ref_probs(VP8_COMMON *pc,
 }
 
 #if CONFIG_NEWBESTREFMV
+unsigned int vp8_sad3x16_c(  /* SAD over a block 3 bytes wide, 16 rows tall */
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {  /* accepted but never read by this C version */
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
+}
+unsigned int vp8_sad16x3_c(  /* SAD over a block 16 bytes wide, 3 rows tall */
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {  /* accepted but never read by this C version */
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
+}
+
 /* check a list of motion vectors by sad score using a number rows of pixels
  * above and a number cols of pixels in the left to select the one with best
  * score to use as ref motion vector
@@ -260,10 +273,10 @@ void vp8_find_best_ref_mvs(MACROBLOCKD *xd,
 
     sad = 0;
     if (xd->up_available)
-      sad += vp8_sad16x3_c(above_src, xd->dst.y_stride,
+      sad += vp8_sad16x3(above_src, xd->dst.y_stride,
                            above_ref + offset, ref_y_stride, INT_MAX);
     if (xd->left_available)
-      sad += vp8_sad3x16_c(left_src, xd->dst.y_stride,
+      sad += vp8_sad3x16(left_src, xd->dst.y_stride,
                            left_ref + offset, ref_y_stride, INT_MAX);
     // Add the entry to our list and then resort the list on score.
     sad_scores[i] = sad;
diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index d096e8182..ae33df668 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -109,12 +109,9 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_lossless_c);
 extern prototype_second_order(vp8_short_inv_walsh4x4_1_lossless_c);
 #endif
 
-#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
 #include "vp8/common/blockd.h"
 void vp8_ihtllm_c(short *input, short *output, int pitch,
                   TX_TYPE tx_type, int tx_dim);
-#endif
-
 
 typedef prototype_idct((*vp8_idct_fn_t));
 typedef prototype_idct_scalar_add((*vp8_idct_scalar_add_fn_t));
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index d705fec32..c7369b2e2 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -26,9 +26,7 @@
 #include "vp8/common/idct.h"
 #include "vp8/common/systemdependent.h"
 
-#if CONFIG_HYBRIDTRANSFORM
 #include "vp8/common/blockd.h"
-#endif
 
 #include <math.h>
 
@@ -38,7 +36,6 @@ static const int rounding = 0;
 
 // TODO: these transforms can be further converted into integer forms
 //       for complexity optimization
-#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16
 float idct_4[16] = {
   0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
   0.500000000000000,   0.270598050073099,  -0.500000000000000,  -0.653281482438188,
@@ -90,9 +87,7 @@ float iadst_8[64] = {
   0.483002021635509,  -0.466553967085785,   0.434217976756762,  -0.387095214016348,
   0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
 };
-#endif
 
-#if CONFIG_HYBRIDTRANSFORM16X16 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8
 float idct_16[256] = {
   0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
   0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
@@ -162,9 +157,7 @@ float iadst_16[256] = {
   0.347761, -0.344612,  0.338341, -0.329007,  0.316693, -0.301511,  0.283599, -0.263118,
   0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
 };
-#endif
 
-#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
 void vp8_ihtllm_c(short *input, short *output, int pitch,
                   TX_TYPE tx_type, int tx_dim) {
 
@@ -289,7 +282,6 @@ void vp8_ihtllm_c(short *input, short *output, int pitch,
   }
   vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
-#endif
 
 void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) {
   int i;
diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c
index 3f97d2101..323d48de8 100644
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -7,8 +7,6 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
-
 #include <stdlib.h>
 #include "vpx_config.h"
 #include "loopfilter.h"
@@ -94,6 +92,7 @@ static __inline void vp8_filter(signed char mask, uc hev, uc *op1,
   *op1 = u ^ 0x80;
 
 }
+
 void vp8_loop_filter_horizontal_edge_c
 (
   unsigned char *s,
@@ -218,6 +217,7 @@ static __inline void vp8_mbfilter(signed char mask, uc hev, uc flat,
     Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
     Filter1 >>= 3;
     Filter2 >>= 3;
+
     u = vp8_signed_char_clamp(qs0 - Filter1);
     *oq0 = u ^ 0x80;
     u = vp8_signed_char_clamp(ps0 + Filter2);
@@ -271,8 +271,6 @@ void vp8_mbloop_filter_horizontal_edge_c
   } while (++i < count * 8);
 
 }
-
-
 void vp8_mbloop_filter_vertical_edge_c
 (
   unsigned char *s,
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 0396a7087..38df3500a 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -51,27 +51,14 @@ typedef struct frame_contexts {
   vp8_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP8_SUBMVREFS - 1];
   vp8_prob mbsplit_prob [VP8_NUMMBSPLITS - 1];
   vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#if CONFIG_HYBRIDTRANSFORM
   vp8_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#endif
   vp8_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#if CONFIG_HYBRIDTRANSFORM8X8
   vp8_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#endif
   vp8_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#if CONFIG_HYBRIDTRANSFORM16X16
   vp8_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#endif
 
-#if CONFIG_NEWMVENTROPY
   nmv_context nmvc;
   nmv_context pre_nmvc;
-#else
-  MV_CONTEXT mvc[2];
-  MV_CONTEXT_HP mvc_hp[2];
-  MV_CONTEXT pre_mvc[2];
-  MV_CONTEXT_HP pre_mvc_hp[2];
-#endif
   vp8_prob pre_bmode_prob [VP8_BINTRAMODES - 1];
   vp8_prob pre_ymode_prob [VP8_YMODES - 1]; /* interframe intra mode probs */
   vp8_prob pre_uv_mode_prob [VP8_YMODES][VP8_UV_MODES - 1];
@@ -87,56 +74,37 @@ typedef struct frame_contexts {
 
   vp8_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#if CONFIG_HYBRIDTRANSFORM
   vp8_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#endif
 
   vp8_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#if CONFIG_HYBRIDTRANSFORM8X8
   vp8_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#endif
 
   vp8_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#if CONFIG_HYBRIDTRANSFORM16X16
   vp8_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-#endif
 
   unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-#if CONFIG_HYBRIDTRANSFORM
   unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-#endif
 
   unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-#if CONFIG_HYBRIDTRANSFORM8X8
   unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-#endif
 
   unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-#if CONFIG_HYBRIDTRANSFORM16X16
   unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-#endif
 
-#if CONFIG_NEWMVENTROPY
   nmv_context_counts NMVcount;
-#else
-  unsigned int MVcount [2] [MVvals];
-  unsigned int MVcount_hp [2] [MVvals_hp];
-#endif
-#if CONFIG_SWITCHABLE_INTERP
-  vp8_prob switchable_interp_prob[VP8_SWITCHABLE_FILTERS+1]
-                                 [VP8_SWITCHABLE_FILTERS-1];
-#endif
+  vp8_prob switchable_interp_prob[VP8_SWITCHABLE_FILTERS + 1]
+                                 [VP8_SWITCHABLE_FILTERS - 1];
 
   int mode_context[6][4];
   int mode_context_a[6][4];
@@ -161,10 +129,8 @@ typedef enum {
   ONLY_4X4            = 0,
   ALLOW_8X8           = 1,
   ALLOW_16X16         = 2,
-#if CONFIG_TX_SELECT
   TX_MODE_SELECT      = 3,
-#endif
-  NB_TXFM_MODES       = 3 + CONFIG_TX_SELECT,
+  NB_TXFM_MODES       = 4,
 } TXFM_MODE;
 
 typedef struct VP8_COMMON_RTCD {
@@ -302,10 +268,8 @@ typedef struct VP8Common {
 
   vp8_prob prob_comppred[COMP_PRED_CONTEXTS];
 
-#if CONFIG_TX_SELECT
   // FIXME contextualize
   vp8_prob prob_tx[TX_SIZE_MAX - 1];
-#endif
 
   vp8_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
 
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 17bbe3281..388612e8a 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -783,7 +783,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
 
         if (mi->mbmi.mode == SPLITMV) {
           switch (mi->mbmi.partitioning) {
-            case 0 : {  /* mv_top_bottom */
+            case PARTITIONING_16X8 : {  /* mv_top_bottom */
               union b_mode_info *bmi = &mi->bmi[0];
               MV *mv = &bmi->mv.as_mv;
 
@@ -803,7 +803,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
 
               break;
             }
-            case 1 : {  /* mv_left_right */
+            case PARTITIONING_8X16 : {  /* mv_left_right */
               union b_mode_info *bmi = &mi->bmi[0];
               MV *mv = &bmi->mv.as_mv;
 
@@ -823,7 +823,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
 
               break;
             }
-            case 2 : {  /* mv_quarters   */
+            case PARTITIONING_8X8 : {  /* mv_quarters   */
               union b_mode_info *bmi = &mi->bmi[0];
               MV *mv = &bmi->mv.as_mv;
 
@@ -858,6 +858,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
               vp8_blit_line(x0 + 12,  x1, y0 + 12,  y1, y_buffer, y_stride);
               break;
             }
+            case PARTITIONING_4X4:
             default : {
               union b_mode_info *bmi = mi->bmi;
               int bx0, by0;
diff --git a/vp8/common/pred_common.c b/vp8/common/pred_common.c
index a32389433..a97eed8e4 100644
--- a/vp8/common/pred_common.c
+++ b/vp8/common/pred_common.c
@@ -63,7 +63,6 @@ unsigned char get_pred_context(const VP8_COMMON *const cm,
                      (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
       break;
 
-#if CONFIG_SWITCHABLE_INTERP
     case PRED_SWITCHABLE_INTERP:
       {
         int left_in_image = (m - 1)->mbmi.mb_in_image;
@@ -93,7 +92,6 @@ unsigned char get_pred_context(const VP8_COMMON *const cm,
           pred_context = VP8_SWITCHABLE_FILTERS;
       }
       break;
-#endif
 
     default:
       // TODO *** add error trap code.
@@ -175,11 +173,10 @@ const vp8_prob *get_pred_probs(const VP8_COMMON *const cm,
       pred_probability = &cm->mbskip_pred_probs[pred_context];
       break;
 
-#if CONFIG_SWITCHABLE_INTERP
     case PRED_SWITCHABLE_INTERP:
       pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
       break;
-#endif
+
     default:
       // TODO *** add error trap code.
       pred_probability = NULL;
diff --git a/vp8/common/pred_common.h b/vp8/common/pred_common.h
index 402e0235f..2a9875dfe 100644
--- a/vp8/common/pred_common.h
+++ b/vp8/common/pred_common.h
@@ -22,12 +22,9 @@ typedef enum {
   PRED_REF = 1,
   PRED_COMP = 2,
   PRED_MBSKIP = 3,
-#if CONFIG_SWITCHABLE_INTERP
-  PRED_SWITCHABLE_INTERP = 4,
-#endif
+  PRED_SWITCHABLE_INTERP = 4
 } PRED_ID;
 
-
 extern unsigned char get_pred_context(const VP8_COMMON *const cm,
                                       const MACROBLOCKD *const xd,
                                       PRED_ID pred_id);
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index a41d233ab..6c60845fb 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -36,13 +36,7 @@ void vp8_setup_interp_filters(MACROBLOCKD *xd,
         &cm->rtcd.subpix, sixtap_avg8x8);
     xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
         &cm->rtcd.subpix, sixtap_avg16x16);
-  }
-  else if (mcomp_filter_type == EIGHTTAP
-#if CONFIG_SWITCHABLE_INTERP
-           ||
-           mcomp_filter_type == SWITCHABLE
-#endif
-          ) {
+  } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
     xd->subpixel_predict        = SUBPIX_INVOKE(
         &cm->rtcd.subpix, eighttap4x4);
     xd->subpixel_predict8x4     = SUBPIX_INVOKE(
@@ -965,7 +959,7 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
   BLOCKD *blockd = xd->block;
 
-  if (xd->mode_info_context->mbmi.partitioning < 3) {
+  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
     blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];
     blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];
     blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index ef272df90..ea64c9682 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -125,22 +125,22 @@ specialize vp8_comp_intra_uv4x4_predict;
 # Loopfilter
 #
 prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_mbv;
+specialize vp8_loop_filter_mbv sse2
 
 prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bv;
+specialize vp8_loop_filter_bv sse2
 
 prototype void vp8_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bv8x8;
+specialize vp8_loop_filter_bv8x8 sse2
 
 prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_mbh;
+specialize vp8_loop_filter_mbh sse2
 
 prototype void vp8_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bh;
+specialize vp8_loop_filter_bh sse2
 
 prototype void vp8_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bh8x8;
+specialize vp8_loop_filter_bh8x8 sse2
 
 prototype void vp8_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
 specialize vp8_loop_filter_simple_mbv mmx sse2 media neon
@@ -174,3 +174,210 @@ vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2
 vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6
 vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon
 
+#
+# sad 16x3, 3x16
+#
+prototype unsigned int vp8_sad16x3 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp8_sad16x3
+
+prototype unsigned int vp8_sad3x16 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp8_sad3x16
+
+#
+# Encoder functions below this point.
+#
+if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then
+
+
+# variance
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
+
+prototype unsigned int vp8_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance32x32
+
+prototype unsigned int vp8_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance16x16 mmx sse2
+vp8_variance16x16_sse2=vp8_variance16x16_wmt
+vp8_variance16x16_mmx=vp8_variance16x16_mmx
+
+prototype unsigned int vp8_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance16x8 mmx sse2
+vp8_variance16x8_sse2=vp8_variance16x8_wmt
+vp8_variance16x8_mmx=vp8_variance16x8_mmx
+
+prototype unsigned int vp8_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance8x16 mmx sse2
+vp8_variance8x16_sse2=vp8_variance8x16_wmt
+vp8_variance8x16_mmx=vp8_variance8x16_mmx
+
+prototype unsigned int vp8_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance8x8 mmx sse2
+vp8_variance8x8_sse2=vp8_variance8x8_wmt
+vp8_variance8x8_mmx=vp8_variance8x8_mmx
+
+prototype unsigned int vp8_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance4x4 mmx sse2
+vp8_variance4x4_sse2=vp8_variance4x4_wmt
+vp8_variance4x4_mmx=vp8_variance4x4_mmx
+
+prototype unsigned int vp8_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance32x32
+
+prototype unsigned int vp8_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance16x16 sse2 mmx ssse3
+vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt
+
+prototype unsigned int vp8_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance8x16 sse2 mmx
+vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt
+
+prototype unsigned int vp8_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance16x8 sse2 mmx ssse3
+# Only the sse2 variant is aliased here (to the _wmt symbol).
+vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt
+
+prototype unsigned int vp8_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance8x8 sse2 mmx
+vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt
+
+prototype unsigned int vp8_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance4x4 sse2 mmx
+vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt
+
+prototype unsigned int vp8_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp8_sad32x32
+
+prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp8_sad16x16 mmx sse2 sse3
+vp8_sad16x16_sse2=vp8_sad16x16_wmt
+
+prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp8_sad16x8 mmx sse2
+vp8_sad16x8_sse2=vp8_sad16x8_wmt
+
+prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp8_sad8x16 mmx sse2
+vp8_sad8x16_sse2=vp8_sad8x16_wmt
+
+prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp8_sad8x8 mmx sse2
+vp8_sad8x8_sse2=vp8_sad8x8_wmt
+
+prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp8_sad4x4 mmx sse2
+vp8_sad4x4_sse2=vp8_sad4x4_wmt
+
+prototype unsigned int vp8_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_h mmx sse2
+vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt
+
+prototype unsigned int vp8_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_v mmx sse2
+vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt
+
+prototype unsigned int vp8_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_hv mmx sse2
+vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt
+
+prototype unsigned int vp8_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar32x32_h
+
+prototype unsigned int vp8_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar32x32_v
+
+prototype unsigned int vp8_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar32x32_hv
+
+prototype void vp8_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad32x32x3
+
+prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x16x3 sse3 ssse3
+
+prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x8x3 sse3 ssse3
+
+prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x16x3 sse3
+
+prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x8x3 sse3
+
+prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad4x4x3 sse3
+
+prototype void vp8_sad32x32x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad32x32x8
+
+prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad16x16x8 sse4
+
+prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad16x8x8 sse4
+
+prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad8x16x8 sse4
+
+prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad8x8x8 sse4
+
+prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad4x4x8 sse4
+
+prototype void vp8_sad32x32x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad32x32x4d
+
+prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x16x4d sse3
+
+prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x8x4d sse3
+
+prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x16x4d sse3
+
+prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x8x4d sse3
+
+prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad4x4x4d sse3
+
+#
+# Block copy
+#
+case $arch in
+    x86*)
+    prototype void vp8_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
+    specialize vp8_copy32xn sse2 sse3
+    ;;
+esac
+
+prototype unsigned int vp8_sub_pixel_mse16x16 "const unsigned char  *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
+specialize vp8_sub_pixel_mse16x16 sse2 mmx
+vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt
+
+prototype unsigned int vp8_mse16x16 "const unsigned char *src_ptr, int  source_stride, const unsigned char *ref_ptr, int  recon_stride, unsigned int *sse"
+specialize vp8_mse16x16 mmx sse2
+vp8_mse16x16_sse2=vp8_mse16x16_wmt
+
+prototype unsigned int vp8_sub_pixel_mse32x32 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_mse32x32
+
+prototype unsigned int vp8_get_mb_ss "const short *"
+specialize vp8_get_mb_ss mmx sse2
+
+#
+# Structured Similarity (SSIM)
+#
+if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
+    [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
+
+    prototype void vp8_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    specialize vp8_ssim_parms_8x8 $sse2_on_x86_64
+
+    prototype void vp8_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    specialize vp8_ssim_parms_16x16 $sse2_on_x86_64
+fi
+
+fi
+# end encoder functions
diff --git a/vp8/common/sadmxn.h b/vp8/common/sadmxn.h
new file mode 100644
index 000000000..47b8dfc58
--- /dev/null
+++ b/vp8/common/sadmxn.h
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_SAD_H
+#define __INC_SAD_H
+
+/* Sum of absolute differences over an m-wide, n-tall block of bytes.
+ * Each pointer advances by its own stride after every row. */
+static __inline unsigned int sad_mx_n_c(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int m,
+  int n) {
+  int r, c;
+  unsigned int sad = 0;
+  for (r = 0; r < n; r++) {
+    for (c = 0; c < m; c++) {
+      /* Written out by hand: this header includes no <stdlib.h> for abs(). */
+      const int diff = src_ptr[c] - ref_ptr[c];
+      sad += diff < 0 ? -diff : diff;
+    }
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+  return sad;
+}
+
+#endif
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index 697a5dee6..63b72385b 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -594,790 +594,6 @@ sym(vp8_loop_filter_vertical_edge_mmx):
     ret
 
 
-;void vp8_mbloop_filter_horizontal_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
-;)
-global sym(vp8_mbloop_filter_horizontal_edge_mmx)
-sym(vp8_mbloop_filter_horizontal_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 32      ; reserve 32 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        movsxd      rcx, dword ptr arg(5) ;count
-.next8_mbh:
-        mov         rdx, arg(3) ;limit
-        movq        mm7, [rdx]
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-
-        ; calculate breakout conditions
-        movq        mm2, [rdi+2*rax]      ; q3
-
-        movq        mm1, [rsi+2*rax]      ; q2
-        movq        mm6, mm1              ; q2
-        psubusb     mm1, mm2              ; q2-=q3
-        psubusb     mm2, mm6              ; q3-=q2
-        por         mm1, mm2              ; abs(q3-q2)
-        psubusb     mm1, mm7
-
-
-        ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
-        movq        mm4, [rsi+rax]        ; q1
-        movq        mm3, mm4              ; q1
-        psubusb     mm4, mm6              ; q1-=q2
-        psubusb     mm6, mm3              ; q2-=q1
-        por         mm4, mm6              ; abs(q2-q1)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        ; mm1 = mask,      mm3=q1, mm7 = limit
-
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        psubusb     mm4, mm3              ; q0-=q1
-        psubusb     mm3, mm0              ; q1-=q0
-        por         mm4, mm3              ; abs(q0-q1)
-        movq        t0, mm4               ; save to t0
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
-
-        neg         rax                   ; negate pitch to deal with above border
-
-        movq        mm2, [rsi+4*rax]      ; p3
-        movq        mm4, [rdi+4*rax]      ; p2
-        movq        mm5, mm4              ; p2
-        psubusb     mm4, mm2              ; p2-=p3
-        psubusb     mm2, mm5              ; p3-=p2
-        por         mm4, mm2              ; abs(p3 - p2)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
-
-        movq        mm4, [rsi+2*rax]      ; p1
-        movq        mm3, mm4              ; p1
-        psubusb     mm4, mm5              ; p1-=p2
-        psubusb     mm5, mm3              ; p2-=p1
-        por         mm4, mm5              ; abs(p2 - p1)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm2, mm3              ; p1
-
-
-        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
-
-        movq        mm4, [rsi+rax]        ; p0
-        movq        mm5, mm4              ; p0
-        psubusb     mm4, mm3              ; p0-=p1
-        psubusb     mm3, mm5              ; p1-=p0
-        por         mm4, mm3              ; abs(p1 - p0)
-        movq        t1, mm4               ; save to t1
-        psubusb     mm4, mm7
-        por        mm1, mm4
-        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
-        ; mm5 = p0
-        movq        mm3, [rdi]            ; q1
-        movq        mm4, mm3              ; q1
-        psubusb     mm3, mm2              ; q1-=p1
-        psubusb     mm2, mm4              ; p1-=q1
-        por         mm2, mm3              ; abs(p1-q1)
-        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm2, 1                ; abs(p1-q1)/2
-
-        movq        mm6, mm5              ; p0
-        movq        mm3, mm0              ; q0
-        psubusb     mm5, mm3              ; p0-=q0
-        psubusb     mm3, mm6              ; q0-=p0
-        por         mm5, mm3              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm7, [rdx]            ; blimit
-
-        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,    mm5
-        pxor        mm5,    mm5
-        pcmpeqb     mm1,    mm5           ; mask mm1
-
-        ; mm1 = mask, mm0=q0,  mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
-        ; mm6 = p0,
-
-        ; calculate high edge variance
-        mov         rdx, arg(4) ;thresh           ; get thresh
-        movq        mm7, [rdx]            ;
-        movq        mm4, t0               ; get abs (q1 - q0)
-        psubusb     mm4, mm7
-        movq        mm3, t1               ; get abs (p1 - p0)
-        psubusb     mm3, mm7
-        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        pcmpeqb     mm4,        mm5
-
-        pcmpeqb     mm5,        mm5
-        pxor        mm4,        mm5
-
-
-
-        ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
-        ; mm6 = p0, mm4=hev
-        ; start work on filters
-        movq        mm2, [rsi+2*rax]      ; p1
-        movq        mm7, [rdi]            ; q1
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
-        paddsb      mm2, mm0              ; 2 * (q0 - p0)
-        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
-        pand        mm1, mm2              ; mask filter values we don't care about
-
-
-        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
-        movq        mm2, mm1              ; vp8_filter
-        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
-
-        movq        mm5,        mm2       ;
-        paddsb      mm5,        [GLOBAL(t3)];
-
-        pxor        mm0, mm0              ; 0
-        pxor        mm7, mm7              ; 0
-
-        punpcklbw   mm0, mm5              ; e0f0g0h0
-        psraw       mm0, 11               ; sign extended shift right by 3
-        punpckhbw   mm7, mm5              ; a0b0c0d0
-        psraw       mm7, 11               ; sign extended shift right by 3
-        packsswb    mm0, mm7              ; Filter2 >>=3;
-
-        movq        mm5, mm0              ; Filter2
-
-        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
-        pxor        mm0, mm0              ; 0
-        pxor        mm7, mm7              ; 0
-
-        punpcklbw   mm0, mm2              ; e0f0g0h0
-        psraw       mm0, 11               ; sign extended shift right by 3
-        punpckhbw   mm7, mm2              ; a0b0c0d0
-        psraw       mm7, 11               ; sign extended shift right by 3
-        packsswb    mm0, mm7              ; Filter2 >>=3;
-
-        ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
-        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
-        paddsb      mm6, mm5              ; ps0 =ps0 + Fitler2
-
-        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
-        ; vp8_filter &= ~hev;
-        ; Filter2 = vp8_filter;
-        pandn       mm4, mm1              ; vp8_filter&=~hev
-
-
-        ; mm3=qs0, mm4=filter2, mm6=ps0
-
-        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
-        ; s = vp8_signed_char_clamp(qs0 - u);
-        ; *oq0 = s^0x80;
-        ; s = vp8_signed_char_clamp(ps0 + u);
-        ; *op0 = s^0x80;
-        pxor        mm0, mm0
-
-        pxor        mm1, mm1
-        pxor        mm2, mm2
-        punpcklbw   mm1, mm4
-        punpckhbw   mm2, mm4
-        pmulhw      mm1, [GLOBAL(s27)]
-        pmulhw      mm2, [GLOBAL(s27)]
-        paddw       mm1, [GLOBAL(s63)]
-        paddw       mm2, [GLOBAL(s63)]
-        psraw       mm1, 7
-        psraw       mm2, 7
-        packsswb    mm1, mm2
-
-        psubsb      mm3, mm1
-        paddsb      mm6, mm1
-
-        pxor        mm3, [GLOBAL(t80)]
-        pxor        mm6, [GLOBAL(t80)]
-        movq        [rsi+rax], mm6
-        movq        [rsi],     mm3
-
-        ; roughly 2/7th difference across boundary
-        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
-        ; s = vp8_signed_char_clamp(qs1 - u);
-        ; *oq1 = s^0x80;
-        ; s = vp8_signed_char_clamp(ps1 + u);
-        ; *op1 = s^0x80;
-        pxor        mm1, mm1
-        pxor        mm2, mm2
-        punpcklbw   mm1, mm4
-        punpckhbw   mm2, mm4
-        pmulhw      mm1, [GLOBAL(s18)]
-        pmulhw      mm2, [GLOBAL(s18)]
-        paddw       mm1, [GLOBAL(s63)]
-        paddw       mm2, [GLOBAL(s63)]
-        psraw       mm1, 7
-        psraw       mm2, 7
-        packsswb    mm1, mm2
-
-        movq        mm3, [rdi]
-        movq        mm6, [rsi+rax*2]       ; p1
-
-        pxor        mm3, [GLOBAL(t80)]
-        pxor        mm6, [GLOBAL(t80)]
-
-        paddsb      mm6, mm1
-        psubsb      mm3, mm1
-
-        pxor        mm6, [GLOBAL(t80)]
-        pxor        mm3, [GLOBAL(t80)]
-        movq        [rdi], mm3
-        movq        [rsi+rax*2], mm6
-
-        ; roughly 1/7th difference across boundary
-        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
-        ; s = vp8_signed_char_clamp(qs2 - u);
-        ; *oq2 = s^0x80;
-        ; s = vp8_signed_char_clamp(ps2 + u);
-        ; *op2 = s^0x80;
-        pxor        mm1, mm1
-        pxor        mm2, mm2
-        punpcklbw   mm1, mm4
-        punpckhbw   mm2, mm4
-        pmulhw      mm1, [GLOBAL(s9)]
-        pmulhw      mm2, [GLOBAL(s9)]
-        paddw       mm1, [GLOBAL(s63)]
-        paddw       mm2, [GLOBAL(s63)]
-        psraw       mm1, 7
-        psraw       mm2, 7
-        packsswb    mm1, mm2
-
-
-        movq        mm6, [rdi+rax*4]
-        neg         rax
-        movq        mm3, [rdi+rax  ]
-
-        pxor        mm6, [GLOBAL(t80)]
-        pxor        mm3, [GLOBAL(t80)]
-
-        paddsb      mm6, mm1
-        psubsb      mm3, mm1
-
-        pxor        mm6, [GLOBAL(t80)]
-        pxor        mm3, [GLOBAL(t80)]
-        movq        [rdi+rax  ], mm3
-        neg         rax
-        movq        [rdi+rax*4], mm6
-
-;EARLY_BREAK_OUT:
-        neg         rax
-        add         rsi,8
-        dec         rcx
-        jnz         .next8_mbh
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_mbloop_filter_vertical_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
-;)
-global sym(vp8_mbloop_filter_vertical_edge_mmx)
-sym(vp8_mbloop_filter_vertical_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 96      ; reserve 96 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi,        [rsi + rax*4 - 4]
-
-        movsxd      rcx,        dword ptr arg(5) ;count
-.next8_mbv:
-        lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing
-
-        ;transpose
-        movq        mm0,        [rdi+2*rax]                 ; 77 76 75 74 73 72 71 70
-        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
-
-        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
-        punpckhbw   mm7,        mm0                         ; 77 67 76 66 75 65 74 64
-
-        punpcklbw   mm6,        mm0                         ; 73 63 72 62 71 61 70 60
-        movq        mm0,        [rsi+rax]                   ; 57 56 55 54 53 52 51 50
-
-        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
-        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
-
-        punpckhbw   mm5,        mm0                         ; 57 47 56 46 55 45 54 44
-        punpcklbw   mm4,        mm0                         ; 53 43 52 42 51 41 50 40
-
-        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
-        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
-
-        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
-        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
-
-        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
-        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
-
-        neg         rax
-
-        movq        mm7,        [rsi+rax]                   ; 37 36 35 34 33 32 31 30
-        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
-
-        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
-        punpckhbw   mm6,        mm7                         ; 37 27 36 36 35 25 34 24
-
-        punpcklbw   mm1,        mm7                         ; 33 23 32 22 31 21 30 20
-
-        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
-        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
-
-        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
-        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
-
-        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
-        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
-
-        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
-        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
-
-        lea         rdx,        srct
-        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
-
-        movq        [rdx+56],   mm7
-        psubusb     mm5,        mm7                         ; q2-q3
-
-
-        movq        [rdx+48],   mm6
-        psubusb     mm7,        mm6                         ; q3-q2
-
-        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
-        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
-
-        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
-        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 15 04 = q0
-
-        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
-        psubusb     mm3,        mm6                         ; q1-q2
-
-        psubusb     mm6,        mm5                         ; q2-q1
-        por         mm6,        mm3                         ; mm6=abs(q2-q1)
-
-        movq        [rdx+40],   mm5                         ; save q1
-        movq        [rdx+32],   mm0                         ; save q0
-
-        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
-        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
-
-        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
-
-        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
-        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
-
-        movq        [rdx],      mm0                         ; save p3
-        movq        [rdx+8],    mm1                         ; save p2
-
-        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
-        psubusb     mm2,        mm0                         ; p2-p3
-
-        psubusb     mm0,        mm1                         ; p3-p2
-        por         mm0,        mm2                         ; mm0=abs(p3-p2)
-
-        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
-        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
-
-        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
-        movq        [rdx+24],   mm3                         ; save p0
-
-        movq        [rdx+16],   mm2                         ; save p1
-        movq        mm5,        mm2                         ; mm5 = p1
-
-        psubusb     mm2,        mm1                         ; p1-p2
-        psubusb     mm1,        mm5                         ; p2-p1
-
-        por         mm1,        mm2                         ; mm1=abs(p2-p1)
-        mov         rdx,        arg(3) ;limit
-
-        movq        mm4,        [rdx]                       ; mm4 = limit
-        psubusb     mm7,        mm4                         ; abs(q3-q2) > limit
-
-        psubusb     mm0,        mm4                         ; abs(p3-p2) > limit
-        psubusb     mm1,        mm4                         ; abs(p2-p1) > limit
-
-        psubusb     mm6,        mm4                         ; abs(q2-q1) > limit
-        por         mm7,        mm6                         ; or
-
-        por         mm0,        mm1                         ;
-        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
-        movq        mm1,        mm5                         ; p1
-
-        movq        mm7,        mm3                         ; mm3=mm7=p0
-        psubusb     mm7,        mm5                         ; p0 - p1
-
-        psubusb     mm5,        mm3                         ; p1 - p0
-        por         mm5,        mm7                         ; abs(p1-p0)
-
-        movq        t0,         mm5                         ; save abs(p1-p0)
-        lea         rdx,        srct
-
-        psubusb     mm5,        mm4                         ; mm5 = abs(p1-p0) > limit
-        por         mm0,        mm5                         ; mm0=mask
-
-        movq        mm5,        [rdx+32]                    ; mm5=q0
-        movq        mm7,        [rdx+40]                    ; mm7=q1
-
-        movq        mm6,        mm5                         ; mm6=q0
-        movq        mm2,        mm7                         ; q1
-        psubusb     mm5,        mm7                         ; q0-q1
-
-        psubusb     mm7,        mm6                         ; q1-q0
-        por         mm7,        mm5                         ; abs(q1-q0)
-
-        movq        t1,         mm7                         ; save abs(q1-q0)
-        psubusb     mm7,        mm4                         ; mm7=abs(q1-q0)> limit
-
-        por         mm0,        mm7                         ; mask
-
-        movq        mm5,        mm2                         ; q1
-        psubusb     mm5,        mm1                         ; q1-=p1
-        psubusb     mm1,        mm2                         ; p1-=q1
-        por         mm5,        mm1                         ; abs(p1-q1)
-        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
-        psrlw       mm5,        1                           ; abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                      ;
-
-        movq        mm4,        [rdx]                       ;blimit
-        movq        mm1,        mm3                         ; mm1=mm3=p0
-
-        movq        mm7,        mm6                         ; mm7=mm6=q0
-        psubusb     mm1,        mm7                         ; p0-q0
-
-        psubusb     mm7,        mm3                         ; q0-p0
-        por         mm1,        mm7                         ; abs(q0-p0)
-        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
-        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,        mm0;                        ; mask
-
-        pxor        mm0,        mm0
-        pcmpeqb     mm1,        mm0
-
-        ; calculate high edge variance
-        mov         rdx,        arg(4) ;thresh            ; get thresh
-        movq        mm7,        [rdx]
-        ;
-        movq        mm4,        t0              ; get abs (q1 - q0)
-        psubusb     mm4,        mm7             ; abs(q1 - q0) > thresh
-
-        movq        mm3,        t1              ; get abs (p1 - p0)
-        psubusb     mm3,        mm7             ; abs(p1 - p0)> thresh
-
-        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-        pcmpeqb     mm4,        mm0
-
-        pcmpeqb     mm0,        mm0
-        pxor        mm4,        mm0
-
-
-
-
-        ; start work on filters
-        lea         rdx,        srct
-
-        ; start work on filters
-        movq        mm2, [rdx+16]         ; p1
-        movq        mm7, [rdx+40]         ; q1
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-
-        movq        mm6, [rdx+24]         ; p0
-        movq        mm0, [rdx+32]         ; q0
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
-        paddsb      mm2, mm0              ; 2 * (q0 - p0)
-        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
-        pand       mm1, mm2           ; mask filter values we don't care about
-
-        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
-        movq        mm2, mm1              ; vp8_filter
-        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
-
-        movq        mm5,        mm2       ;
-        paddsb      mm5,        [GLOBAL(t3)];
-
-        pxor        mm0, mm0              ; 0
-        pxor        mm7, mm7              ; 0
-
-        punpcklbw   mm0, mm5              ; e0f0g0h0
-        psraw       mm0, 11               ; sign extended shift right by 3
-        punpckhbw   mm7, mm5              ; a0b0c0d0
-        psraw       mm7, 11               ; sign extended shift right by 3
-        packsswb    mm0, mm7              ; Filter2 >>=3;
-
-        movq        mm5, mm0              ; Filter2
-
-        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
-        pxor        mm0, mm0              ; 0
-        pxor        mm7, mm7              ; 0
-
-        punpcklbw   mm0, mm2              ; e0f0g0h0
-        psraw       mm0, 11               ; sign extended shift right by 3
-        punpckhbw   mm7, mm2              ; a0b0c0d0
-        psraw       mm7, 11               ; sign extended shift right by 3
-        packsswb    mm0, mm7              ; Filter2 >>=3;
-
-        ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
-        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
-        paddsb      mm6, mm5              ; ps0 =ps0 + Fitler2
-
-        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
-        ; vp8_filter &= ~hev;
-        ; Filter2 = vp8_filter;
-        pandn       mm4, mm1              ; vp8_filter&=~hev
-
-
-        ; mm3=qs0, mm4=filter2, mm6=ps0
-
-        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
-        ; s = vp8_signed_char_clamp(qs0 - u);
-        ; *oq0 = s^0x80;
-        ; s = vp8_signed_char_clamp(ps0 + u);
-        ; *op0 = s^0x80;
-        pxor        mm0, mm0
-
-        pxor        mm1, mm1
-        pxor        mm2, mm2
-        punpcklbw   mm1, mm4
-        punpckhbw   mm2, mm4
-        pmulhw      mm1, [GLOBAL(s27)]
-        pmulhw      mm2, [GLOBAL(s27)]
-        paddw       mm1, [GLOBAL(s63)]
-        paddw       mm2, [GLOBAL(s63)]
-        psraw       mm1, 7
-        psraw       mm2, 7
-        packsswb    mm1, mm2
-
-        psubsb      mm3, mm1
-        paddsb      mm6, mm1
-
-        pxor        mm3, [GLOBAL(t80)]
-        pxor        mm6, [GLOBAL(t80)]
-        movq        [rdx+24], mm6
-        movq        [rdx+32], mm3
-
-        ; roughly 2/7th difference across boundary
-        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
-        ; s = vp8_signed_char_clamp(qs1 - u);
-        ; *oq1 = s^0x80;
-        ; s = vp8_signed_char_clamp(ps1 + u);
-        ; *op1 = s^0x80;
-        pxor        mm1, mm1
-        pxor        mm2, mm2
-        punpcklbw   mm1, mm4
-        punpckhbw   mm2, mm4
-        pmulhw      mm1, [GLOBAL(s18)]
-        pmulhw      mm2, [GLOBAL(s18)]
-        paddw       mm1, [GLOBAL(s63)]
-        paddw       mm2, [GLOBAL(s63)]
-        psraw       mm1, 7
-        psraw       mm2, 7
-        packsswb    mm1, mm2
-
-        movq        mm3, [rdx + 40]
-        movq        mm6, [rdx + 16]       ; p1
-        pxor        mm3, [GLOBAL(t80)]
-        pxor        mm6, [GLOBAL(t80)]
-
-        paddsb      mm6, mm1
-        psubsb      mm3, mm1
-
-        pxor        mm6, [GLOBAL(t80)]
-        pxor        mm3, [GLOBAL(t80)]
-        movq        [rdx + 40], mm3
-        movq        [rdx + 16], mm6
-
-        ; roughly 1/7th difference across boundary
-        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
-        ; s = vp8_signed_char_clamp(qs2 - u);
-        ; *oq2 = s^0x80;
-        ; s = vp8_signed_char_clamp(ps2 + u);
-        ; *op2 = s^0x80;
-        pxor        mm1, mm1
-        pxor        mm2, mm2
-        punpcklbw   mm1, mm4
-        punpckhbw   mm2, mm4
-        pmulhw      mm1, [GLOBAL(s9)]
-        pmulhw      mm2, [GLOBAL(s9)]
-        paddw       mm1, [GLOBAL(s63)]
-        paddw       mm2, [GLOBAL(s63)]
-        psraw       mm1, 7
-        psraw       mm2, 7
-        packsswb    mm1, mm2
-
-        movq        mm6, [rdx+ 8]
-        movq        mm3, [rdx+48]
-
-        pxor        mm6, [GLOBAL(t80)]
-        pxor        mm3, [GLOBAL(t80)]
-
-        paddsb      mm6, mm1
-        psubsb      mm3, mm1
-
-        pxor        mm6, [GLOBAL(t80)]          ; mm6 = 71 61 51 41 31 21 11 01
-        pxor        mm3, [GLOBAL(t80)]          ; mm3 = 76 66 56 46 36 26 15 06
-
-        ; tranpose and write back
-        movq        mm0,    [rdx]               ; mm0 = 70 60 50 40 30 20 10 00
-        movq        mm1,    mm0                 ; mm0 = 70 60 50 40 30 20 10 00
-
-        punpcklbw   mm0,    mm6                 ; mm0 = 31 30 21 20 11 10 01 00
-        punpckhbw   mm1,    mm6                 ; mm3 = 71 70 61 60 51 50 41 40
-
-        movq        mm2,    [rdx+16]            ; mm2 = 72 62 52 42 32 22 12 02
-        movq        mm6,    mm2                 ; mm3 = 72 62 52 42 32 22 12 02
-
-        punpcklbw   mm2,    [rdx+24]            ; mm2 = 33 32 23 22 13 12 03 02
-        punpckhbw   mm6,    [rdx+24]            ; mm3 = 73 72 63 62 53 52 43 42
-
-        movq        mm5,    mm0                 ; mm5 = 31 30 21 20 11 10 01 00
-        punpcklwd   mm0,    mm2                 ; mm0 = 13 12 11 10 03 02 01 00
-
-        punpckhwd   mm5,    mm2                 ; mm5 = 33 32 31 30 23 22 21 20
-        movq        mm4,    mm1                 ; mm4 = 71 70 61 60 51 50 41 40
-
-        punpcklwd   mm1,    mm6                 ; mm1 = 53 52 51 50 43 42 41 40
-        punpckhwd   mm4,    mm6                 ; mm4 = 73 72 71 70 63 62 61 60
-
-        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
-        punpcklbw   mm2,    [rdx+40]            ; mm2 = 35 34 25 24 15 14 05 04
-
-        movq        mm6,    mm3                 ; mm6 = 76 66 56 46 36 26 15 06
-        punpcklbw   mm6,    [rdx+56]            ; mm6 = 37 36 27 26 17 16 07 06
-
-        movq        mm7,    mm2                 ; mm7 = 35 34 25 24 15 14 05 04
-        punpcklwd   mm2,    mm6                 ; mm2 = 17 16 15 14 07 06 05 04
-
-        punpckhwd   mm7,    mm6                 ; mm7 = 37 36 35 34 27 26 25 24
-        movq        mm6,    mm0                 ; mm6 = 13 12 11 10 03 02 01 00
-
-        punpckldq   mm0,    mm2                 ; mm0 = 07 06 05 04 03 02 01 00
-        punpckhdq   mm6,    mm2                 ; mm6 = 17 16 15 14 13 12 11 10
-
-        movq        [rsi+rax*4], mm0            ; write out
-        movq        [rdi+rax*4], mm6            ; write out
-
-        movq        mm0,    mm5                 ; mm0 = 33 32 31 30 23 22 21 20
-        punpckldq   mm0,    mm7                 ; mm0 = 27 26 25 24 23 22 20 20
-
-        punpckhdq   mm5,    mm7                 ; mm5 = 37 36 35 34 33 32 31 30
-        movq        [rsi+rax*2], mm0            ; write out
-
-        movq        [rdi+rax*2], mm5            ; write out
-        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
-
-        punpckhbw   mm2,    [rdx+40]            ; mm2 = 75 74 65 64 54 54 45 44
-        punpckhbw   mm3,    [rdx+56]            ; mm3 = 77 76 67 66 57 56 47 46
-
-        movq        mm5,    mm2                 ; mm5 = 75 74 65 64 54 54 45 44
-        punpcklwd   mm2,    mm3                 ; mm2 = 57 56 55 54 47 46 45 44
-
-        punpckhwd   mm5,    mm3                 ; mm5 = 77 76 75 74 67 66 65 64
-        movq        mm0,    mm1                 ; mm0=  53 52 51 50 43 42 41 40
-
-        movq        mm3,    mm4                 ; mm4 = 73 72 71 70 63 62 61 60
-        punpckldq   mm0,    mm2                 ; mm0 = 47 46 45 44 43 42 41 40
-
-        punpckhdq   mm1,    mm2                 ; mm1 = 57 56 55 54 53 52 51 50
-        movq        [rsi],  mm0                 ; write out
-
-        movq        [rdi],  mm1                 ; write out
-        neg         rax
-
-        punpckldq   mm3,    mm5                 ; mm3 = 67 66 65 64 63 62 61 60
-        punpckhdq   mm4,    mm5                 ; mm4 = 77 76 75 74 73 72 71 60
-
-        movq        [rsi+rax*2], mm3
-        movq        [rdi+rax*2], mm4
-
-        lea         rsi,        [rsi+rax*8]
-        dec         rcx
-
-        jnz         .next8_mbv
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void vp8_loop_filter_simple_horizontal_edge_mmx
 ;(
 ;    unsigned char *src_ptr,
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 295609c58..6f6531c86 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -380,302 +380,6 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
     ret
 
 
-%macro MB_FILTER_AND_WRITEBACK 1
-%if %1 == 0
-        movdqa      xmm2,                   p1              ; p1
-        movdqa      xmm7,                   q1              ; q1
-%elif %1 == 1
-        movdqa      xmm2,                   [rsi+2*rax]     ; p1
-        movdqa      xmm7,                   [rdi]           ; q1
-
-        mov         rcx,                    rax
-        neg         rcx
-%elif %1 == 2
-        lea         rdx,                    srct
-
-        movdqa      xmm2,                   [rdx+32]        ; p1
-        movdqa      xmm7,                   [rdx+80]        ; q1
-        movdqa      xmm6,                   [rdx+48]        ; p0
-        movdqa      xmm0,                   [rdx+64]        ; q0
-%endif
-
-        pxor        xmm2,                   [GLOBAL(t80)]   ; p1 offset to convert to signed values
-        pxor        xmm7,                   [GLOBAL(t80)]   ; q1 offset to convert to signed values
-        pxor        xmm6,                   [GLOBAL(t80)]   ; offset to convert to signed values
-        pxor        xmm0,                   [GLOBAL(t80)]   ; offset to convert to signed values
-
-        psubsb      xmm2,                   xmm7            ; p1 - q1
-        movdqa      xmm3,                   xmm0            ; q0
-
-        psubsb      xmm0,                   xmm6            ; q0 - p0
-
-        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
-
-        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0)
-
-        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)
-
-        pand        xmm1,                   xmm2            ; mask filter values we don't care about
-
-        movdqa      xmm2,                   xmm1            ; vp8_filter
-
-        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
-        pxor        xmm0,                   xmm0
-
-        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
-        pxor        xmm1,                   xmm1
-
-        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
-        movdqa      xmm5,                   xmm2
-
-        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)
-        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)
-
-        pmulhw      xmm1,                   [GLOBAL(s9)]    ; Filter 2 (lo) * 9
-
-        pmulhw      xmm0,                   [GLOBAL(s9)]    ; Filter 2 (hi) * 9
-
-        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
-        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)
-
-        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
-        psraw       xmm7,                   11              ; sign extended shift right by 3
-
-        psraw       xmm5,                   11              ; sign extended shift right by 3
-        punpckhbw   xmm4,                   xmm2            ; axbxcxdx
-
-        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
-        psraw       xmm4,                   11              ; sign extended shift right by 3
-
-        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
-        psraw       xmm2,                   11              ; sign extended shift right by 3
-
-        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
-        movdqa      xmm7,                   xmm1
-
-        paddsb      xmm6,                   xmm5            ; ps0 =ps0 + Fitler2
-        movdqa      xmm4,                   xmm1
-
-        psubsb      xmm3,                   xmm2            ; qs0 =qs0 - Filter1
-        movdqa      xmm5,                   xmm0
-
-        movdqa      xmm2,                   xmm5
-        paddw       xmm0,                   [GLOBAL(s63)]   ; Filter 2 (hi) * 9 + 63
-
-        paddw       xmm1,                   [GLOBAL(s63)]   ; Filter 2 (lo) * 9 + 63
-        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18
-
-        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
-        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63
-
-        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
-        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63
-
-        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
-        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7
-
-        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
-        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7
-
-        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
-        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7
-
-        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
-        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
-
-        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7
-
-        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
-
-        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
-        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 - u3)
-
-%if %1 == 0
-        movdqa      xmm5,                   q2              ; q2
-        movdqa      xmm1,                   q1              ; q1
-        movdqa      xmm4,                   p1              ; p1
-        movdqa      xmm7,                   p2              ; p2
-
-%elif %1 == 1
-        movdqa      xmm5,                   XMMWORD PTR [rdi+rcx]   ; q2
-        movdqa      xmm1,                   XMMWORD PTR [rdi]       ; q1
-        movdqa      xmm4,                   XMMWORD PTR [rsi+rax*2] ; p1
-        movdqa      xmm7,                   XMMWORD PTR [rdi+rax*4] ; p2
-%elif %1 == 2
-        movdqa      xmm5,                   XMMWORD PTR [rdx+96]    ; q2
-        movdqa      xmm1,                   XMMWORD PTR [rdx+80]    ; q1
-        movdqa      xmm4,                   XMMWORD PTR [rdx+32]    ; p1
-        movdqa      xmm7,                   XMMWORD PTR [rdx+16]    ; p2
-%endif
-
-        pxor        xmm3,                   [GLOBAL(t80)]   ; *oq0 = sq^0x80
-        pxor        xmm6,                   [GLOBAL(t80)]   ; *oq0 = sp^0x80
-
-        pxor        xmm1,                   [GLOBAL(t80)]
-        pxor        xmm4,                   [GLOBAL(t80)]
-
-        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
-        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 - u2)
-
-        pxor        xmm1,                   [GLOBAL(t80)]   ; *oq1 = sq^0x80;
-        pxor        xmm4,                   [GLOBAL(t80)]   ; *op1 = sp^0x80;
-
-        pxor        xmm7,                   [GLOBAL(t80)]
-        pxor        xmm5,                   [GLOBAL(t80)]
-
-        paddsb      xmm7,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 - u)
-        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u)
-
-        pxor        xmm7,                   [GLOBAL(t80)]   ; *op2 = sp^0x80;
-        pxor        xmm5,                   [GLOBAL(t80)]   ; *oq2 = sq^0x80;
-
-%if %1 == 0
-        lea         rsi,                    [rsi+rcx*2]
-        lea         rdi,                    [rdi+rcx*2]
-
-        movq        MMWORD PTR [rsi],       xmm6            ; p0
-        movhps      MMWORD PTR [rdi],       xmm6
-        movq        MMWORD PTR [rsi + rcx], xmm3            ; q0
-        movhps      MMWORD PTR [rdi + rcx], xmm3
-
-        movq        MMWORD PTR [rsi+rcx*2], xmm1            ; q1
-        movhps      MMWORD PTR [rdi+rcx*2], xmm1
-
-        movq        MMWORD PTR [rsi + rax], xmm4            ; p1
-        movhps      MMWORD PTR [rdi + rax], xmm4
-
-        movq        MMWORD PTR [rsi+rax*2], xmm7            ; p2
-        movhps      MMWORD PTR [rdi+rax*2], xmm7
-
-        lea         rsi,                    [rsi + rcx]
-        lea         rdi,                    [rdi + rcx]
-        movq        MMWORD PTR [rsi+rcx*2], xmm5            ; q2
-        movhps      MMWORD PTR [rdi+rcx*2], xmm5
-%elif %1 == 1
-        movdqa      XMMWORD PTR [rdi+rcx],  xmm5            ; q2
-        movdqa      XMMWORD PTR [rdi],      xmm1            ; q1
-        movdqa      XMMWORD PTR [rsi],      xmm3            ; q0
-        movdqa      XMMWORD PTR [rsi+rax  ],xmm6            ; p0
-        movdqa      XMMWORD PTR [rsi+rax*2],xmm4            ; p1
-        movdqa      XMMWORD PTR [rdi+rax*4],xmm7            ; p2
-%elif %1 == 2
-        movdqa      XMMWORD PTR [rdx+80],   xmm1            ; q1
-        movdqa      XMMWORD PTR [rdx+64],   xmm3            ; q0
-        movdqa      XMMWORD PTR [rdx+48],   xmm6            ; p0
-        movdqa      XMMWORD PTR [rdx+32],   xmm4            ; p1
-%endif
-
-%endmacro
-
-
-;void vp8_mbloop_filter_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp8_mbloop_filter_horizontal_edge_sse2)
-sym(vp8_mbloop_filter_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32     ; reserve 32 bytes
-    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi,                    arg(0)            ;src_ptr
-        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step
-
-        mov         rdx,                    arg(3)            ;limit
-        movdqa      xmm7,                   XMMWORD PTR [rdx]
-
-        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing
-
-        ; calculate breakout conditions and high edge variance
-        LFH_FILTER_AND_HEV_MASK 1
-        ; filter and write back the results
-        MB_FILTER_AND_WRITEBACK 1
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_mbloop_filter_horizontal_edge_uv_sse2
-;(
-;    unsigned char *u,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    unsigned char *v
-;)
-global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
-sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 96       ; reserve 96 bytes
-    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
-    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
-    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
-    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
-    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];
-
-        mov         rsi,                    arg(0)             ; u
-        mov         rdi,                    arg(5)             ; v
-        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
-        mov         rcx,                    rax
-        neg         rax                     ; negate pitch to deal with above border
-
-        mov         rdx,                    arg(3)             ;limit
-        movdqa      xmm7,                   XMMWORD PTR [rdx]
-
-        lea         rsi,                    [rsi + rcx]
-        lea         rdi,                    [rdi + rcx]
-
-        ; calculate breakout conditions and high edge variance
-        LFH_FILTER_AND_HEV_MASK 0
-        ; filter and write back the results
-        MB_FILTER_AND_WRITEBACK 0
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 %macro TRANSPOSE_16X8 2
         movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
         movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
@@ -1141,233 +845,6 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
     pop         rbp
     ret
 
-%macro MBV_TRANSPOSE 0
-        movdqa      xmm0,               [rdx]               ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
-        punpcklbw   xmm0,               xmm7                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
-        punpckhbw   xmm1,               xmm7                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
-        movdqa      xmm2,               [rdx+32]            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        movdqa      xmm6,               xmm2                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
-        punpcklbw   xmm2,               [rdx+48]            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpckhbw   xmm6,               [rdx+48]            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
-        punpcklwd   xmm0,               xmm2                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-
-        punpckhwd   xmm3,               xmm2                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
-        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
-        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
-        movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-        punpcklbw   xmm2,               [rdx+80]            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-
-        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
-        punpcklbw   xmm6,               [rdx+112]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
-
-        movdqa      xmm7,               xmm2                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-        punpcklwd   xmm2,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
-
-        punpckhwd   xmm7,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
-        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-
-        punpckldq   xmm0,               xmm2                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
-        punpckhdq   xmm6,               xmm2                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
-%endmacro
-
-%macro MBV_WRITEBACK_1 0
-        movq        QWORD  PTR [rsi],   xmm0
-        movhps      MMWORD PTR [rdi],   xmm0
-
-        movq        QWORD  PTR [rsi+2*rax], xmm6
-        movhps      MMWORD PTR [rdi+2*rax], xmm6
-
-        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-        punpckldq   xmm0,               xmm7                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
-
-        punpckhdq   xmm3,               xmm7                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
-
-        movq        QWORD  PTR [rsi+4*rax], xmm0
-        movhps      MMWORD PTR [rdi+4*rax], xmm0
-
-        movq        QWORD  PTR [rsi+2*rcx], xmm3
-        movhps      MMWORD PTR [rdi+2*rcx], xmm3
-
-        movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-        punpckhbw   xmm2,               [rdx+80]            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
-        punpckhbw   xmm5,               [rdx+112]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
-        movdqa      xmm0,               xmm2
-
-        punpcklwd   xmm0,               xmm5                ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
-        punpckhwd   xmm2,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
-
-        movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
-        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80
-
-        punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
-%endmacro
-
-%macro MBV_WRITEBACK_2 0
-        movq        QWORD  PTR [rsi],   xmm1
-        movhps      MMWORD PTR [rdi],   xmm1
-
-        movq        QWORD  PTR [rsi+2*rax], xmm5
-        movhps      MMWORD PTR [rdi+2*rax], xmm5
-
-        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-        punpckldq   xmm1,               xmm2                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
-        punpckhdq   xmm4,               xmm2                ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
-
-        movq        QWORD  PTR [rsi+4*rax], xmm1
-        movhps      MMWORD PTR [rdi+4*rax], xmm1
-
-        movq        QWORD  PTR [rsi+2*rcx], xmm4
-        movhps      MMWORD PTR [rdi+2*rcx], xmm4
-%endmacro
-
-
-;void vp8_mbloop_filter_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp8_mbloop_filter_vertical_edge_sse2)
-sym(vp8_mbloop_filter_vertical_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 160     ; reserve 160 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
-    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];
-
-        mov         rsi,                arg(0)              ; src_ptr
-        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
-
-        lea         rsi,                [rsi - 4]
-        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
-        lea         rcx,                [rax*2+rax]
-
-        ; Transpose
-        TRANSPOSE_16X8 1, 0
-
-        ; calculate filter mask and high edge variance
-        LFV_FILTER_MASK_HEV_MASK 0
-
-        neg         rax
-        ; start work on filters
-        MB_FILTER_AND_WRITEBACK 2
-
-        lea         rsi,                [rsi+rax*8]
-        lea         rdi,                [rdi+rax*8]
-
-        ; transpose and write back
-        MBV_TRANSPOSE
-
-        neg         rax
-
-        MBV_WRITEBACK_1
-
-        lea         rsi,                [rsi+rax*8]
-        lea         rdi,                [rdi+rax*8]
-        MBV_WRITEBACK_2
-
-    add rsp, 160
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_mbloop_filter_vertical_edge_uv_sse2
-;(
-;    unsigned char *u,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    unsigned char *v
-;)
-global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
-sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 160     ; reserve 160 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
-    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];
-
-        mov         rsi,                arg(0)              ; u_ptr
-        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
-
-        lea         rsi,                [rsi - 4]
-        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
-        lea         rcx,                [rax+2*rax]
-
-        lea         rdx,                srct
-
-        ; Transpose
-        TRANSPOSE_16X8 0, 0
-
-        ; calculate filter mask and high edge variance
-        LFV_FILTER_MASK_HEV_MASK 0
-
-        ; start work on filters
-        MB_FILTER_AND_WRITEBACK 2
-
-        ; transpose and write back
-        MBV_TRANSPOSE
-
-        mov         rsi,                arg(0)             ;u_ptr
-        lea         rsi,                [rsi - 4]
-        lea         rdi,                [rsi + rax]
-        MBV_WRITEBACK_1
-        mov         rsi,                arg(5)             ;v_ptr
-        lea         rsi,                [rsi - 4]
-        lea         rdi,                [rsi + rax]
-        MBV_WRITEBACK_2
-
-    add rsp, 160
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void vp8_loop_filter_simple_horizontal_edge_sse2
 ;(
 ;    unsigned char *src_ptr,
diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
index e7239818e..716d10c79 100644
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -9,63 +9,36 @@
  */
 
 
+#include <emmintrin.h>  // SSE2
 #include "vpx_config.h"
 #include "vp8/common/loopfilter.h"
 
-prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
-prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
 prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
 
 prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
-prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
-prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2);
 
 extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
 extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
-extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
-extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
 
 #if HAVE_MMX
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, struct loop_filter_info *lfi) {
-  vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, struct loop_filter_info *lfi) {
-  vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride, struct loop_filter_info *lfi) {
-  vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
-  if (v_ptr)
-    vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
@@ -99,26 +72,413 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned
 #endif
 
 
-/* Horizontal MB filtering */
 #if HAVE_SSE2
+/* Filters `count` (1 or 2) 8-pixel-wide segments of the horizontal edge at s,
+ * rows being p bytes apart. Reads rows s-5p..s+4p; per column it selects the
+ * rounded 8-sample average where `flat` is set, else the hev/mask-gated
+ * adjustment of p1,p0,q0,q1, and writes rows s-3p..s+2p back in place. */
+void vp8_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s, int p,
+                                              const unsigned char *_blimit,
+                                              const unsigned char *_limit,
+                                              const unsigned char *_thresh,
+                                              int count) {
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+  __m128i mask, hev, flat;
+  __m128i thresh, limit, blimit;
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+  /* Broadcast byte 0 of each threshold array to all 16 lanes. */
+  thresh = _mm_set1_epi8((char)_thresh[0]);
+  limit = _mm_set1_epi8((char)_limit[0]);
+  blimit = _mm_set1_epi8((char)_blimit[0]);
+
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                                    _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                                    _mm_subs_epu8(q1, p1));
+    __m128i work;
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    // hev = (max(abs(p1 - p0), abs(q1 - q0)) > thresh) * -1;
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // Fold in abs(p1-p0), abs(q1-q0) and the p3..p1 / q1..q3 steps; mask ends
+    // as 0xff where the sum is <= blimit and all steps <= limit (limit < 0xff).
+    mask = _mm_max_epu8(flat, mask);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+                                     _mm_subs_epu8(p1, p2)),
+                         _mm_or_si128(_mm_subs_epu8(p3, p2),
+                                      _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+                                     _mm_subs_epu8(q1, q2)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q2),
+                                      _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+    // flat = mask & (abs(pN - p0) <= 1 && abs(qN - q0) <= 1 for N = 1..4)
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+                                     _mm_subs_epu8(p0, p2)),
+                         _mm_or_si128(_mm_subs_epu8(q2, q0),
+                                      _mm_subs_epu8(q0, q2)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+                                     _mm_subs_epu8(p0, p3)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q0),
+                                      _mm_subs_epu8(q0, q3)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+                                     _mm_subs_epu8(p0, p4)),
+                         _mm_or_si128(_mm_subs_epu8(q4, q0),
+                                      _mm_subs_epu8(q0, q4)));
+    flat = _mm_max_epu8(work, flat);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
+  {
+    const __m128i four = _mm_set1_epi16(4);
+    unsigned char *src = s;
+    int i = 0;
+    do {
+      __m128i workp_a, workp_b, workp_shft;
+      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
+      // Rounded 8-sample sums (+4, >> 3), updated incrementally op2 -> oq2.
+      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      src += 8;
+    } while (++i < count);
+  }
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    // XOR with 0x80 biases pixels to signed bytes for the saturating math.
+    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i vp8_filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    vp8_filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    vp8_filt = _mm_adds_epi8(vp8_filt, work_a);
+    vp8_filt = _mm_adds_epi8(vp8_filt, work_a);
+    vp8_filt = _mm_adds_epi8(vp8_filt, work_a);
+    /* (vp8_filter + 3 * (qs0 - ps0)) & mask */
+    vp8_filt = _mm_and_si128(vp8_filt, mask);
+
+    filter1 = _mm_adds_epi8(vp8_filt, t4);
+    filter2 = _mm_adds_epi8(vp8_filt, t3);
+
+    /* Filter1 >> 3, per-byte arithmetic shift via 16-bit shift + sign fill */
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+
+    /* Filter2 >> 3, same emulated arithmetic shift */
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+
+    /* vp8_filt = (Filter1 + 1) >> 1, cleared below where hev is set */
+    vp8_filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, vp8_filt);
+    vp8_filt = _mm_srli_epi16(vp8_filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    vp8_filt = _mm_and_si128(vp8_filt, t7f);
+    vp8_filt = _mm_or_si128(vp8_filt, work_a);
+
+    vp8_filt = _mm_andnot_si128(hev, vp8_filt);
+    // Select flat_* where flat is set, else the lp result (p2/q2: unchanged).
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q0 = _mm_load_si128((__m128i *)flat_oq0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, vp8_filt), t80);
+    q1 = _mm_load_si128((__m128i *)flat_oq1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    q2 = _mm_load_si128((__m128i *)flat_oq2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p0 = _mm_load_si128((__m128i *)flat_op0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, vp8_filt), t80);
+    p1 = _mm_load_si128((__m128i *)flat_op1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    p2 = _mm_load_si128((__m128i *)flat_op2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+
+    if (count == 1) {
+      _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+      _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+      _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+      _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+      _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+      _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+    } else {
+      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+      _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    }
+  }
+}
+/* Transposes 8x8 byte tiles: src[i] (pitch in_p) -> dst[i] (pitch out_p). */
+static __inline void transpose(unsigned char *src[], int in_p,
+                               unsigned char *dst[], int out_p,
+                               int num_8x8_to_transpose) {
+  int idx8x8 = 0;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+  do {
+    unsigned char *in = src[idx8x8];
+    unsigned char *out = dst[idx8x8];
+    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+    x0 = _mm_unpacklo_epi8(x0, x1);
+    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+    x1 = _mm_unpacklo_epi8(x2, x3);
+    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+    x2 = _mm_unpacklo_epi8(x4, x5);
+    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+    x3 = _mm_unpacklo_epi8(x6, x7);
+    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    x4 = _mm_unpacklo_epi16(x0, x1);
+    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+    x5 = _mm_unpacklo_epi16(x2, x3);
+    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
+    // The pd casts only reinterpret bits; storel/storeh write one 64-bit row.
+    _mm_storel_pd((double *)(out + 0*out_p),
+                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
+    _mm_storeh_pd((double *)(out + 1*out_p),
+                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    _mm_storel_pd((double *)(out + 2*out_p),
+                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
+    _mm_storeh_pd((double *)(out + 3*out_p),
+                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
+
+    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi16(x0, x1);
+    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi16(x2, x3);
+    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    _mm_storel_pd((double *)(out + 4*out_p),
+                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
+    _mm_storeh_pd((double *)(out + 5*out_p),
+                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    _mm_storel_pd((double *)(out + 6*out_p),
+                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
+    _mm_storeh_pd((double *)(out + 7*out_p),
+                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
+void vp8_mbloop_filter_vertical_edge_c_sse2(unsigned char *s, int p,
+                                            const unsigned char *blimit,
+                                            const unsigned char *limit,
+                                            const unsigned char *thresh,
+                                            int count) {
+  /* Vertical-edge filter built on the horizontal one: transpose the pixels
+   * around the edge into a scratch block, filter it, transpose it back. */
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
+  unsigned char *const origin = s - 5;  /* 5 bytes before the edge */
+  unsigned char *in_blocks[4];
+  unsigned char *out_blocks[4];
+
+  /* Forward pass: 8x8 tiles from the picture into the 16-pitch scratch. */
+  in_blocks[0] = origin;
+  in_blocks[1] = origin + 8;
+  in_blocks[2] = origin + p * 8;
+  in_blocks[3] = origin + p * 8 + 8;
+  out_blocks[0] = t_dst;
+  out_blocks[1] = t_dst + 16 * 8;
+  out_blocks[2] = t_dst + 8;
+  out_blocks[3] = t_dst + 16 * 8 + 8;
+
+  // 16x16->16x16 or 16x8->8x16
+  transpose(in_blocks, p, out_blocks, 16, 1 << count);
+
+  /* s - 5 was transposed to scratch row 0, so the edge is at row 5. */
+  vp8_mbloop_filter_horizontal_edge_c_sse2(&t_dst[5 * 16], 16, blimit, limit,
+                                           thresh, count);
+
+  /* Backward pass: scratch back into the picture. */
+  in_blocks[0] = t_dst;
+  in_blocks[1] = t_dst + 8;
+  out_blocks[0] = origin;
+  out_blocks[1] = origin + p * 8;
+
+  // 16x8->8x16 or 8x8->8x8
+  transpose(in_blocks, 16, out_blocks, p, 1 << (count - 1));
+}
+
+/* Horizontal MB filtering: luma with count 2, then u and v with count 1 */
 void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, struct loop_filter_info *lfi) {
-  vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
+  vp8_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim,
+                                           lfi->lim, lfi->hev_thr, 2);
+
+  /* Chroma is optional. TODO: write sse2 version with u,v interleaved */
   if (u_ptr)
-    vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+    vp8_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp8_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr, 1);
 }
 
+void vp8_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                                int y_stride, int uv_stride, struct loop_filter_info *lfi) {
+  unsigned char *const edge = y_ptr + 8 * y_stride;  /* chroma args unused */
+  vp8_mbloop_filter_horizontal_edge_c_sse2(edge, y_stride, lfi->blim, lfi->lim,
+                                           lfi->hev_thr, 2);
+}
 
 /* Vertical MB Filtering */
-void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, struct loop_filter_info *lfi) {
-  vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                              unsigned char *v_ptr, int y_stride, int uv_stride,
+                              struct loop_filter_info *lfi) {
+  vp8_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                         lfi->hev_thr, 2);
 
+  /* Chroma is optional. TODO: write sse2 version with u,v interleaved */
   if (u_ptr)
-    vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+    vp8_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
+                                           lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp8_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
+                                           lfi->lim, lfi->hev_thr, 1);
 }
 
+void vp8_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                                int y_stride, int uv_stride, struct loop_filter_info *lfi) {
+  unsigned char *const edge = y_ptr + 8;  /* chroma args unused */
+  vp8_mbloop_filter_vertical_edge_c_sse2(edge, y_stride, lfi->blim, lfi->lim,
+                                         lfi->hev_thr, 2);
+}
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,