diff options
Diffstat (limited to 'vp8/common')
-rw-r--r-- | vp8/common/blockd.h | 23 | ||||
-rw-r--r-- | vp8/common/findnearmv.c | 137 | ||||
-rw-r--r-- | vp8/common/mvref_common.c | 303 | ||||
-rw-r--r-- | vp8/common/mvref_common.h | 37 | ||||
-rw-r--r-- | vp8/common/recon.h | 8 | ||||
-rw-r--r-- | vp8/common/reconinter.h | 9 | ||||
-rw-r--r-- | vp8/common/reconintra.c | 246 | ||||
-rw-r--r-- | vp8/common/rtcd_defs.sh | 8 | ||||
-rw-r--r-- | vp8/common/x86/filter_sse2.c | 289 | ||||
-rw-r--r-- | vp8/common/x86/filter_sse4.c | 3 |
10 files changed, 848 insertions, 215 deletions
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 4e5d9e813..46d002af9 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -44,6 +44,9 @@ void vpx_log(const char *format, ...); /* Segment Feature Masks */ #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 +#if CONFIG_NEW_MVREF +#define MAX_MV_REFS 10 +#endif typedef struct { int r, c; @@ -179,6 +182,14 @@ typedef enum { B_MODE_COUNT } B_PREDICTION_MODE; +#if CONFIG_NEW_MVREF +// Segment level features. +typedef enum { + FIRST_REF = 0, + SECOND_REF = 1 +} MV_REF_TYPE; +#endif + #if CONFIG_HYBRIDTRANSFORM8X8 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { @@ -268,9 +279,14 @@ typedef struct { MV_REFERENCE_FRAME ref_frame, second_ref_frame; TX_SIZE txfm_size; int_mv mv[2]; // for each reference frame used -#if CONFIG_NEWBESTREFMV +#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF int_mv ref_mv, second_ref_mv; #endif +#if CONFIG_NEW_MVREF + int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS]; + int mv_ref_index[MAX_REF_FRAMES]; +#endif + unsigned char partitioning; unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ unsigned char need_to_clamp_mvs; @@ -432,9 +448,14 @@ typedef struct MacroBlockD { #endif int mb_index; // Index of the MB in the SB (0..3) + #if CONFIG_NEWBESTREFMV +#if CONFIG_NEW_MVREF + int_mv ref_mv[MAX_MV_REFS]; +#else int_mv ref_mv[4]; #endif +#endif #if CONFIG_HYBRIDTRANSFORM int q_index; diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c index 6f7361dd0..694f4cc32 100644 --- a/vp8/common/findnearmv.c +++ b/vp8/common/findnearmv.c @@ -200,6 +200,139 @@ vp8_prob *vp8_mv_ref_probs(VP8_COMMON *pc, * above and a number cols of pixels in the left to select the one with best * score to use as ref motion vector */ + +#if CONFIG_NEW_MVREF + +void vp8_find_best_ref_mvs(MACROBLOCKD *xd, + unsigned char *ref_y_buffer, + int ref_y_stride, + int_mv *best_mv, + 
int_mv *nearest, + int_mv *near) { + int_mv *ref_mv = xd->ref_mv; + int i, j; + unsigned char *above_src; + unsigned char *left_src; + unsigned char *above_ref; + unsigned char *left_ref; + int sad; + int sad_scores[MAX_MV_REFS]; + int_mv sorted_mvs[MAX_MV_REFS]; + int zero_seen = FALSE; + + // Default all to 0,0 if nothing else available + best_mv->as_int = nearest->as_int = near->as_int = 0; + vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs)); + + above_src = xd->dst.y_buffer - xd->dst.y_stride * 2; + left_src = xd->dst.y_buffer - 2; + above_ref = ref_y_buffer - ref_y_stride * 2; + left_ref = ref_y_buffer - 2; + + for(i = 0; i < MAX_MV_REFS; ++i) { + int_mv this_mv; + int offset=0; + int row_offset, col_offset; + + this_mv.as_int = ref_mv[i].as_int; + + // If we see a 0,0 vector for a second time we have reached the end of + // the list of valid candidate vectors. + if (!this_mv.as_int) + if (zero_seen) + break; + else + zero_seen = TRUE; + + vp8_clamp_mv(&this_mv, + xd->mb_to_left_edge - LEFT_TOP_MARGIN + 16, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN + 16, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); + + row_offset = (this_mv.as_mv.row > 0) ? + ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3); + col_offset = (this_mv.as_mv.col > 0) ? + ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3); + offset = ref_y_stride * row_offset + col_offset; + + sad = vp8_sad16x2_c(above_src, xd->dst.y_stride, + above_ref + offset, ref_y_stride, INT_MAX); + + sad += vp8_sad2x16_c(left_src, xd->dst.y_stride, + left_ref + offset, ref_y_stride, INT_MAX); + + // Add the entry to our list and then resort the list on score. 
+ sad_scores[i] = sad; + sorted_mvs[i].as_int = this_mv.as_int; + j = i; + while (j > 0) { + if (sad_scores[j] < sad_scores[j-1]) { + sad_scores[j] = sad_scores[j-1]; + sorted_mvs[j].as_int = sorted_mvs[j-1].as_int; + sad_scores[j-1] = sad; + sorted_mvs[j-1].as_int = this_mv.as_int; + j--; + } else + break; + } + } + + // If not see add 0,0 as a possibility + /*if ( (i < MAX_MV_REFS) && !zero_seen ) { + + sad = vp8_sad16x2_c(above_src, xd->dst.y_stride, + above_ref, ref_y_stride, + INT_MAX); + sad += vp8_sad2x16_c(left_src, xd->dst.y_stride, + left_ref, ref_y_stride, + INT_MAX); + this_mv.as_int = 0; + + // Add the entry to our list and then resort the list on score. + sad_scores[i] = sad; + sorted_mvs[i].as_int = this_mv.as_int; + j = i; + while (j > 0) { + if (sad_scores[j] < sad_scores[j-1]) { + sad_scores[j] = sad_scores[j-1]; + sorted_mvs[j].as_int = sorted_mvs[j-1].as_int; + sad_scores[j-1] = sad; + sorted_mvs[j-1].as_int = this_mv.as_int; + j--; + } else + break; + } + }*/ + + // Set the best mv to the first entry in the sorted list + best_mv->as_int = sorted_mvs[0].as_int; + + // Provided that there are non zero vectors available there will not + // be more than one 0,0 entry in the sorted list. + // The best ref mv is always set to the first entry (which gave the best + // results. The nearest is set to the first non zero vector if available and + // near to the second non zero vector if avaialable. + // We do not use 0,0 as a nearest or near as 0,0 has its own mode. 
+ if ( sorted_mvs[0].as_int ) { + nearest->as_int = sorted_mvs[0].as_int; + if ( sorted_mvs[1].as_int ) + near->as_int = sorted_mvs[1].as_int; + else + near->as_int = sorted_mvs[2].as_int; + } else { + nearest->as_int = sorted_mvs[1].as_int; + near->as_int = sorted_mvs[2].as_int; + } + + if (!xd->allow_high_precision_mv) + lower_mv_precision(best_mv); + + vp8_clamp_mv2(best_mv, xd); +} + +#else // !CONFIG_NEW_MVREF + void vp8_find_best_ref_mvs(MACROBLOCKD *xd, unsigned char *ref_y_buffer, int ref_y_stride, @@ -270,5 +403,5 @@ void vp8_find_best_ref_mvs(MACROBLOCKD *xd, nearest->as_int = best_mv->as_int; } } - -#endif +#endif // CONFIG_NEW_MVREF +#endif // CONFIG_NEWBESTREFMV diff --git a/vp8/common/mvref_common.c b/vp8/common/mvref_common.c new file mode 100644 index 000000000..1c345dba5 --- /dev/null +++ b/vp8/common/mvref_common.c @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "mvref_common.h" + +#if CONFIG_NEW_MVREF + +#define MVREF_NEIGHBOURS 8 +static int mv_ref_search[MVREF_NEIGHBOURS][2] = + { {0,-1},{-1,0},{-1,-1},{0,-2},{-2,0},{-1,-2},{-2,-1},{-2,-2} }; +static int ref_distance_weight[MVREF_NEIGHBOURS] = + { 3,3,2,1,1,1,1,1 }; + //{ 4,4,2,1,1,1,1,1 }; + +// clamp_mv +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units +static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) { + + if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER)) + mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER; + else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER) + mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER; + + if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER)) + mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER; + else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER) + mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER; +} + +// Code for selecting / building and entropy coding a motion vector reference +// Returns a seperation value for two vectors. +// This is taken as the sum of the abs x and y difference. 
+unsigned int mv_distance(int_mv *mv1, int_mv *mv2) { + return (abs(mv1->as_mv.row - mv2->as_mv.row) + + abs(mv1->as_mv.col - mv2->as_mv.col)); +} + +// Gets a best matching candidate refenence motion vector +// from the given mode info structure (if available) +int get_candidate_mvref( + const MODE_INFO *candidate_mi, + MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME *candidate_ref_frame, + int_mv *candidate_mv +) { + + int ret_val = FALSE; + + if (ref_frame == candidate_mi->mbmi.ref_frame) { + candidate_mv->as_int = candidate_mi->mbmi.mv[FIRST_REF].as_int; + *candidate_ref_frame = ref_frame; + ret_val = TRUE; + + } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) { + candidate_mv->as_int = candidate_mi->mbmi.mv[SECOND_REF].as_int; + *candidate_ref_frame = ref_frame; + ret_val = TRUE; + + } else if (candidate_mi->mbmi.ref_frame != INTRA_FRAME) { + candidate_mv->as_int = candidate_mi->mbmi.mv[FIRST_REF].as_int; + *candidate_ref_frame = candidate_mi->mbmi.ref_frame; + ret_val = TRUE; + + } else if (candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) { + candidate_mv->as_int = candidate_mi->mbmi.mv[SECOND_REF].as_int; + *candidate_ref_frame = candidate_mi->mbmi.second_ref_frame; + ret_val = TRUE; + } + + return ret_val; +} + +// Performs mv adjustment based on reference frame and clamps the MV +// if it goes off the edge of the buffer. +void scale_mv( + MACROBLOCKD *xd, + MV_REFERENCE_FRAME this_ref_frame, + MV_REFERENCE_FRAME candidate_ref_frame, + int_mv *candidate_mv, + int *ref_sign_bias +) { + + if (candidate_ref_frame != this_ref_frame) { + + //int frame_distances[MAX_REF_FRAMES]; + //int last_distance = 1; + //int gf_distance = xd->frames_since_golden; + //int arf_distance = xd->frames_till_alt_ref_frame; + + // Sign inversion where appropriate. 
+ if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) { + candidate_mv->as_mv.row = -candidate_mv->as_mv.row; + candidate_mv->as_mv.col = -candidate_mv->as_mv.col; + } + + // Scale based on frame distance if the reference frames not the same. + /*frame_distances[INTRA_FRAME] = 1; // should never be used + frame_distances[LAST_FRAME] = 1; + frame_distances[GOLDEN_FRAME] = + (xd->frames_since_golden) ? xd->frames_since_golden : 1; + frame_distances[ALTREF_FRAME] = + (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1; + + if (frame_distances[this_ref_frame] && + frame_distances[candidate_ref_frame]) { + candidate_mv->as_mv.row = + (short)(((int)(candidate_mv->as_mv.row) * + frame_distances[this_ref_frame]) / + frame_distances[candidate_ref_frame]); + + candidate_mv->as_mv.col = + (short)(((int)(candidate_mv->as_mv.col) * + frame_distances[this_ref_frame]) / + frame_distances[candidate_ref_frame]); + } + */ + } + + // Clamp the MV so it does not point out of the frame buffer + clamp_mv(xd, candidate_mv); +} + +// Adds a new candidate reference vector to the list if indeed it is new. +// If it is not new then the score of the existing candidate that it matches +// is increased and the list is resorted. +void addmv_and_shuffle( + int_mv *mv_list, + int *mv_scores, + int *index, + int_mv candidate_mv, + int weight +) { + + int i = *index; + int duplicate_found = FALSE; + + // Check for duplicates. If there is one increment its score. + while (i > 0) { + i--; + if (candidate_mv.as_int == mv_list[i].as_int) { + duplicate_found = TRUE; + mv_scores[i] += weight; + break; + } + } + + // If no duplicate was found add the new vector and give it a weight + if (!duplicate_found) { + mv_list[*index].as_int = candidate_mv.as_int; + mv_scores[*index] = weight; + i = *index; + (*index)++; + } + + // Reshuffle the list so that highest scoring mvs at the top. 
+ while (i > 0) { + if (mv_scores[i] > mv_scores[i-1]) { + int tmp_score = mv_scores[i-1]; + int_mv tmp_mv = mv_list[i-1]; + + mv_scores[i-1] = mv_scores[i]; + mv_list[i-1] = mv_list[i]; + mv_scores[i] = tmp_score; + mv_list[i] = tmp_mv; + i--; + } else + break; + } +} + + +// Measure the distance of each reference candidate from the actual +// residual vector and return the nearest +unsigned int pick_best_mv_ref( int_mv target_mv, + int_mv * mv_ref_list, + int_mv * best_ref ) { + + int i; + int best_index = 0; + unsigned int distance, distance2; + + distance = mv_distance(&target_mv, &mv_ref_list[0]); + + for (i = 1; i < MAX_MV_REFS; ++i ) { + distance2 = + mv_distance(&target_mv, &mv_ref_list[i]); + if (distance2 < distance) { + distance = distance2; + best_index = i; + } + } + + (*best_ref).as_int = mv_ref_list[best_index].as_int; + + return best_index; +} + +// This function searches the neighbourhood of a given MB/SB and populates a +// list of candidate reference vectors. +// +void find_mv_refs( + MACROBLOCKD *xd, + MODE_INFO *here, + MODE_INFO *lf_here, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int *ref_sign_bias +) { + + int i; + MODE_INFO *candidate_mi; + int_mv candidate_mvs[MAX_MV_REFS]; + int_mv c_refmv; + MV_REFERENCE_FRAME c_ref_frame; + int candidate_scores[MAX_MV_REFS]; + int index = 0; + int ref_weight = 0; + int valid_mv_ref; + + // Blank the reference vector lists and other local structures. + vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS); + vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS); + vpx_memset(candidate_scores, 0, sizeof(candidate_scores)); + + // Populate a list with candidate reference vectors from the + // spatial neighbours. 
+ for (i = 0; i < 2; ++i) { + if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) && + ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) { + + candidate_mi = here + mv_ref_search[i][0] + + (mv_ref_search[i][1] * xd->mode_info_stride); + + valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame, + &c_ref_frame, &c_refmv); + + if (valid_mv_ref) { + scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias ); + ref_weight = ref_distance_weight[i] + + ((c_ref_frame == ref_frame) << 3); + + addmv_and_shuffle(candidate_mvs, candidate_scores, + &index, c_refmv, ref_weight); + } + } + } + + // Look at the corresponding vector in the last frame + candidate_mi = lf_here; + valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame, + &c_ref_frame, &c_refmv); + if (valid_mv_ref) { + scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias ); + ref_weight = 2 + ((c_ref_frame == ref_frame) << 3); + addmv_and_shuffle(candidate_mvs, candidate_scores, + &index, c_refmv, ref_weight); + } + + // Populate a list with candidate reference vectors from the + // spatial neighbours. + for (i = 2; i < MVREF_NEIGHBOURS; ++i) { + if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) && + ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) { + + candidate_mi = here + mv_ref_search[i][0] + + (mv_ref_search[i][1] * xd->mode_info_stride); + + valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame, + &c_ref_frame, &c_refmv); + + if (valid_mv_ref) { + scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias ); + ref_weight = ref_distance_weight[i] + + ((c_ref_frame == ref_frame) << 3); + + addmv_and_shuffle(candidate_mvs, candidate_scores, + &index, c_refmv, ref_weight); + } + } + } + + // 0,0 is always a valid reference. + for (i = 0; i < index; ++i) + if (candidate_mvs[i].as_int == 0) + break; + if (i == index) { + c_refmv.as_int = 0; + addmv_and_shuffle(candidate_mvs, candidate_scores, + &index, c_refmv, 1); + } + + // Copy over the candidate list. 
+ vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs)); +} + +#endif diff --git a/vp8/common/mvref_common.h b/vp8/common/mvref_common.h new file mode 100644 index 000000000..9be408894 --- /dev/null +++ b/vp8/common/mvref_common.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "onyxc_int.h" +#include "blockd.h" + +// MR reference entropy header file. +#if CONFIG_NEW_MVREF + +#ifndef __INC_MVREF_COMMON_H +#define __INC_MVREF_COMMON_H + +unsigned int mv_distance(int_mv *mv1, int_mv *mv2); + +unsigned int pick_best_mv_ref( int_mv target_mv, + int_mv * mv_ref_list, + int_mv * best_ref ); + +void find_mv_refs( + MACROBLOCKD *xd, + MODE_INFO *here, + MODE_INFO *lf_here, + MV_REFERENCE_FRAME ref_frame, + int_mv * mv_ref_list, + int *ref_sign_bias +); + +#endif + +#endif diff --git a/vp8/common/recon.h b/vp8/common/recon.h index 3527fc14d..0bb5c8863 100644 --- a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -262,4 +262,12 @@ typedef struct vp8_recon_rtcd_vtable { void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd); + +#if CONFIG_SUPERBLOCKS +extern void vp8_recon_mby_s_c(const vp8_recon_rtcd_vtable_t *rtcd, + MACROBLOCKD *xd, uint8_t *dst); +extern void vp8_recon_mbuv_s_c(const vp8_recon_rtcd_vtable_t *rtcd, + MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst); +#endif + #endif diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h index 7ad0adbd4..37e34b5e1 100644 --- a/vp8/common/reconinter.h +++ b/vp8/common/reconinter.h @@ -45,6 +45,15 @@ extern void vp8_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, int dst_ystride, int dst_uvstride); 
+#if CONFIG_SUPERBLOCKS +extern void vp8_build_inter32x32_predictors_sb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); +#endif + extern void vp8_build_inter_predictors_mb(MACROBLOCKD *xd); extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c index e391fa9be..cad9652b7 100644 --- a/vp8/common/reconintra.c +++ b/vp8/common/reconintra.c @@ -207,10 +207,10 @@ void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, } } -void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, - unsigned char *src, int src_stride, +void vp8_build_intra_predictors_internal(unsigned char *src, int src_stride, unsigned char *ypred_ptr, - int y_stride, int mode, int bsize) { + int y_stride, int mode, int bsize, + int up_available, int left_available) { unsigned char *yabove_row = src - src_stride; unsigned char yleft_col[32]; @@ -218,7 +218,7 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, int r, c, i; for (i = 0; i < bsize; i++) { - yleft_col[i] = xd->dst.y_buffer [i * src_stride - 1]; + yleft_col[i] = src[i * src_stride - 1]; } /* for Y */ @@ -230,8 +230,10 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, int average = 0; int log2_bsize_minus_1; - assert(bsize == 8 || bsize == 16 || bsize == 32); - if (bsize == 8) { + assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32); + if (bsize == 4) { + log2_bsize_minus_1 = 1; + } else if (bsize == 8) { log2_bsize_minus_1 = 2; } else if (bsize == 16) { log2_bsize_minus_1 = 3; @@ -239,19 +241,19 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, log2_bsize_minus_1 = 4; } - if (xd->up_available || xd->left_available) { - if (xd->up_available) { + if (up_available || left_available) { + if (up_available) { for (i = 0; i < bsize; i++) { average += yabove_row[i]; } } - if (xd->left_available) { + if (left_available) { for (i = 0; i < bsize; i++) { 
average += yleft_col[i]; } } - shift = log2_bsize_minus_1 + xd->up_available + xd->left_available; + shift = log2_bsize_minus_1 + up_available + left_available; expected_dc = (average + (1 << (shift - 1))) >> shift; } else { expected_dc = 128; @@ -332,22 +334,25 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, } void vp8_build_intra_predictors_mby(MACROBLOCKD *xd) { - vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, xd->predictor, 16, - xd->mode_info_context->mbmi.mode, 16); + xd->mode_info_context->mbmi.mode, 16, + xd->up_available, xd->left_available); } void vp8_build_intra_predictors_mby_s(MACROBLOCKD *xd) { - vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, xd->dst.y_buffer, xd->dst.y_stride, - xd->mode_info_context->mbmi.mode, 16); + xd->mode_info_context->mbmi.mode, 16, + xd->up_available, xd->left_available); } #if CONFIG_SUPERBLOCKS -void vp8_build_intra_predictors_sby_s(MACROBLOCKD *x) { - vp8_build_intra_predictors_internal(x, x->dst.y_buffer, x->dst.y_stride, - x->dst.y_buffer, x->dst.y_stride, - x->mode_info_context->mbmi.mode, 32); +void vp8_build_intra_predictors_sby_s(MACROBLOCKD *xd) { + vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, + xd->dst.y_buffer, xd->dst.y_stride, + xd->mode_info_context->mbmi.mode, 32, + xd->up_available, xd->left_available); } #endif @@ -356,14 +361,16 @@ void vp8_build_comp_intra_predictors_mby(MACROBLOCKD *xd) { unsigned char predictor[2][256]; int i; - vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, predictor[0], 16, xd->mode_info_context->mbmi.mode, - 16); - vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + 16, xd->up_available, + xd->left_available); + 
vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, predictor[1], 16, xd->mode_info_context->mbmi.second_mode, - 16); + 16, xd->up_available, + xd->left_available); for (i = 0; i < 256; i++) { xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1; @@ -376,10 +383,12 @@ void vp8_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd, unsigned char *vpred_ptr, int uv_stride, int mode, int bsize) { - vp8_build_intra_predictors_internal(xd, xd->dst.u_buffer, xd->dst.uv_stride, - upred_ptr, uv_stride, mode, bsize); - vp8_build_intra_predictors_internal(xd, xd->dst.v_buffer, xd->dst.uv_stride, - vpred_ptr, uv_stride, mode, bsize); + vp8_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride, + upred_ptr, uv_stride, mode, bsize, + xd->up_available, xd->left_available); + vp8_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride, + vpred_ptr, uv_stride, mode, bsize, + xd->up_available, xd->left_available); } void vp8_build_intra_predictors_mbuv(MACROBLOCKD *xd) { @@ -428,95 +437,9 @@ void vp8_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) { void vp8_intra8x8_predict(BLOCKD *xd, int mode, unsigned char *predictor) { - - unsigned char *yabove_row = *(xd->base_dst) + xd->dst - xd->dst_stride; - unsigned char yleft_col[8]; - unsigned char ytop_left = yabove_row[-1]; - int r, c, i; - - for (i = 0; i < 8; i++) { - yleft_col[i] = (*(xd->base_dst))[xd->dst - 1 + i * xd->dst_stride]; - } - switch (mode) { - case DC_PRED: { - int expected_dc = 0; - - for (i = 0; i < 8; i++) { - expected_dc += yabove_row[i]; - expected_dc += yleft_col[i]; - } - expected_dc = (expected_dc + 8) >> 4; - - for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { - predictor[c] = expected_dc; - } - predictor += 16; - } - } - break; - case V_PRED: { - for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { - predictor[c] = yabove_row[c]; - } - predictor += 16; - } - - } - break; - case H_PRED: { - - for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { - 
predictor[c] = yleft_col[r]; - } - predictor += 16; - } - } - break; - case TM_PRED: { - /* prediction similar to true_motion prediction */ - for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { - int pred = yabove_row[c] - ytop_left + yleft_col[r]; - if (pred < 0) - pred = 0; - - if (pred > 255) - pred = 255; - predictor[c] = pred; - } - - predictor += 16; - } - } - break; - case D45_PRED: { - d45_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D135_PRED: { - d135_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D117_PRED: { - d117_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D153_PRED: { - d153_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D27_PRED: { - d27_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D63_PRED: { - d63_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - } + vp8_build_intra_predictors_internal(*(xd->base_dst) + xd->dst, + xd->dst_stride, predictor, 16, + mode, 8, 1, 1); } #if CONFIG_COMP_INTRA_PRED @@ -540,96 +463,9 @@ void vp8_comp_intra8x8_predict(BLOCKD *xd, void vp8_intra_uv4x4_predict(BLOCKD *xd, int mode, unsigned char *predictor) { - - unsigned char *above_row = *(xd->base_dst) + xd->dst - xd->dst_stride; - unsigned char left_col[4]; - unsigned char top_left = above_row[-1]; - int r, c, i; - - for (i = 0; i < 4; i++) { - left_col[i] = (*(xd->base_dst))[xd->dst - 1 + i * xd->dst_stride]; - } - switch (mode) { - case DC_PRED: { - int expected_dc = 0; - - for (i = 0; i < 4; i++) { - expected_dc += above_row[i]; - expected_dc += left_col[i]; - } - expected_dc = (expected_dc + 4) >> 3; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - predictor[c] = expected_dc; - } - predictor += 8; - } - } - break; - case V_PRED: { - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - - predictor[c] = above_row[c]; - } - predictor += 8; - } - - } - break; - case H_PRED: { - - for (r = 0; r < 4; r++) { - 
for (c = 0; c < 4; c++) { - predictor[c] = left_col[r]; - } - predictor += 8; - } - } - break; - case TM_PRED: { - /* prediction similar to true_motion prediction */ - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int pred = above_row[c] - top_left + left_col[r]; - if (pred < 0) - pred = 0; - - if (pred > 255) - pred = 255; - predictor[c] = pred; - } - - predictor += 8; - } - } - break; - case D45_PRED: { - d45_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D135_PRED: { - d135_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D117_PRED: { - d117_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D153_PRED: { - d153_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D27_PRED: { - d27_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D63_PRED: { - d63_predictor(predictor, 8, 4, above_row, left_col); - } - break; - } + vp8_build_intra_predictors_internal(*(xd->base_dst) + xd->dst, + xd->dst_stride, predictor, 8, + mode, 4, 1, 1); } #if CONFIG_COMP_INTRA_PRED diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index 1cb5de311..66029f88e 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -14,8 +14,8 @@ prototype void vp8_filter_block2d_16x16_8 "const unsigned char *src_ptr, const u # compiles warning free but a dissassembly of generated code show bugs. To be # on the safe side, only enabled when compiled with 'gcc'. 
if [ "$CONFIG_GCC" = "yes" ]; then - specialize vp8_filter_block2d_4x4_8 sse4_1 - specialize vp8_filter_block2d_8x4_8 sse4_1 - specialize vp8_filter_block2d_8x8_8 sse4_1 - specialize vp8_filter_block2d_16x16_8 sse4_1 + specialize vp8_filter_block2d_4x4_8 sse4_1 sse2 + specialize vp8_filter_block2d_8x4_8 sse4_1 sse2 + specialize vp8_filter_block2d_8x8_8 sse4_1 sse2 + specialize vp8_filter_block2d_16x16_8 sse4_1 sse2 fi diff --git a/vp8/common/x86/filter_sse2.c b/vp8/common/x86/filter_sse2.c new file mode 100644 index 000000000..fe57b4e0b --- /dev/null +++ b/vp8/common/x86/filter_sse2.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> // for alignment checks +#include <emmintrin.h> // SSE2 +#include "vp8/common/filter.h" +#include "vpx_ports/mem.h" // for DECLARE_ALIGNED +#include "vpx_rtcd.h" + +// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is +// just a quick partial snapshot so that other can already use some +// speedup. +// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap +// filtering. +// TODO(cd): Add some comments, better variable naming. +// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum +// of positive above 128), or have higher precision filter +// coefficients. 
// Rounding bias: VP8_FILTER_WEIGHT / 2 in each 32-bit lane.  It is added to
// the filter accumulator before the arithmetic shift by VP8_FILTER_SHIFT so
// the result is rounded to nearest rather than truncated.
DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
  VP8_FILTER_WEIGHT >> 1,
  VP8_FILTER_WEIGHT >> 1,
  VP8_FILTER_WEIGHT >> 1,
  VP8_FILTER_WEIGHT >> 1,
};

// Creating a macro to do more than four pixels at once to hide instruction
// latency is actually slower :-(
//
// Filters four consecutive output pixels of one row with an 8-tap filter.
// `result` receives four rounded, shifted 32-bit filter sums.  The macro
// relies on `zero`, `rounding` and the replicated coefficient pairs
// fil01/fil23/fil45/fil67 being in scope at the expansion site.
#define DO_FOUR_PIXELS(result, src_ptr, offset) \
  { \
  /* Do shifted loads to achieve the required shuffles through unpacking */ \
  const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
  const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
  const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
  const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
  const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
  const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
  /* Shift by 4 bytes through shuffle to get additional shifted loads */ \
  const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
  const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
  const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
  const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
  const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
  const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
  /* multiply accumulate them */ \
  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
  const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
  const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
  __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
  /* round to nearest, then scale down by VP8_FILTER_SHIFT */ \
  mad_all = _mm_add_epi32(mad_all, rounding); \
  result = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); \
  }

// Apply a separable 2-D 8-tap interpolation filter to produce one 4x4 block
// of output pixels.
//
// src_ptr            points at the source pixel corresponding to the output
//                    origin; the filter footprint extends (kInterp_Extend - 1)
//                    rows/columns before it and kInterp_Extend after, i.e. an
//                    11x11 source region (4 + 8 - 1 in each dimension).
// src_stride         source pitch in bytes.
// HFilter_aligned16  8 horizontal filter taps as 16-bit values; each pass
//                    replicates consecutive tap pairs per 32-bit lane for
//                    _mm_madd_epi16.  Must be 16-byte aligned.
// VFilter_aligned16  8 vertical filter taps, same layout and alignment.
// dst_ptr/dst_stride destination 4x4 block and its pitch.
//
// The horizontal pass filters all 11 needed source rows into byte-packed
// intermediates, which are then transposed so the vertical pass can reuse
// the same row-oriented DO_FOUR_PIXELS kernel.
void vp8_filter_block2d_4x4_8_sse2
(
 const unsigned char *src_ptr, const unsigned int src_stride,
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 unsigned char *dst_ptr, unsigned int dst_stride
) {
  __m128i intermediateA, intermediateB, intermediateC;

  const int kInterp_Extend = 4;

  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);

  // check alignment
  // NOTE(review): casting a pointer to (long) truncates on LLP64 targets
  // (e.g. 64-bit Windows); (uintptr_t) would be the portable spelling.
  // Code left unchanged here.
  assert(0 == ((long)HFilter_aligned16)%16);
  assert(0 == ((long)VFilter_aligned16)%16);

  {
    __m128i transpose3_0;
    __m128i transpose3_1;
    __m128i transpose3_2;
    __m128i transpose3_3;

    // Horizontal pass (src -> intermediate).
    {
      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
      // Broadcast each pair of adjacent filter coefficients across all four
      // 32-bit lanes, matching the interleaved pixel layout fed to
      // _mm_madd_epi16 in DO_FOUR_PIXELS.
      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
      // Back up to the top-left corner of the filter footprint.
      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

      {
        __m128i mad_all0;
        __m128i mad_all1;
        __m128i mad_all2;
        __m128i mad_all3;
        // Rows 0-3 of the 11 source rows needed for 4 output rows.
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
        // --
        src_ptr += src_stride*4;
        // -- rows 4-7
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
        // --
        src_ptr += src_stride*4;
        // -- rows 8-10: only 3 more rows are needed (4 + 8 - 1 = 11 total),
        //    so the last row is duplicated by the pack below and ignored.
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
      }
    }

    // Transpose result (intermediate -> transpose3_x)
    {
      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
      // _mm_shuffle_ps on bit-cast integer data selects 32-bit lanes; used
      // here as a two-source 4-lane shuffle that SSE2 integer ops lack.
      transpose3_0 = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
                       _mm_castsi128_ps(transpose2_2),
                       _MM_SHUFFLE(1, 0, 1, 0)));
      transpose3_1 = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
                       _mm_castsi128_ps(transpose2_2),
                       _MM_SHUFFLE(3, 2, 3, 2)));
      transpose3_2 = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
                       _mm_castsi128_ps(transpose2_3),
                       _MM_SHUFFLE(1, 0, 1, 0)));
      transpose3_3 = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
                       _mm_castsi128_ps(transpose2_3),
                       _MM_SHUFFLE(3, 2, 3, 2)));
      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
    }

    // Vertical pass (transpose3_x -> dst).
    {
      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
      // Broadcast vertical tap pairs, same layout as the horizontal pass.
      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
      __m128i col0, col1, col2, col3;
      // Spill each transposed column to memory so DO_FOUR_PIXELS can do its
      // byte-offset loads from it.
      DECLARE_ALIGNED(16, unsigned char, temp[32]);
      {
        _mm_store_si128((__m128i *)temp, transpose3_0);
        DO_FOUR_PIXELS(col0, temp, 0);
      }
      {
        _mm_store_si128((__m128i *)temp, transpose3_1);
        DO_FOUR_PIXELS(col1, temp, 0);
      }
      {
        _mm_store_si128((__m128i *)temp, transpose3_2);
        DO_FOUR_PIXELS(col2, temp, 0);
      }
      {
        _mm_store_si128((__m128i *)temp, transpose3_3);
        DO_FOUR_PIXELS(col3, temp, 0);
      }
      // transpose back to row order for the stores below
      {
        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
        col0 = _mm_unpacklo_epi64(T0, T1);
        col1 = _mm_unpackhi_epi64(T0, T1);
        col2 = _mm_unpacklo_epi64(T2, T3);
        col3 = _mm_unpackhi_epi64(T2, T3);
      }
      // saturate to 8 bit
      {
        col0 = _mm_packs_epi32(col0, col0);
        col0 = _mm_packus_epi16(col0, col0);
        col1 = _mm_packs_epi32(col1, col1);
        col1 = _mm_packus_epi16(col1, col1);
        col2 = _mm_packs_epi32 (col2, col2);
        col2 = _mm_packus_epi16(col2, col2);
        col3 = _mm_packs_epi32 (col3, col3);
        col3 = _mm_packus_epi16(col3, col3);
      }
      // store: one 4-byte row per output line.
      // NOTE(review): writing through (unsigned int *) into a byte buffer is
      // a potentially unaligned, type-punned store; it works on x86 but is
      // formally strict-aliasing UB — confirm against project conventions.
      {
        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
      }
    }
  }
}

// 8-wide x 4-tall filter: tiles the 4x4 kernel.  (Signature continues on the
// following line of this file.)
void vp8_filter_block2d_8x4_8_sse2
(
 const unsigned char *src_ptr, const unsigned int src_stride,
 const short
 *HFilter_aligned16, const short *VFilter_aligned16,
 unsigned char *dst_ptr, unsigned int dst_stride
) {
  int j;
  // Cover the 8x4 output with two 4x4 tiles laid out horizontally; all the
  // real filtering work happens in vp8_filter_block2d_4x4_8_sse2.
  for (j=0; j<8; j+=4) {
    vp8_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
                                  HFilter_aligned16, VFilter_aligned16,
                                  dst_ptr + j, dst_stride);
  }
}

// Apply the 8-tap separable filter to an 8x8 block by tiling the 4x4 kernel
// in a 2x2 grid.  Parameters match vp8_filter_block2d_4x4_8_sse2.
void vp8_filter_block2d_8x8_8_sse2
(
 const unsigned char *src_ptr, const unsigned int src_stride,
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 unsigned char *dst_ptr, unsigned int dst_stride
) {
  int i, j;
  for (i=0; i<8; i+=4) {
    for (j=0; j<8; j+=4) {
      vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
                                    HFilter_aligned16, VFilter_aligned16,
                                    dst_ptr + j + i*dst_stride, dst_stride);
    }
  }
}

// Apply the 8-tap separable filter to a 16x16 block by tiling the 4x4 kernel
// in a 4x4 grid.  Parameters match vp8_filter_block2d_4x4_8_sse2.
void vp8_filter_block2d_16x16_8_sse2
(
 const unsigned char *src_ptr, const unsigned int src_stride,
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 unsigned char *dst_ptr, unsigned int dst_stride
) {
  int i, j;
  for (i=0; i<16; i+=4) {
    for (j=0; j<16; j+=4) {
      vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
                                    HFilter_aligned16, VFilter_aligned16,
                                    dst_ptr + j + i*dst_stride, dst_stride);
    }
  }
}
diff --git a/vp8/common/x86/filter_sse4.c b/vp8/common/x86/filter_sse4.c
index a037622e1..c461db173 100644
--- a/vp8/common/x86/filter_sse4.c
+++ b/vp8/common/x86/filter_sse4.c
@@ -25,9 +25,6 @@
 // TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
 //           of positive above 128), or have higher precision filter
 //           coefficients.
-// TODO(cd): Remove use of _mm_extract_epi32 and _mm_extract_epi64, to not
-//           require SSE4.1
-// TODO(cd): Remove use of _mm_shuffle_epi8 to not require SSSE3
 DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
   0x00, 0x01,