9 files changed, 118 insertions, 717 deletions
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 98619bb30..71bf24c9f 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -17,6 +17,7 @@
     EXPORT  |vp9_h_predictor_16x16_neon|
     EXPORT  |vp9_h_predictor_32x32_neon|
     EXPORT  |vp9_tm_predictor_4x4_neon|
+    EXPORT  |vp9_tm_predictor_8x8_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -328,8 +329,78 @@ loop_h
     vqshrun.s16         d1, q2, #0
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
-
     bx                  lr
     ENDP                ; |vp9_tm_predictor_4x4_neon|
 
+;void vp9_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst            8x8 output block, advanced by y_stride per row
+; r1  ptrdiff_t y_stride      byte offset between consecutive output rows
+; r2  const uint8_t *above    8 bytes read from above[0..7], plus above[-1]
+; r3  const uint8_t *left     8 bytes read from left[0..7], one per output row
+
+|vp9_tm_predictor_8x8_neon| PROC
+    ; Load ytop_left = above[-1] and broadcast it to all 8 byte lanes of d0
+    sub                 r12, r2, #1
+    ldrb                r12, [r12]
+    vdup.u8             d0, r12
+
+    ; Load the 8 above pixels into d2
+    vld1.64             {d2}, [r2]
+
+    ; Compute above - ytop_left, widened to 16-bit lanes in q3 (may be negative)
+    vsubl.u8            q3, d2, d0
+
+    ; Per row: dst[row][c] = saturate_u8(left[row] + (above[c] - ytop_left))
+    ; 1st row and 2nd row (r2 is reused as scratch; above is already in q3)
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12         ; left[row] in every 16-bit lane
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0      ; narrow with unsigned saturation to 8 bits
+    vqshrun.s16         d1, q2, #0
+    vst1.64             {d0}, [r0], r1  ; store 8 pixels, then dst += y_stride
+    vst1.64             {d1}, [r0], r1
+
+    ; 3rd row and 4th row
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0
+    vqshrun.s16         d1, q2, #0
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+
+    ; 5th row and 6th row
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0
+    vqshrun.s16         d1, q2, #0
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+
+    ; 7th row and 8th row
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0
+    vqshrun.s16         d1, q2, #0
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+    bx                  lr
+    ENDP                ; |vp9_tm_predictor_8x8_neon|
+
     END
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index ff20553d6..ca42090c1 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -85,7 +85,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
   int mi_size;
 
   if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-                               VP9BORDERINPIXELS, NULL, NULL, NULL) < 0)
+                               VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
     goto fail;
 
   set_mb_mi(cm, aligned_width, aligned_height);
@@ -154,7 +154,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
   for (i = 0; i < cm->fb_count; i++) {
     cm->fb_idx_ref_cnt[i] = 0;
     if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y,
-                               VP9BORDERINPIXELS) < 0)
+                               VP9_ENC_BORDER_IN_PIXELS) < 0)
       goto fail;
   }
 
@@ -167,7 +167,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
   }
 
   if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-                             VP9BORDERINPIXELS) < 0)
+                             VP9_ENC_BORDER_IN_PIXELS) < 0)
     goto fail;
 
   set_mb_mi(cm, aligned_width, aligned_height);
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 21e2b16a4..ad78b0dc4 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -122,7 +122,6 @@ typedef struct {
   TX_SIZE tx_size;
   int_mv mv[2];                // for each reference frame used
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-  int_mv best_mv[2];
 
   uint8_t mode_context[MAX_REF_FRAMES];
 
@@ -242,6 +241,9 @@ typedef struct macroblockd {
   /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
 
+  /* mc buffer */
+  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+
   int lossless;
   /* Inverse transform function pointers. */
   void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index f43a85f14..ba162fd20 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -112,8 +112,8 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
 // This macro is currently unused but may be used by certain implementations
 #define MAXBAND_INDEX 21
 
-extern const uint8_t vp9_coefband_trans_8x8plus[1024];
-extern const uint8_t vp9_coefband_trans_4x4[16];
+extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]);
+extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]);
 
 static const uint8_t *get_band_translate(TX_SIZE tx_size) {
   return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index 06adbabaa..cd89390d5 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -32,8 +32,10 @@ static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                        mv_ref_list, -1, mi_row, mi_col);
 }
 
-#define LEFT_TOP_MARGIN     ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3)
-#define RIGHT_BOTTOM_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3)
+#define LEFT_TOP_MARGIN     ((VP9_ENC_BORDER_IN_PIXELS  \
+                            - VP9_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS  \
+                            - VP9_INTERP_EXTEND) << 3)
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 397f446f3..b5a9248c3 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -20,15 +20,16 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
-static void build_mc_border(const uint8_t *src, uint8_t *dst, int stride,
-                             int x, int y, int b_w, int b_h, int w, int h) {
+static void build_mc_border(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            int x, int y, int b_w, int b_h, int w, int h) {
   // Get a pointer to the start of the real data for this row.
-  const uint8_t *ref_row = src - x - y * stride;
+  const uint8_t *ref_row = src - x - y * src_stride;
 
   if (y >= h)
-    ref_row += (h - 1) * stride;
+    ref_row += (h - 1) * src_stride;
   else if (y > 0)
-    ref_row += y * stride;
+    ref_row += y * src_stride;
 
   do {
     int right = 0, copy;
@@ -49,16 +50,16 @@ static void build_mc_border(const uint8_t *src, uint8_t *dst, int stride,
       memset(dst, ref_row[0], left);
 
     if (copy)
-      memmove(dst + left, ref_row + x + left, copy);
+      memcpy(dst + left, ref_row + x + left, copy);
 
     if (right)
       memset(dst + left + copy, ref_row[w - 1], right);
 
-    dst += stride;
+    dst += dst_stride;
     ++y;
 
     if (y > 0 && y < h)
-      ref_row += stride;
+      ref_row += src_stride;
   } while (--b_h);
 }
 
@@ -281,7 +282,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
 
     MV32 scaled_mv;
     int xs, ys, x0, y0, x0_16, y0_16, x1, y1, frame_width,
-        frame_height, subpel_x, subpel_y;
+        frame_height, subpel_x, subpel_y, buf_stride;
     uint8_t *ref_frame, *buf_ptr;
     const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
 
@@ -308,7 +309,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
       scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
       xs = sf->x_step_q4;
       ys = sf->y_step_q4;
-      // Get block position in the scaled reference frame.
+      // Map the top left corner of the block into the reference frame.
       x0 = sf->scale_value_x(x0, sf);
       y0 = sf->scale_value_y(y0, sf);
       x0_16 = sf->scale_value_x(x0_16, sf);
@@ -321,7 +322,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     subpel_x = scaled_mv.col & SUBPEL_MASK;
     subpel_y = scaled_mv.row & SUBPEL_MASK;
 
-    // Get reference block top left coordinate.
+    // Calculate the top left corner of the best matching block in the reference frame.
     x0 += scaled_mv.col >> SUBPEL_BITS;
     y0 += scaled_mv.row >> SUBPEL_BITS;
     x0_16 += scaled_mv.col;
@@ -329,24 +330,28 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
 
     // Get reference block bottom right coordinate.
     x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
-    y1 = ((y0_16 + (h - 1) * xs) >> SUBPEL_BITS) + 1;
+    y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
 
     // Get reference block pointer.
     buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+    buf_stride = pre_buf->stride;
 
-    // Do border extension if there is motion or
+    // Do border extension if there is motion or the
     // width/height is not a multiple of 8 pixels.
     if (scaled_mv.col || scaled_mv.row ||
         (frame_width & 0x7) || (frame_height & 0x7)) {
+      int x_pad = 0, y_pad = 0;
 
-      if (subpel_x) {
+      if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
         x0 -= VP9_INTERP_EXTEND - 1;
         x1 += VP9_INTERP_EXTEND;
+        x_pad = 1;
       }
 
-      if (subpel_y) {
+      if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
         y0 -= VP9_INTERP_EXTEND - 1;
         y1 += VP9_INTERP_EXTEND;
+        y_pad = 1;
       }
 
       // Skip border extension if block is inside the frame.
@@ -354,12 +359,14 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
           y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
         uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
         // Extend the border.
-        build_mc_border(buf_ptr1, buf_ptr1, pre_buf->stride, x0, y0, x1 - x0,
-                        y1 - y0, frame_width, frame_height);
+        build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0,
+                        x0, y0, x1 - x0, y1 - y0, frame_width, frame_height);
+        buf_stride = x1 - x0;
+        buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
       }
     }
 
-    inter_predictor(buf_ptr, pre_buf->stride, dst, dst_buf->stride, subpel_x,
+    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
                     subpel_y, sf, w, h, ref, &xd->subpix, xs, ys);
   }
 }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 8f858f47c..caa6947b3 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -96,7 +96,7 @@ prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint
 specialize vp9_v_predictor_8x8 $sse_x86inc neon
 
 prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_8x8 $sse2_x86inc dspr2
+specialize vp9_tm_predictor_8x8 $sse2_x86inc neon dspr2
 
 prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_predictor_8x8 $sse_x86inc dspr2
@@ -742,7 +742,7 @@ specialize vp9_full_search_sad sse3 sse4_1
 vp9_full_search_sad_sse3=vp9_full_search_sadx3
 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
 
-prototype int vp9_refining_search_sad "struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
+prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
 specialize vp9_refining_search_sad sse3
 vp9_refining_search_sad_sse3=vp9_refining_search_sadx4
 
@@ -756,9 +756,5 @@ specialize vp9_full_range_search
 prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"
 specialize vp9_temporal_filter_apply sse2
 
-prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction"
-specialize vp9_yv12_copy_partial_frame
-
-
 fi
 # end encoder functions
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f4f758297..f95423678 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -23,105 +23,20 @@ typedef void filter8_1dfunction (
   const short *filter
 );
 
-#if (HAVE_SSSE3)
+#if HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-#if (ARCH_X86_64)
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  /* Ensure the filter can be compressed to int16_t. */
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-  }
-}
 
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                         filter_x, x_step_q4, filter_y, y_step_q4,
-                         w, h);
-  }
-}
-
-#else
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
@@ -198,7 +113,6 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                          w, h);
   }
 }
-#endif
 
 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 303fced3b..000000000
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <tmmintrin.h>
-#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_4_h8[16])= {0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_4_h8[16])= {4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_global[16])= {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_global[16])= {2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt3_global[16])= {4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt4_global[16])= {6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
-
-
-
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    __m128i addFilterReg64, filtersReg, srcReg, minReg;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter into the first lane
-    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-    // duplicate only the third 16 bit in the filter into the first lane
-    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-    // duplicate only the seconds 16 bits in the filter into the second lane
-    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-    // duplicate only the forth 16 bits in the filter into the second lane
-    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-    // loading the local filters
-    thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
-    forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // extract the higher half of the lane
-        srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
-        srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
-
-        minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-        // add and saturate all the results together
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-        srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bits
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pixels_per_line;
-
-        // save only 4 bytes
-        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    __m128i addFilterReg64, filtersReg, minReg;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits (first and second byte)
-    // across 128 bit register
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits (third and forth byte)
-    // across 128 bit register
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits (fifth and sixth byte)
-    // across 128 bit register
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits (seventh and eighth byte)
-    // across 128 bit register
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
-        srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-        // add and saturate all the results together
-        minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-
-        srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bits
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pixels_per_line;
-
-       // save only 8 bytes
-        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pixels_per_line,
-                                          unsigned char *output_ptr,
-                                          unsigned int output_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
-    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits (first and second byte)
-    // across 128 bit register
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits (third and forth byte)
-    // across 128 bit register
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits (fifth and sixth byte)
-    // across 128 bit register
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits (seventh and eighth byte)
-    // across 128 bit register
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
-        // reading the next 16 bytes.
-        // (part of it was being read by earlier read)
-        srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-        // filter the source buffer
-        srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // add and saturate the results together
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-        // add and saturate the results together
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
-
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
-        srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
-        // shrink to 8 bit each 16 bits, the first lane contain the first
-        // convolve result and the second lane contain the second convolve
-        // result
-        srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
-        src_ptr+=src_pixels_per_line;
-
-        // save 16 bytes
-        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-
-
-void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, firstFilters, secondFilters;
-    __m128i minReg, srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter into the first lane
-    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-    // duplicate only the second 16 bits in the filter into the second lane
-    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-    // duplicate only the third 16 bits in the filter into the first lane
-    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-    // duplicate only the forth 16 bits in the filter into the second lane
-    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 4 byte
-        srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0]));
-        // load the next 4 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0]));
-
-        // merge the result together
-        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-
-
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0]));
-        srcRegFilt3 =  _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0]));
-
-        // merge the result together
-        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-
-        srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0]));
-        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0]));
-
-        // merge the result together
-        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-        srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2);
-
-        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0]));
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0]));
-
-        // merge the result together
-        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2);
-        srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-
-        // extract the second lane of the 128 bit register
-        srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
-
-        // add and saturate the results together
-        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_srli_si128(srcRegFilt3, 8));
-        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save only 4 bytes convolve result
-        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
-
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits in the filter
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits in the filter
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits in the filter
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 8 bytes
-        srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
-        // load the next 8 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
-        srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
-        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
-
-        // merge the result together
-        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
-        // load the next 8 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
-        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
-        srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
-        srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
-
-        // merge the result together
-        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
-        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-        // add and saturate the results together
-        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save only 8 bytes convolve result
-        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
-
-
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int out_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits in the filter
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits in the filter
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits in the filter
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 16 bytes
-        srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
-        // load the next 16 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
-        srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
-
-        // merge the result together
-        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-        srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
-        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-        // load the next 16 bytes in stride of two/three src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
-        // merge the result together
-        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-        srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
-        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
-
-        // load the next 16 bytes in stride of four/five src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
-        // merge the result together
-        srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-        srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
-        srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-        _mm_min_epi16(srcRegFilt4, srcRegFilt7));
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_min_epi16(srcRegFilt6, srcRegFilt8));
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-        _mm_max_epi16(srcRegFilt4, srcRegFilt7));
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_max_epi16(srcRegFilt6, srcRegFilt8));
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits, the first lane contain the first
-        // convolve result and the second lane contain the second convolve
-        // result
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save 16 bytes convolve result
-        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}