13 files changed, 510 insertions, 753 deletions
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f4f758297..f95423678 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -23,105 +23,20 @@ typedef void filter8_1dfunction (
   const short *filter
 );
 
-#if (HAVE_SSSE3)
+#if HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-#if (ARCH_X86_64)
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  /* Ensure the filter can be compressed to int16_t. */
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-  }
-}
 
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                         filter_x, x_step_q4, filter_y, y_step_q4,
-                         w, h);
-  }
-}
-
-#else
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
@@ -198,7 +113,6 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                          w, h);
   }
 }
-#endif
 
 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 303fced3b..000000000
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <tmmintrin.h>
-#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_4_h8[16])= {0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_4_h8[16])= {4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_global[16])= {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_global[16])= {2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt3_global[16])= {4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt4_global[16])= {6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
-
-
-
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    __m128i addFilterReg64, filtersReg, srcReg, minReg;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter into the first lane
-    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-    // duplicate only the third 16 bit in the filter into the first lane
-    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-    // duplicate only the seconds 16 bits in the filter into the second lane
-    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-    // duplicate only the forth 16 bits in the filter into the second lane
-    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-    // loading the local filters
-    thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
-    forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // extract the higher half of the lane
-        srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
-        srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
-
-        minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-        // add and saturate all the results together
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-        srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bits
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pixels_per_line;
-
-        // save only 4 bytes
-        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    __m128i addFilterReg64, filtersReg, minReg;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits (first and second byte)
-    // across 128 bit register
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits (third and forth byte)
-    // across 128 bit register
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits (fifth and sixth byte)
-    // across 128 bit register
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits (seventh and eighth byte)
-    // across 128 bit register
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
-        srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-        // add and saturate all the results together
-        minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-
-        srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bits
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pixels_per_line;
-
-       // save only 8 bytes
-        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pixels_per_line,
-                                          unsigned char *output_ptr,
-                                          unsigned int output_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
-    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits (first and second byte)
-    // across 128 bit register
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits (third and forth byte)
-    // across 128 bit register
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits (fifth and sixth byte)
-    // across 128 bit register
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits (seventh and eighth byte)
-    // across 128 bit register
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
-        // reading the next 16 bytes.
-        // (part of it was being read by earlier read)
-        srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-        // filter the source buffer
-        srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // add and saturate the results together
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-        // add and saturate the results together
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
-
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
-        srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
-        // shrink to 8 bit each 16 bits, the first lane contain the first
-        // convolve result and the second lane contain the second convolve
-        // result
-        srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
-        src_ptr+=src_pixels_per_line;
-
-        // save 16 bytes
-        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-
-
-void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, firstFilters, secondFilters;
-    __m128i minReg, srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter into the first lane
-    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-    // duplicate only the second 16 bits in the filter into the second lane
-    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-    // duplicate only the third 16 bits in the filter into the first lane
-    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-    // duplicate only the forth 16 bits in the filter into the second lane
-    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 4 byte
-        srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0]));
-        // load the next 4 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0]));
-
-        // merge the result together
-        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-
-
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0]));
-        srcRegFilt3 =  _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0]));
-
-        // merge the result together
-        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-
-        srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0]));
-        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0]));
-
-        // merge the result together
-        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-        srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2);
-
-        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0]));
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0]));
-
-        // merge the result together
-        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2);
-        srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-
-        // extract the second lane of the 128 bit register
-        srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
-
-        // add and saturate the results together
-        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_srli_si128(srcRegFilt3, 8));
-        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save only 4 bytes convolve result
-        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
-
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits in the filter
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits in the filter
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits in the filter
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 8 bytes
-        srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
-        // load the next 8 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
-        srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
-        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
-
-        // merge the result together
-        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
-        // load the next 8 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
-        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
-        srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
-        srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
-
-        // merge the result together
-        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
-        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-        // add and saturate the results together
-        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save only 8 bytes convolve result
-        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
-
-
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int out_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits in the filter
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits in the filter
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits in the filter
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 16 bytes
-        srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
-        // load the next 16 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
-        srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
-
-        // merge the result together
-        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-        srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
-        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-        // load the next 16 bytes in stride of two/three src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
-        // merge the result together
-        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-        srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
-        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
-
-        // load the next 16 bytes in stride of four/five src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
-        // merge the result together
-        srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-        srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
-        srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-        _mm_min_epi16(srcRegFilt4, srcRegFilt7));
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_min_epi16(srcRegFilt6, srcRegFilt8));
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-        _mm_max_epi16(srcRegFilt4, srcRegFilt7));
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_max_epi16(srcRegFilt6, srcRegFilt8));
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits, the first lane contain the first
-        // convolve result and the second lane contain the second convolve
-        // result
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save 16 bytes convolve result
-        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index c81378153..2eb99ea15 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -357,9 +357,9 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
 }
 
 static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
-                             int_mv mv[2], int_mv best_mv[2],
-                             int_mv nearest_mv[2], int_mv near_mv[2],
-                             int is_compound, int allow_hp, vp9_reader *r) {
+                            int_mv mv[2], int_mv ref_mv[2],
+                            int_mv nearest_mv[2], int_mv near_mv[2],
+                            int is_compound, int allow_hp, vp9_reader *r) {
   int i;
   int ret = 1;
 
@@ -367,10 +367,10 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
     case NEWMV: {
       nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
                                             NULL : &cm->counts.mv;
-      read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv,
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv,
               &cm->fc.nmvc, mv_counts, allow_hp);
       if (is_compound)
-        read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
+        read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv,
                 &cm->fc.nmvc, mv_counts, allow_hp);
       for (i = 0; i < 1 + is_compound; ++i) {
         ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW;
@@ -380,17 +380,20 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
     }
     case NEARESTMV: {
       mv[0].as_int = nearest_mv[0].as_int;
-      if (is_compound) mv[1].as_int = nearest_mv[1].as_int;
+      if (is_compound)
+        mv[1].as_int = nearest_mv[1].as_int;
       break;
     }
     case NEARMV: {
       mv[0].as_int = near_mv[0].as_int;
-      if (is_compound) mv[1].as_int = near_mv[1].as_int;
+      if (is_compound)
+        mv[1].as_int = near_mv[1].as_int;
       break;
     }
     case ZEROMV: {
       mv[0].as_int = 0;
-      if (is_compound) mv[1].as_int = 0;
+      if (is_compound)
+        mv[1].as_int = 0;
       break;
     }
     default: {
@@ -423,7 +426,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
 
-  int_mv nearest[2], nearmv[2], best[2];
+  int_mv nearestmv[2], nearmv[2];
   int inter_mode_ctx, ref, is_compound;
 
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
@@ -452,8 +455,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
   if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]],
-                            &nearest[ref], &nearmv[ref]);
-      best[ref].as_int = nearest[ref].as_int;
+                            &nearestmv[ref], &nearmv[ref]);
     }
   }
 
@@ -466,6 +468,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];  // 1 or 2
     int idx, idy;
     int b_mode;
+    int_mv nearest_sub8x8[2], near_sub8x8[2];
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         int_mv block[2];
@@ -475,9 +478,11 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
         if (b_mode == NEARESTMV || b_mode == NEARMV)
           for (ref = 0; ref < 1 + is_compound; ++ref)
             vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col,
-                                          &nearest[ref], &nearmv[ref]);
+                                          &nearest_sub8x8[ref],
+                                          &near_sub8x8[ref]);
 
-        if (!assign_mv(cm, b_mode, block, best, nearest, nearmv,
+        if (!assign_mv(cm, b_mode, block, nearestmv,
+                       nearest_sub8x8, near_sub8x8,
                        is_compound, allow_hp, r)) {
           xd->corrupted |= 1;
           break;
@@ -499,9 +504,8 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   } else {
-    xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv,
-                                best, nearest, nearmv,
-                                is_compound, allow_hp, r);
+    xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv,
+                                nearestmv, nearmv, is_compound, allow_hp, r);
   }
 }
 
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index c011948b9..c1b95817f 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -116,6 +116,7 @@ struct macroblock {
   unsigned int source_variance;
   unsigned int pred_sse[MAX_REF_FRAMES];
   int pred_mv_sad[MAX_REF_FRAMES];
+  int mode_sad[MAX_REF_FRAMES][INTER_MODES + 1];
 
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index b0fae6593..a66b9fb8e 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -34,6 +34,7 @@
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_tokenize.h"
@@ -629,11 +630,11 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
   }
 }
 
-static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                          int mi_row, int mi_col,
-                          int *totalrate, int64_t *totaldist,
-                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                          int64_t best_rd) {
+static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+                             int mi_row, int mi_col,
+                             int *totalrate, int64_t *totaldist,
+                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                             int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1038,6 +1039,132 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
   return 0;
 }
 
+// TODO(jingning) This currently serves as a test framework for non-RD mode
+// decision. To be continued on optimizing the partition type decisions.
+//
+// Follows the partition implied by mi_8x8[0]->mbmi.sb_type: picks modes for
+// each sub-block and returns their summed rate / distortion in *rate / *dist.
+// When do_recon is nonzero the block is encoded via encode_sb() afterwards.
+static void pick_partition_type(VP9_COMP *cpi, const TileInfo *const tile,
+                                MODE_INFO **mi_8x8, TOKENEXTRA **tp,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                int *rate, int64_t *dist, int do_recon) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  const int mi_stride = cm->mode_info_stride;
+  const int num_8x8_subsize = (num_8x8_blocks_wide_lookup[bsize] >> 1);
+  PARTITION_TYPE partition = PARTITION_NONE;
+  BLOCK_SIZE subsize;
+  BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
+  int sub_rate[4] = {0};
+  int64_t sub_dist[4] = {0};
+  int mi_offset;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // outside the frame; *rate and *dist are left untouched
+
+  partition = partition_lookup[b_width_log2(bsize)][bs_type];
+  subsize = get_subsize(bsize, partition);
+
+  if (bsize < BLOCK_8X8) {
+    // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
+    // there is nothing to be done.
+    if (x->ab_index != 0) {
+      *rate = 0;
+      *dist = 0;
+      return;
+    }
+  } else {
+    *(get_sb_partitioning(x, bsize)) = subsize;
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, rate, dist,
+                       bsize, get_block_context(x, bsize), INT64_MAX);
+      break;
+    case PARTITION_HORZ:
+      *get_sb_index(x, subsize) = 0;
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
+                       subsize, get_block_context(x, subsize), INT64_MAX);
+      if (bsize >= BLOCK_8X8 && mi_row + num_8x8_subsize < cm->mi_rows) {
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *get_sb_index(x, subsize) = 1;
+        rd_pick_sb_modes(cpi, tile, mi_row + num_8x8_subsize, mi_col,
+                         &sub_rate[1], &sub_dist[1], subsize,
+                         get_block_context(x, subsize), INT64_MAX);
+      }
+      *rate = sub_rate[0] + sub_rate[1];
+      *dist = sub_dist[0] + sub_dist[1];
+      break;
+    case PARTITION_VERT:
+      *get_sb_index(x, subsize) = 0;
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
+                       subsize, get_block_context(x, subsize), INT64_MAX);
+      if (bsize >= BLOCK_8X8 && mi_col + num_8x8_subsize < cm->mi_cols) {
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *get_sb_index(x, subsize) = 1;
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + num_8x8_subsize,
+                         &sub_rate[1], &sub_dist[1], subsize,
+                         get_block_context(x, subsize), INT64_MAX);
+      }
+      *rate = sub_rate[0] + sub_rate[1];
+      *dist = sub_dist[0] + sub_dist[1];
+      break;
+    case PARTITION_SPLIT:
+      *get_sb_index(x, subsize) = 0;
+      pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, subsize,
+                          &sub_rate[0], &sub_dist[0], 0);
+
+      if ((mi_col + num_8x8_subsize) < cm->mi_cols) {
+        *get_sb_index(x, subsize) = 1;
+        pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize, tp,
+                            mi_row, mi_col + num_8x8_subsize, subsize,
+                            &sub_rate[1], &sub_dist[1], 0);
+      }
+
+      if ((mi_row + num_8x8_subsize) < cm->mi_rows) {
+        *get_sb_index(x, subsize) = 2;
+        pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize * mi_stride, tp,
+                            mi_row + num_8x8_subsize, mi_col, subsize,
+                            &sub_rate[2], &sub_dist[2], 0);
+      }
+
+      if ((mi_col + num_8x8_subsize) < cm->mi_cols &&
+          (mi_row + num_8x8_subsize) < cm->mi_rows) {
+        *get_sb_index(x, subsize) = 3;
+        mi_offset = num_8x8_subsize * mi_stride + num_8x8_subsize;
+        pick_partition_type(cpi, tile, mi_8x8 + mi_offset, tp,
+                            mi_row + num_8x8_subsize, mi_col + num_8x8_subsize,
+                            subsize, &sub_rate[3], &sub_dist[3], 0);
+      }
+
+      // Assign (not accumulate): the caller's *rate / *dist may be unset.
+      *rate = sub_rate[0] + sub_rate[1] + sub_rate[2] + sub_rate[3];
+      *dist = sub_dist[0] + sub_dist[1] + sub_dist[2] + sub_dist[3];
+
+      break;
+    default:
+      assert(0);
+  }
+
+  if (do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
+
+    // Check the projected output rate for this SB against its target
+    // and if necessary apply a Q delta using segmentation to get
+    // closer to the target.
+    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+      select_in_frame_q_segment(cpi, mi_row, mi_col,
+                                output_enabled, *rate);
+    }
+
+    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
+  }
+}
+
 static void rd_use_partition(VP9_COMP *cpi,
                              const TileInfo *const tile,
                              MODE_INFO **mi_8x8,
@@ -1117,8 +1244,8 @@ static void rd_use_partition(VP9_COMP *cpi,
         mi_row + (ms >> 1) < cm->mi_rows &&
         mi_col + (ms >> 1) < cm->mi_cols) {
       *(get_sb_partitioning(x, bsize)) = bsize;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
-                    get_block_context(x, bsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
+                       get_block_context(x, bsize), INT64_MAX);
 
       pl = partition_plane_context(cpi->above_seg_context,
                                    cpi->left_seg_context,
@@ -1133,13 +1260,15 @@ static void rd_use_partition(VP9_COMP *cpi,
 
   switch (partition) {
     case PARTITION_NONE:
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    bsize, get_block_context(x, bsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, bsize,
+                       get_block_context(x, bsize), INT64_MAX);
       break;
     case PARTITION_HORZ:
       *get_sb_index(x, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, subsize,
+                       get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
         int rt = 0;
@@ -1147,8 +1276,8 @@ static void rd_use_partition(VP9_COMP *cpi,
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
-                      get_block_context(x, subsize), INT64_MAX);
+        rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt,
+                         subsize, get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
           last_part_rate = INT_MAX;
           last_part_dist = INT_MAX;
@@ -1161,8 +1290,9 @@ static void rd_use_partition(VP9_COMP *cpi,
       break;
     case PARTITION_VERT:
       *get_sb_index(x, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, subsize,
+                       get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
         int rt = 0;
@@ -1170,8 +1300,8 @@ static void rd_use_partition(VP9_COMP *cpi,
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
-                      get_block_context(x, subsize), INT64_MAX);
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt,
+                         subsize, get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
           last_part_rate = INT_MAX;
           last_part_dist = INT_MAX;
@@ -1245,9 +1375,9 @@ static void rd_use_partition(VP9_COMP *cpi,
 
       save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-      pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
-                    split_subsize, get_block_context(x, split_subsize),
-                    INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
+                       split_subsize, get_block_context(x, split_subsize),
+                       INT64_MAX);
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
@@ -1611,8 +1741,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
 
   // PARTITION_NONE
   if (partition_none_allowed) {
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
-                  get_block_context(x, bsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
+                     get_block_context(x, bsize), best_rd);
     if (this_rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
         pl = partition_plane_context(cpi->above_seg_context,
@@ -1722,8 +1852,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         partition_none_allowed)
       get_block_context(x, subsize)->pred_filter_type =
           get_block_context(x, bsize)->mic.mbmi.interp_filter;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                  get_block_context(x, subsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+                     get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
 
     if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
@@ -1737,9 +1867,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           partition_none_allowed)
         get_block_context(x, subsize)->pred_filter_type =
             get_block_context(x, bsize)->mic.mbmi.interp_filter;
-      pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
-                    &this_dist, subsize, get_block_context(x, subsize),
-                    best_rd - sum_rd);
+      rd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
+                       &this_dist, subsize, get_block_context(x, subsize),
+                       best_rd - sum_rd);
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
@@ -1775,8 +1905,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         partition_none_allowed)
       get_block_context(x, subsize)->pred_filter_type =
           get_block_context(x, bsize)->mic.mbmi.interp_filter;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                  get_block_context(x, subsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+                     get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
     if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
       update_state(cpi, get_block_context(x, subsize), subsize, 0);
@@ -1789,9 +1919,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           partition_none_allowed)
         get_block_context(x, subsize)->pred_filter_type =
             get_block_context(x, bsize)->mic.mbmi.interp_filter;
-      pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
-                    &this_dist, subsize, get_block_context(x, subsize),
-                    best_rd - sum_rd);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
+                       &this_dist, subsize, get_block_context(x, subsize),
+                       best_rd - sum_rd);
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
@@ -1862,8 +1992,8 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
   if ((mi_row + (ms >> 1) < cm->mi_rows) &&
       (mi_col + (ms >> 1) < cm->mi_cols)) {
     cpi->set_ref_frame_mask = 1;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
-                  get_block_context(x, BLOCK_64X64), INT64_MAX);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
+                     get_block_context(x, BLOCK_64X64), INT64_MAX);
     pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                  mi_row, mi_col, BLOCK_64X64);
     r += x->partition_cost[pl][PARTITION_NONE];
@@ -1875,6 +2005,34 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
   restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
 }
 
+// Alternative to encode_sb_row() built on pick_partition_type(); currently
+// compiled out in encode_frame_internal(). Codes every SB in one tile row.
+static void encode_sb_row_rt(VP9_COMP *cpi, const TileInfo *const tile,
+                             int mi_row, TOKENEXTRA **tp) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_col;
+
+  cpi->sf.always_this_block_size = BLOCK_8X8;
+  // Initialize the left context for the new SB row
+  vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
+  vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
+
+  // Code each SB in the row
+  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+       mi_col += MI_BLOCK_SIZE) {
+    int dummy_rate = 0;      // zeroed: pick_partition_type() may accumulate
+    int64_t dummy_dist = 0;  // into these or return without writing them
+    const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+    MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
+
+    vp9_zero(cpi->mb.pred_mv);
+    set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+    set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+    pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+                        &dummy_rate, &dummy_dist, 1);
+  }
+}
+
 static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
                           int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2101,7 +2259,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
           vp9_tile_init(&tile, cm, tile_row, tile_col);
           for (mi_row = tile.mi_row_start;
                mi_row < tile.mi_row_end; mi_row += 8)
+#if 1
             encode_sb_row(cpi, &tile, mi_row, &tp);
+#else
+            encode_sb_row_rt(cpi, &tile, mi_row, &tp);
+#endif
 
           cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 538599d58..0a5af18cb 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1965,7 +1965,10 @@ void vp9_get_one_pass_params(VP9_COMP *cpi) {
        cpi->rc.frames_to_key == 0 ||
        (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
     cm->frame_type = KEY_FRAME;
+    cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
+                                    cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
+    cpi->rc.kf_boost = 300;
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -1982,7 +1985,10 @@ void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) {
       cpi->rc.frames_to_key == 0 ||
       (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
     cm->frame_type = KEY_FRAME;
+    cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
+                                    cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
+    cpi->rc.kf_boost = 300;
   } else {
     cm->frame_type = INTER_FRAME;
   }
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
new file mode 100644
index 000000000..17d1f5984
--- /dev/null
+++ b/vp9/encoder/vp9_pickmode.c
@@ -0,0 +1,230 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "vp9/common/vp9_pragmas.h"
+#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_treewriter.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_variance.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_common.h"
+
+// Full-pixel motion search against reference mbmi->ref_frame[0]. Stores the
+// best vector (in 1/8 pel units) in *tmp_mv and its bit cost in *rate_mv,
+// and returns the SAD at that vector. If show_frame is set and some
+// reference's pred_mv_sad is below 1/8 of this one's, *tmp_mv is set to
+// INVALID_MV and INT_MAX is returned; *rate_mv is not written in that case.
+static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    const TileInfo *const tile,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv, int *rate_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  int bestsme = INT_MAX;
+  int further_steps, step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+  int ref = mbmi->ref_frame[0];
+  int_mv ref_mv = mbmi->ref_mvs[ref][0];
+  int i;
+
+  int tmp_col_min = x->mv_col_min;
+  int tmp_col_max = x->mv_col_max;
+  int tmp_row_min = x->mv_row_min;
+  int tmp_row_max = x->mv_row_max;
+
+  int buf_offset;
+  int stride = xd->plane[0].pre[0].stride;
+
+  YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref);
+
+  // Bail out before touching any shared state, so that neither the
+  // prediction planes nor the MV search limits need restoring on this path.
+  for (i = LAST_FRAME; i <= ALTREF_FRAME && cpi->common.show_frame; ++i) {
+    if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+      tmp_mv->as_int = INVALID_MV;
+      return INT_MAX;
+    }
+  }
+
+  if (scaled_ref_frame) {
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  vp9_set_mv_search_range(x, &ref_mv.as_mv);
+
+  // TODO(jingning) exploiting adaptive motion search control in non-RD
+  // mode decision too.
+  step_param = 6;
+  further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+  mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
+
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                   sadpb, further_steps, 1,
+                                   &cpi->fn_ptr[bsize],
+                                   &ref_mv.as_mv, tmp_mv);
+
+  // Restore the MV limits saved before vp9_set_mv_search_range().
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  if (scaled_ref_frame) {
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  // TODO(jingning) This step can be merged into full pixel search step in the
+  // re-designed log-diamond search
+  buf_offset = tmp_mv->as_mv.row * stride + tmp_mv->as_mv.col;
+
+  // Find sad for current vector.
+  bestsme = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                   xd->plane[0].pre[0].buf + buf_offset,
+                                   stride, 0x7fffffff);
+
+  // scale to 1/8 pixel resolution; multiply instead of shifting because
+  // left-shifting a negative component is undefined behavior in C
+  tmp_mv->as_mv.row = tmp_mv->as_mv.row * 8;
+  tmp_mv->as_mv.col = tmp_mv->as_mv.col * 8;
+
+  // calculate the bit cost on motion vector
+  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
+                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+
+  return bestsme;
+}
+
+// TODO(jingning) placeholder for inter-frame non-RD mode decision.
+// this needs various further optimizations. to be continued..
+// Picks the reference frame / inter mode pair minimizing RDCOST(mode rate,
+// SAD^2) and stores it in mbmi and ctx->mic. For now *returnrate is INT_MAX,
+// *returndistortion is never written and the return value is INT64_MAX.
+int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            int mi_row, int mi_col, int *returnrate,
+                            int64_t *returndistortion, BLOCK_SIZE bsize,
+                            PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  MB_PREDICTION_MODE this_mode;
+  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME best_ref_frame = NONE;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int64_t best_rd = INT64_MAX;
+  int64_t this_rd;
+
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  x->skip = cpi->active_map_enabled && x->active_ptr[0] == 0;
+
+  // initialize mode decisions
+  *returnrate = INT_MAX;
+  vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
+  mbmi->sb_type = bsize;
+  mbmi->ref_frame[0] = NONE;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->tx_size = MIN(max_txsize_lookup[bsize],
+                      tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+                             ref_frame, block_size, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    int rate_mv = 0;
+
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+      continue;
+
+    // Select prediction reference frames. full_pixel_motion_search() reads
+    // the reference to search from mbmi->ref_frame[0], so set it here.
+    xd->plane[0].pre[0] = yv12_mb[ref_frame][0];
+    mbmi->ref_frame[0] = ref_frame;
+
+    x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
+        full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+                                 &frame_mv[NEWMV][ref_frame], &rate_mv);
+
+    if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
+      continue;
+
+    clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
+    clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
+
+    for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+      int rate = x->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                   [INTER_OFFSET(this_mode)];
+      const int64_t sad = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)];
+      const int64_t dist = sad * sad;  // widen before squaring
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      if (this_rd < best_rd) {
+        best_rd = this_rd;
+        mbmi->mode = this_mode;
+        best_ref_frame = ref_frame;
+        mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+      }
+    }
+  }
+
+  // Commit the winning reference frame (still NONE if nothing was picked).
+  mbmi->ref_frame[0] = best_ref_frame;
+
+  // TODO(jingning) sub-pixel motion search, if NEWMV is chosen
+  // TODO(jingning) intra prediction search, if the best SAD is above a certain
+  // threshold.
+
+  // store mode decisions
+  ctx->mic = *xd->mi_8x8[0];
+  return INT64_MAX;
+}
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
new file mode 100644
index 000000000..32750fa69
--- /dev/null
+++ b/vp9/encoder/vp9_pickmode.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_PICKMODE_H_
+#define VP9_ENCODER_VP9_PICKMODE_H_
+
+#include "vp9/encoder/vp9_onyx_int.h"
+
+// Inter-frame mode decision for the non-RD path; see vp9_pickmode.c.
+int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            const struct TileInfo *const tile,
+                            int mi_row, int mi_col,
+                            int *returnrate,
+                            int64_t *returndistortion,
+                            BLOCK_SIZE bsize,
+                            PICK_MODE_CONTEXT *ctx);
+
+#endif  // VP9_ENCODER_VP9_PICKMODE_H_
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 72ab00f98..4d2d43a11 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -602,7 +602,7 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi,
                                             (last_boosted_q * 0.75));
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else if (!(cpi->pass == 0 && cm->current_video_frame == 0)) {
-      // not first frame of one pass
+      // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
       double q_val;
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5ba891597..fa6b362d4 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -37,8 +37,6 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
 
-#define INVALID_MV 0x80008000
-
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
@@ -113,14 +111,6 @@ const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
 static int rd_thresh_block_size_factor[BLOCK_SIZES] =
   {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
 
-#define RD_THRESH_MAX_FACT 64
-#define RD_THRESH_INC      1
-#define RD_THRESH_POW      1.25
-#define RD_MULT_EPB_RATIO  64
-
-#define MV_COST_WEIGHT      108
-#define MV_COST_WEIGHT_SUB  120
-
 static int raster_block_offset(BLOCK_SIZE plane_bsize,
                                int raster_block, int stride) {
   const int bw = b_width_log2(plane_bsize);
@@ -2133,8 +2123,10 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
     max_mv = MAX(max_mv,
                  MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
     // only need to check zero mv once
-    if (!this_mv.as_int && zero_seen)
+    if (!this_mv.as_int && zero_seen) {
+      x->mode_sad[ref_frame][i] = x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)];
       continue;
+    }
     zero_seen = zero_seen || !this_mv.as_int;
 
     row_offset = this_mv.as_mv.row >> 3;
@@ -2145,6 +2137,9 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
                                            ref_y_ptr, ref_y_stride,
                                            0x7fffffff);
+    x->mode_sad[ref_frame][i] = this_sad;
+    if (this_mv.as_int == 0)
+      x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)] = this_sad;
 
     // Note if it is the best so far.
     if (this_sad < best_sad) {
@@ -2153,6 +2148,12 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
+  if (!zero_seen)
+    x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)] =
+        cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
+                                    ref_y_buffer, ref_y_stride,
+                                    0x7fffffff);
+
   // Note the index of the mv that worked best in the reference list.
   x->mv_best_ref_index[ref_frame] = best_index;
   x->max_mv_context[ref_frame] = max_mv;
@@ -2312,7 +2313,7 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
             frame_type, block_size);
 }
 
-static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
   YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
   int fb = get_ref_frame_idx(cpi, ref_frame);
   int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame);
@@ -2350,7 +2351,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   int tmp_row_min = x->mv_row_min;
   int tmp_row_max = x->mv_row_max;
 
-  YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref);
+  YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref);
 
   int_mv pred_mv[3];
   pred_mv[0] = mbmi->ref_mvs[ref][0];
@@ -2498,8 +2499,8 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
   int last_besterr[2] = {INT_MAX, INT_MAX};
   YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
-    get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
-    get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
   };
 
   for (ref = 0; ref < 2; ++ref) {
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 4b244a50a..696cf6b11 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -19,6 +19,16 @@
   (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
 #define QIDX_SKIP_THRESH     115
 
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC      1
+#define RD_THRESH_POW      1.25
+#define RD_MULT_EPB_RATIO  64
+
+#define MV_COST_WEIGHT      108
+#define MV_COST_WEIGHT_SUB  120
+
+#define INVALID_MV 0x80008000
+
 struct TileInfo;
 
 int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex);
@@ -36,6 +46,8 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                             int_mv frame_near_mv[MAX_REF_FRAMES],
                             struct buf_2d yv12_mb[4][MAX_MB_PLANE]);
 
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame);
+
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int64_t *d, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 478b45ac0..b1c029cba 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,9 +74,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-ifeq ($(ARCH_X86_64),yes)
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
-endif
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 0d79346fe..9ea0f549f 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -43,6 +43,7 @@ VP9_CX_SRCS-yes += encoder/vp9_psnr.h
 VP9_CX_SRCS-yes += encoder/vp9_quantize.h
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.h
+VP9_CX_SRCS-yes += encoder/vp9_pickmode.h
 VP9_CX_SRCS-yes += encoder/vp9_sadmxn.h
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
@@ -55,6 +56,7 @@ VP9_CX_SRCS-yes += encoder/vp9_psnr.c
 VP9_CX_SRCS-yes += encoder/vp9_quantize.c
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
+VP9_CX_SRCS-yes += encoder/vp9_pickmode.c
 VP9_CX_SRCS-yes += encoder/vp9_sad_c.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.h