9 files changed, 259 insertions, 132 deletions
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index cb7cc65d7..e27e2e64e 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -734,20 +734,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
 
     x->activity_sum = 0;
 
-#if 0
-    // Experimental rd code
-    // 2 Pass - Possibly set Rdmult based on last frame distortion + this frame target bits or other metrics
-    // such as cpi->rate_correction_factor that indicate relative complexity.
-    /*if ( cpi->pass == 2 && (cpi->last_frame_distortion > 0) && (cpi->target_bits_per_mb > 0) )
-    {
-        //x->rdmult = ((cpi->last_frame_distortion * 256)/cpi->common.MBs)/ cpi->target_bits_per_mb;
-        x->rdmult = (int)(cpi->RDMULT * cpi->rate_correction_factor);
-    }
-    else
-        x->rdmult = cpi->RDMULT; */
-    //x->rdmult = (int)(cpi->RDMULT * pow( (cpi->rate_correction_factor * 2.0), 0.75 ));
-#endif
-
     xd->mode_info_context->mbmi.mode = DC_PRED;
     xd->mode_info_context->mbmi.uv_mode = DC_PRED;
 
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 22b25e3af..ee461c610 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3093,9 +3093,6 @@ static int pick_frame_size(VP8_COMP *cpi)
         }
     }
 
-    // Note target_size in bits * 256 per MB
-    cpi->target_bits_per_mb = (cpi->this_frame_target * 256) / cpi->common.MBs;
-
     return 1;
 }
 static void set_quantizer(VP8_COMP *cpi, int Q)
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index c5dc0c194..05e8c4e6a 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -365,7 +365,6 @@ typedef struct
     int this_frame_target;
     int projected_frame_size;
     int last_q[2];                   // Separate values for Intra/Inter
-    int target_bits_per_mb;
 
     double rate_correction_factor;
     double key_frame_rate_correction_factor;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index e6c7c9ab3..d694d39fb 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -36,7 +36,6 @@
 #include "dct.h"
 #include "systemdependent.h"
 
-#define DIAMONDSEARCH 1
 #if CONFIG_RUNTIME_CPU_DETECT
 #define IF_RTCD(x)  (x)
 #else
@@ -46,19 +45,6 @@
 
 void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
 
-
-#define RDFUNC(RM,DM,R,D,target_rd) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
-/*int  RDFUNC( int RM,int DM, int R, int D, int target_r )
-{
-    int rd_value;
-
-    rd_value =  ( ((128+(R)*(RM)) >> 8) + (DM)*(D) );
-
-    return rd_value;
-}*/
-
-#define UVRDFUNC(RM,DM,R,D,target_r)  RDFUNC(RM,DM,R,D,target_r)
-
 #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
 
 #define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
@@ -223,8 +209,6 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
 {
     int q;
     int i;
-    int *thresh;
-    int threshmult;
     double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0;
     double rdconst = 3.00;
 
@@ -271,22 +255,6 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
     if (q < 8)
         q = 8;
 
-    if (cpi->ref_frame_flags == VP8_ALT_FLAG)
-    {
-        thresh      = &cpi->rd_threshes[THR_NEWA];
-        threshmult  = cpi->sf.thresh_mult[THR_NEWA];
-    }
-    else if (cpi->ref_frame_flags == VP8_GOLD_FLAG)
-    {
-        thresh      = &cpi->rd_threshes[THR_NEWG];
-        threshmult  = cpi->sf.thresh_mult[THR_NEWG];
-    }
-    else
-    {
-        thresh      = &cpi->rd_threshes[THR_NEWMV];
-        threshmult  = cpi->sf.thresh_mult[THR_NEWMV];
-    }
-
     if (cpi->RDMULT > 1000)
     {
         cpi->RDDIV = 1;
@@ -775,7 +743,7 @@ static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distort
     *rate       = rd_cost_mbuv(x);
     *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
 
-    return UVRDFUNC(x->rdmult, x->rddiv, *rate, *distortion, cpi->target_bits_per_mb);
+    return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
 int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion)
@@ -800,7 +768,7 @@ int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *ra
 
         distortion = vp8_get_mbuvrecon_error(IF_RTCD(&cpi->rtcd.variance), x);
 
-        this_rd = UVRDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
         if (this_rd < best_rd)
         {
@@ -1097,7 +1065,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
     // Segmentation method overheads
     rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
     rate += vp8_cost_mv_ref(SPLITMV, bsi->mdcounts);
-    this_segment_rd += RDFUNC(x->rdmult, x->rddiv, rate, 0, cpi->target_bits_per_mb);
+    this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
     br += rate;
 
     for (i = 0; i < label_count; i++)
@@ -1252,7 +1220,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
             labelyrate = rdcost_mbsegment_y(x, labels, i, ta_s, tl_s);
             rate += labelyrate;
 
-            this_rd = RDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
             if (this_rd < best_label_rd)
             {
@@ -1751,7 +1719,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     //int intermodecost[MAX_MODES];
 
     MB_PREDICTION_MODE uv_intra_mode;
-    int uvintra_eob = 0;
+
     int force_no_skip = 0;
 
     MV mvp;
@@ -1770,27 +1738,6 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
 
     ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);
 
-    // Experimental code
-    // Adjust the RD multiplier based on the best case distortion we saw in the most recently coded mb
-    //if ( (cpi->last_mb_distortion) > 0 && (cpi->target_bits_per_mb > 0) )
-    /*{
-        int tmprdmult;
-
-        //tmprdmult = (cpi->last_mb_distortion * 256) / ((cpi->av_per_frame_bandwidth*256)/cpi->common.MBs);
-        tmprdmult = (cpi->last_mb_distortion * 256) / cpi->target_bits_per_mb;
-        //tmprdmult = tmprdmult;
-
-        //if ( tmprdmult > cpi->RDMULT * 2 )
-        //  tmprdmult = cpi->RDMULT * 2;
-        //else if ( tmprdmult < cpi->RDMULT / 2 )
-        //  tmprdmult = cpi->RDMULT / 2;
-
-        //tmprdmult = (tmprdmult < 25) ? 25 : tmprdmult;
-
-        //x->rdmult = tmprdmult;
-
-    }*/
-
     // Special case treatment when GF and ARF are not sensible options for reference
     if (cpi->ref_frame_flags == VP8_LAST_FLAG)
     {
@@ -1820,12 +1767,6 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
     vp8_rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);
     uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
-    {
-        uvintra_eob = 0;
-
-        for (i = 16; i < 24; i++)
-            uvintra_eob += x->e_mbd.block[i].eob;
-    }
 
     for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
     {
@@ -2339,8 +2280,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                             distortion_uv = sse2;
 
                             disable_skip = 1;
-                            this_rd = RDFUNC(x->rdmult, x->rddiv, rate2,
-                                             distortion2, cpi->target_bits_per_mb);
+                            this_rd = RDCOST(x->rdmult, x->rddiv, rate2,
+                                             distortion2);
 
                             break;
                         }
@@ -2414,7 +2355,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                 }
             }
             // Calculate the final RD estimate for this mode
-            this_rd = RDFUNC(x->rdmult, x->rddiv, rate2, distortion2, cpi->target_bits_per_mb);
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
         }
 
         // Experimental debug code.
@@ -2442,8 +2383,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
             other_cost += ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
 
             /* Calculate the final y RD estimate for this mode */
-            best_yrd = RDFUNC(x->rdmult, x->rddiv, (rate2-rate_uv-other_cost),
-                              (distortion2-distortion_uv), cpi->target_bits_per_mb);
+            best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2-rate_uv-other_cost),
+                              (distortion2-distortion_uv));
 
             *returnrate = rate2;
             *returndistortion = distortion2;
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 2fffaa95f..396e3390d 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -36,36 +36,9 @@
 
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
 #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
-#define USE_FILTER_LUT 0         // use lookup table to improve filter
 
 #if VP8_TEMPORAL_ALT_REF
 
-#if USE_FILTER_LUT
-// for (strength = 0; strength <= 6; strength++) {
-//   for (delta = 0; delta <= 18; delta++) {
-//     float coeff = (3.0 * delta * delta) / pow(2, strength);
-//     printf("%3d", (int)roundf(coeff > 16 ? 0 : 16-coeff));
-//   }
-//   printf("\n");
-// }
-static int modifier_lut[7][19] =
-{
-    // Strength=0
-    {16, 13,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=1
-    {16, 15, 10,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=2
-    {16, 15, 13,  9,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=3
-    {16, 16, 15, 13, 10,  7,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=4
-    {16, 16, 15, 14, 13, 11,  9,  7,  4,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=5
-    {16, 16, 16, 15, 15, 14, 13, 11, 10,  8,  7,  5,  3,  0,  0,  0,  0,  0,  0},
-    // Strength=6
-    {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10,  9,  8,  7,  5,  4,  2,  1}
-};
-#endif
 static void vp8_temporal_filter_predictors_mb_c
 (
     MACROBLOCKD *x,
@@ -86,14 +59,11 @@ static void vp8_temporal_filter_predictors_mb_c
 
     if ((mv_row | mv_col) & 7)
     {
-//        vp8_sixtap_predict16x16_c(yptr, stride,
-//                                    mv_col & 7, mv_row & 7, &pred[0], 16);
         x->subpixel_predict16x16(yptr, stride,
                                     mv_col & 7, mv_row & 7, &pred[0], 16);
     }
     else
     {
-        //vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);
         RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);
     }
 
@@ -127,17 +97,13 @@ void vp8_temporal_filter_apply_c
     int strength,
     int filter_weight,
     unsigned int *accumulator,
-    unsigned int *count
+    unsigned short *count
 )
 {
     int i, j, k;
     int modifier;
     int byte = 0;
 
-#if USE_FILTER_LUT
-    int *lut = modifier_lut[strength];
-#endif
-
     for (i = 0,k = 0; i < block_size; i++)
     {
         for (j = 0; j < block_size; j++, k++)
@@ -146,11 +112,10 @@ void vp8_temporal_filter_apply_c
             int src_byte = frame1[byte];
             int pixel_value = *frame2++;
 
-#if USE_FILTER_LUT
-            modifier = abs(src_byte-pixel_value);
-            modifier = modifier>18 ? 0 : lut[modifier];
-#else
             modifier   = src_byte - pixel_value;
+            // This is an integer approximation of:
+            // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+            // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
             modifier  *= modifier;
             modifier  *= 3;
             modifier  += 1 << (strength - 1);
@@ -160,7 +125,6 @@ void vp8_temporal_filter_apply_c
                 modifier = 16;
 
             modifier = 16 - modifier;
-#endif
             modifier *= filter_weight;
 
             count[k] += modifier;
@@ -331,12 +295,12 @@ static void vp8_temporal_filter_iterate_c
     int MBs  = cpi->common.MBs;
     int mb_y_offset = 0;
     int mb_uv_offset = 0;
-    unsigned int accumulator[384];
-    unsigned int count[384];
+    DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
     MACROBLOCKD *mbd = &cpi->mb.e_mbd;
     YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
     unsigned char *dst1, *dst2;
-    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
+    DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16*16 + 8*8 + 8*8);
 
     // Save input state
     unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -366,7 +330,7 @@ static void vp8_temporal_filter_iterate_c
             int stride;
 
             vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
-            vpx_memset(count, 0, 384*sizeof(unsigned int));
+            vpx_memset(count, 0, 384*sizeof(unsigned short));
 
 #if ALT_REF_MC_ENABLED
             // Reduced search extent by 3 for 6-tap filter & smaller UMV border
diff --git a/vp8/encoder/temporal_filter.h b/vp8/encoder/temporal_filter.h
index 7b8c21c04..740037a85 100644
--- a/vp8/encoder/temporal_filter.h
+++ b/vp8/encoder/temporal_filter.h
@@ -22,9 +22,13 @@
      int strength, \
      int filter_weight, \
      unsigned int *accumulator, \
-     unsigned int *count \
+     unsigned short *count \
     )
 
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/temporal_filter_x86.h"
+#endif
+
 #ifndef vp8_temporal_filter_apply
 #define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
 #endif
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 000000000..0127b012e
--- /dev/null
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp8_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_size,       |  3
+;   int             strength,         |  4
+;   int             filter_weight,    |  5
+;   unsigned int   *accumulator,      |  6
+;   unsigned short *count)            |  7
+global sym(vp8_temporal_filter_apply_sse2)
+sym(vp8_temporal_filter_apply_sse2):
+
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define block_size    0
+    %define strength      16
+    %define filter_weight 32
+    %define rounding_bit  48
+    %define rbp_backup    64
+    %define stack_size    80
+    sub         rsp,           stack_size
+    mov         [rsp + rbp_backup], rbp
+    ; end prolog
+
+        mov         rdx,            arg(3)
+        mov         [rsp + block_size], rdx
+        movd        xmm6,            arg(4)
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(4) ; 16 - strength
+        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(6) ; accumulator
+        mov         rax,            arg(7) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(5) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0
+        punpcklwd   xmm0,           xmm0
+        movdqa      [rsp + filter_weight], xmm0
+
+        mov         rbp,            arg(1) ; stride
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        lea         rcx,            [rdx + 16*16*1]
+        cmp         dword ptr [rsp + block_size], 8
+        jne         temporal_filter_apply_load_16
+        lea         rcx,            [rdx + 8*8*1]
+
+temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         temporal_filter_apply_load_finished
+
+temporal_filter_apply_load_16:
+        movdqa      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3
+        pmullw      xmm0,           [GLOBAL(_const_3w)]
+        pmullw      xmm1,           [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+        paddw       xmm0,           [rsp + rounding_bit]
+        paddw       xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48]
+        ; += modifier * pixel_value; dword adds, the accumulator is unsigned int
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx
+        je          temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction
+        cmp         dword ptr [rsp + block_size], 16
+        je          temporal_filter_apply_load_16
+        jmp         temporal_filter_apply_load_8
+
+temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+    times 8 dw 3        ; eight words of 3, the "modifier *= 3" multiplier
+align 16
+_const_top_bit:
+    times 8 dw 1<<15    ; eight words of 0x8000, shifted right to form the rounding bit
+align 16
+_const_16w:
+    times 8 dw 16       ; eight words of 16, the minuend in "16 - modifier"
diff --git a/vp8/encoder/x86/temporal_filter_x86.h b/vp8/encoder/x86/temporal_filter_x86.h
new file mode 100644
index 000000000..2daa14018
--- /dev/null
+++ b/vp8/encoder/x86/temporal_filter_x86.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_TEMPORAL_FILTER_X86_H
+#define __INC_VP8_TEMPORAL_FILTER_X86_H
+
+#if HAVE_SSE2
+extern prototype_apply(vp8_temporal_filter_apply_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+// No runtime CPU detection: bind the generic name straight to the SSE2 version.
+#undef  vp8_temporal_filter_apply
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2
+
+#endif // !CONFIG_RUNTIME_CPU_DETECT
+
+#endif // HAVE_SSE2
+
+#endif // __INC_VP8_TEMPORAL_FILTER_X86_H
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 6e317e2a2..c7dffc443 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -309,6 +309,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
 
         /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
+
+        cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;
     }
 #endif