30 files changed, 1601 insertions, 808 deletions
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
index b606aaca0..919ef499a 100644
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -37,14 +37,15 @@ static void update_mode_info_border(MODE_INFO *mi, int rows, int cols)
 void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
 {
     int i;
-
     for (i = 0; i < NUM_YV12_BUFFERS; i++)
         vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
 
     vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+#if CONFIG_POSTPROC
     vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
     if (oci->post_proc_buffer_int_used)
         vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
+#endif
 
     vpx_free(oci->above_context);
     vpx_free(oci->mip);
@@ -97,6 +98,7 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
         return 1;
     }
 
+#if CONFIG_POSTPROC
     if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
     {
         vp8_de_alloc_frame_buffers(oci);
@@ -104,6 +106,9 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
     }
 
     oci->post_proc_buffer_int_used = 0;
+    vpx_memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+    vpx_memset((&oci->post_proc_buffer)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
+#endif
 
     oci->mb_rows = height >> 4;
     oci->mb_cols = width >> 4;
diff --git a/vp8/common/asm_com_offsets.c b/vp8/common/asm_com_offsets.c
index 5cf151980..ae22b5f6b 100644
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/asm_com_offsets.c
@@ -15,6 +15,10 @@
 #include "vpx_scale/yv12config.h"
 #include "vp8/common/blockd.h"
 
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif /* CONFIG_POSTPROC */
+
 BEGIN
 
 /* vpx_scale */
@@ -30,6 +34,11 @@ DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_b
 DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
 DEFINE(VP8BORDERINPIXELS_VAL,                   VP8BORDERINPIXELS);
 
+#if CONFIG_POSTPROC
+/* mfqe.c / filter_by_weight */
+DEFINE(MFQE_PRECISION_VAL,                      MFQE_PRECISION);
+#endif /* CONFIG_POSTPROC */
+
 END
 
 /* add asserts for any offset that is not supported by assembly code */
@@ -53,3 +62,10 @@ ct_assert(B_HU_PRED, B_HU_PRED == 9);
 /* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
 ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
 #endif
+
+#if HAVE_SSE2
+#if CONFIG_POSTPROC
+/* vp8_filter_by_weight16x16 and 8x8 */
+ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4)
+#endif /* CONFIG_POSTPROC */
+#endif /* HAVE_SSE2 */
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index c8d1bab7d..692f0ebd2 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -215,6 +215,12 @@ typedef struct macroblockd
     MODE_INFO *mode_info_context;
     int mode_info_stride;
 
+#if CONFIG_TEMPORAL_DENOISING
+    MB_PREDICTION_MODE best_sse_inter_mode;
+    int_mv best_sse_mv;
+    unsigned char need_to_clamp_best_mvs;
+#endif
+
     FRAME_TYPE frame_type;
 
     int up_available;
diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c
index 60a7ff262..8235f6e9f 100644
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -15,7 +15,7 @@
 
 typedef unsigned char uc;
 
-static __inline signed char vp8_signed_char_clamp(int t)
+static signed char vp8_signed_char_clamp(int t)
 {
     t = (t < -128 ? -128 : t);
     t = (t > 127 ? 127 : t);
@@ -24,9 +24,9 @@ static __inline signed char vp8_signed_char_clamp(int t)
 
 
 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char vp8_filter_mask(uc limit, uc blimit,
-                                     uc p3, uc p2, uc p1, uc p0,
-                                     uc q0, uc q1, uc q2, uc q3)
+static signed char vp8_filter_mask(uc limit, uc blimit,
+                            uc p3, uc p2, uc p1, uc p0,
+                            uc q0, uc q1, uc q2, uc q3)
 {
     signed char mask = 0;
     mask |= (abs(p3 - p2) > limit);
@@ -40,7 +40,7 @@ static __inline signed char vp8_filter_mask(uc limit, uc blimit,
 }
 
 /* is there high variance internal edge ( 11111111 yes, 00000000 no) */
-static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
+static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
 {
     signed char hev = 0;
     hev  |= (abs(p1 - p0) > thresh) * -1;
@@ -48,7 +48,7 @@ static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
     return hev;
 }
 
-static __inline void vp8_filter(signed char mask, uc hev, uc *op1,
+static void vp8_filter(signed char mask, uc hev, uc *op1,
         uc *op0, uc *oq0, uc *oq1)
 
 {
@@ -158,7 +158,7 @@ void vp8_loop_filter_vertical_edge_c
     while (++i < count * 8);
 }
 
-static __inline void vp8_mbfilter(signed char mask, uc hev,
+static void vp8_mbfilter(signed char mask, uc hev,
                            uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
 {
     signed char s, u;
@@ -279,7 +279,7 @@ void vp8_mbloop_filter_vertical_edge_c
 }
 
 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
+static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
 {
 /* Why does this cause problems for win32?
  * error C2143: syntax error : missing ';' before 'type'
@@ -289,7 +289,7 @@ static __inline signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q
     return mask;
 }
 
-static __inline void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
+static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
 {
     signed char vp8_filter, Filter1, Filter2;
     signed char p1 = (signed char) * op1 ^ 0x80;
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
new file mode 100644
index 000000000..84e336915
--- /dev/null
+++ b/vp8/common/mfqe.c
@@ -0,0 +1,271 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* MFQE: Multiframe Quality Enhancement
+ * In rate limited situations keyframes may cause significant visual artifacts
+ * commonly referred to as "popping." This file implements a postprocessing
+ * algorithm which blends data from the preceding frame when there is no
+ * motion and the q from the previous frame is lower which indicates that it is
+ * higher quality.
+ */
+
+#include "postproc.h"
+#include "variance.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+#include "vpx_scale/yv12config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+
+static void filter_by_weight(unsigned char *src, int src_stride,
+                             unsigned char *dst, int dst_stride,
+                             int block_size, int src_weight)
+{
+    int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+    int rounding_bit = 1 << (MFQE_PRECISION - 1);
+    int r, c;
+
+    for (r = 0; r < block_size; r++)
+    {
+        for (c = 0; c < block_size; c++)
+        {
+            dst[c] = (src[c] * src_weight +
+                      dst[c] * dst_weight +
+                      rounding_bit) >> MFQE_PRECISION;
+        }
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
+                                 unsigned char *dst, int dst_stride,
+                                 int src_weight)
+{
+    filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride,
+                               int src_weight)
+{
+    filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride,
+                               int src_weight)
+{
+    filter_by_weight(src, src_stride, dst, dst_stride, 4, src_weight);
+}
+
+static void apply_ifactor(unsigned char *y_src,
+                          int y_src_stride,
+                          unsigned char *y_dst,
+                          int y_dst_stride,
+                          unsigned char *u_src,
+                          unsigned char *v_src,
+                          int uv_src_stride,
+                          unsigned char *u_dst,
+                          unsigned char *v_dst,
+                          int uv_dst_stride,
+                          int block_size,
+                          int src_weight)
+{
+    if (block_size == 16)
+    {
+        vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+        vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+        vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+    }
+    else /* if (block_size == 8) */
+    {
+        vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+        vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+        vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+    }
+}
+
+static void multiframe_quality_enhance_block
+(
+    int blksize, /* Currently only values supported are 16 and 8 */
+    int qcurr,
+    int qprev,
+    unsigned char *y,
+    unsigned char *u,
+    unsigned char *v,
+    int y_stride,
+    int uv_stride,
+    unsigned char *yd,
+    unsigned char *ud,
+    unsigned char *vd,
+    int yd_stride,
+    int uvd_stride
+)
+{
+    static const unsigned char VP8_ZEROS[16]=
+    {
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+    };
+
+    int uvblksize = blksize >> 1;
+    int qdiff = qcurr - qprev;
+
+    int i;
+    unsigned char *up;
+    unsigned char *udp;
+    unsigned char *vp;
+    unsigned char *vdp;
+
+    unsigned int act, sad, thr, sse;
+
+    if (blksize == 16)
+    {
+        act = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, INT_MAX)+128)>>8;
+    }
+    else /* if (blksize == 8) */
+    {
+        act = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, INT_MAX)+32)>>6;
+    }
+
+    /* thr = qdiff/8 + log2(act) + log4(qprev) */
+    thr = (qdiff>>3);
+    while (act>>=1) thr++;
+    while (qprev>>=2) thr++;
+
+    if (sad < thr)
+    {
+        int ifactor = (sad << MFQE_PRECISION) / thr;
+        ifactor >>= (qdiff >> 5);
+
+        if (ifactor)
+        {
+            apply_ifactor(y, y_stride, yd, yd_stride,
+                          u, v, uv_stride,
+                          ud, vd, uvd_stride,
+                          blksize, ifactor);
+        }
+        /* else implicitly copy from previous frame */
+    }
+    else
+    {
+        if (blksize == 16)
+        {
+            vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
+            vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
+            vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
+        }
+        else /* if (blksize == 8) */
+        {
+            vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
+            for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
+                vpx_memcpy(udp, up, uvblksize);
+            for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
+                vpx_memcpy(vdp, vp, uvblksize);
+        }
+    }
+}
+
+void vp8_multiframe_quality_enhance
+(
+    VP8_COMMON *cm
+)
+{
+    YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+    YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+    FRAME_TYPE frame_type = cm->frame_type;
+    /* Point at the base of the MB MODE_INFO list, which has motion vectors etc. */
+    const MODE_INFO *mode_info_context = cm->mi;
+    int mb_row;
+    int mb_col;
+    int qcurr = cm->base_qindex;
+    int qprev = cm->postproc_state.last_base_qindex;
+
+    unsigned char *y_ptr, *u_ptr, *v_ptr;
+    unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
+
+    /* Set up the buffer pointers */
+    y_ptr = show->y_buffer;
+    u_ptr = show->u_buffer;
+    v_ptr = show->v_buffer;
+    yd_ptr = dest->y_buffer;
+    ud_ptr = dest->u_buffer;
+    vd_ptr = dest->v_buffer;
+
+    /* postprocess each macro block */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            /* if motion is high there will likely be no benefit */
+            if (((frame_type == INTER_FRAME &&
+                  abs(mode_info_context->mbmi.mv.as_mv.row) <= 10 &&
+                  abs(mode_info_context->mbmi.mv.as_mv.col) <= 10) ||
+                 (frame_type == KEY_FRAME)))
+            {
+                if (mode_info_context->mbmi.mode == B_PRED || mode_info_context->mbmi.mode == SPLITMV)
+                {
+                    int i, j;
+                    for (i=0; i<2; ++i)
+                        for (j=0; j<2; ++j)
+                            multiframe_quality_enhance_block(8, qcurr, qprev,
+                                                             y_ptr + 8*(i*show->y_stride+j),
+                                                             u_ptr + 4*(i*show->uv_stride+j),
+                                                             v_ptr + 4*(i*show->uv_stride+j),
+                                                             show->y_stride,
+                                                             show->uv_stride,
+                                                             yd_ptr + 8*(i*dest->y_stride+j),
+                                                             ud_ptr + 4*(i*dest->uv_stride+j),
+                                                             vd_ptr + 4*(i*dest->uv_stride+j),
+                                                             dest->y_stride,
+                                                             dest->uv_stride);
+                }
+                else
+                {
+                    multiframe_quality_enhance_block(16, qcurr, qprev, y_ptr,
+                                                     u_ptr, v_ptr,
+                                                     show->y_stride,
+                                                     show->uv_stride,
+                                                     yd_ptr, ud_ptr, vd_ptr,
+                                                     dest->y_stride,
+                                                     dest->uv_stride);
+                }
+            }
+            else
+            {
+                vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+                vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+                vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+            }
+            y_ptr += 16;
+            u_ptr += 8;
+            v_ptr += 8;
+            yd_ptr += 16;
+            ud_ptr += 8;
+            vd_ptr += 8;
+            mode_info_context++;     /* step to next MB */
+        }
+
+        y_ptr += show->y_stride  * 16 - 16 * cm->mb_cols;
+        u_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
+        v_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
+        yd_ptr += dest->y_stride  * 16 - 16 * cm->mb_cols;
+        ud_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
+        vd_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
+
+        mode_info_context++;         /* Skip border mb */
+    }
+}
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index eb7d5458d..4c39b49f0 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -72,7 +72,7 @@ extern "C"
 
 
 #include <assert.h>
-    static __inline void Scale2Ratio(int mode, int *hr, int *hs)
+    static void Scale2Ratio(int mode, int *hr, int *hs)
     {
         switch (mode)
         {
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 84cf3b340..7743ed5c5 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -92,11 +92,13 @@ typedef struct VP8Common
     int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
     int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
 
-    YV12_BUFFER_CONFIG post_proc_buffer;
     YV12_BUFFER_CONFIG temp_scale_frame;
 
+#if CONFIG_POSTPROC
+    YV12_BUFFER_CONFIG post_proc_buffer;
     YV12_BUFFER_CONFIG post_proc_buffer_int;
     int post_proc_buffer_int_used;
+#endif
 
     FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
     FRAME_TYPE frame_type;
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 280ce0294..50ed54309 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -17,7 +17,6 @@
 #include "vpx_scale/yv12extend.h"
 #include "vpx_scale/vpxscale.h"
 #include "systemdependent.h"
-#include "variance.h"
 
 #include <limits.h>
 #include <math.h>
@@ -30,7 +29,6 @@
     ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
 
 /* global constants */
-#define MFQE_PRECISION 4
 #if CONFIG_POSTPROC_VISUALIZER
 static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
 {
@@ -362,6 +360,7 @@ void vp8_deblock(YV12_BUFFER_CONFIG         *source,
     vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
 }
 
+#if !(CONFIG_TEMPORAL_DENOISING)
 void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
                   YV12_BUFFER_CONFIG         *post,
                   int                         q,
@@ -398,6 +397,7 @@ void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
         source->uv_width - 4, ppl);
 
 }
+#endif
 
 double vp8_gaussian(double sigma, double mu, double x)
 {
@@ -693,214 +693,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
     }
 }
 
-
-static void multiframe_quality_enhance_block
-(
-    int blksize, /* Currently only values supported are 16, 8, 4 */
-    int qcurr,
-    int qprev,
-    unsigned char *y,
-    unsigned char *u,
-    unsigned char *v,
-    int y_stride,
-    int uv_stride,
-    unsigned char *yd,
-    unsigned char *ud,
-    unsigned char *vd,
-    int yd_stride,
-    int uvd_stride
-)
-{
-    static const unsigned char VP8_ZEROS[16]=
-    {
-         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-    };
-    int blksizeby2 = blksize >> 1;
-    int qdiff = qcurr - qprev;
-
-    int i, j;
-    unsigned char *yp;
-    unsigned char *ydp;
-    unsigned char *up;
-    unsigned char *udp;
-    unsigned char *vp;
-    unsigned char *vdp;
-
-    unsigned int act, sse, sad, thr;
-    if (blksize == 16)
-    {
-        act = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
-        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, INT_MAX)+128)>>8;
-    }
-    else if (blksize == 8)
-    {
-        act = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
-        sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, INT_MAX)+32)>>6;
-    }
-    else
-    {
-        act = (vp8_variance4x4(yd, yd_stride, VP8_ZEROS, 0, &sse)+8)>>4;
-        sad = (vp8_sad4x4(y, y_stride, yd, yd_stride, INT_MAX)+8)>>4;
-    }
-    /* thr = qdiff/8 + log2(act) + log4(qprev) */
-    thr = (qdiff>>3);
-    while (act>>=1) thr++;
-    while (qprev>>=2) thr++;
-    if (sad < thr)
-    {
-        static const int roundoff = (1 << (MFQE_PRECISION - 1));
-        int ifactor = (sad << MFQE_PRECISION) / thr;
-        ifactor >>= (qdiff >> 5);
-        // TODO: SIMD optimize this section
-        if (ifactor)
-        {
-            int icfactor = (1 << MFQE_PRECISION) - ifactor;
-            for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
-            {
-                for (j = 0; j < blksize; ++j)
-                    ydp[j] = (int)((yp[j] * ifactor + ydp[j] * icfactor + roundoff) >> MFQE_PRECISION);
-            }
-            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
-            {
-                for (j = 0; j < blksizeby2; ++j)
-                    udp[j] = (int)((up[j] * ifactor + udp[j] * icfactor + roundoff) >> MFQE_PRECISION);
-            }
-            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
-            {
-                for (j = 0; j < blksizeby2; ++j)
-                    vdp[j] = (int)((vp[j] * ifactor + vdp[j] * icfactor + roundoff) >> MFQE_PRECISION);
-            }
-        }
-    }
-    else
-    {
-        if (blksize == 16)
-        {
-            vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
-            vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
-            vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
-        }
-        else if (blksize == 8)
-        {
-            vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
-            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
-                vpx_memcpy(udp, up, blksizeby2);
-            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
-                vpx_memcpy(vdp, vp, blksizeby2);
-        }
-        else
-        {
-            for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
-                vpx_memcpy(ydp, yp, blksize);
-            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
-                vpx_memcpy(udp, up, blksizeby2);
-            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
-                vpx_memcpy(vdp, vp, blksizeby2);
-        }
-    }
-}
-
-void vp8_multiframe_quality_enhance
-(
-    VP8_COMMON *cm
-)
-{
-    YV12_BUFFER_CONFIG *show = cm->frame_to_show;
-    YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
-
-    FRAME_TYPE frame_type = cm->frame_type;
-    /* Point at base of Mb MODE_INFO list has motion vectors etc */
-    const MODE_INFO *mode_info_context = cm->mi;
-    int mb_row;
-    int mb_col;
-    int qcurr = cm->base_qindex;
-    int qprev = cm->postproc_state.last_base_qindex;
-
-    unsigned char *y_ptr, *u_ptr, *v_ptr;
-    unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
-
-    /* Set up the buffer pointers */
-    y_ptr = show->y_buffer;
-    u_ptr = show->u_buffer;
-    v_ptr = show->v_buffer;
-    yd_ptr = dest->y_buffer;
-    ud_ptr = dest->u_buffer;
-    vd_ptr = dest->v_buffer;
-
-    /* postprocess each macro block */
-    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
-    {
-        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
-        {
-            /* if motion is high there will likely be no benefit */
-            if (((frame_type == INTER_FRAME &&
-                  abs(mode_info_context->mbmi.mv.as_mv.row) <= 10 &&
-                  abs(mode_info_context->mbmi.mv.as_mv.col) <= 10) ||
-                 (frame_type == KEY_FRAME)))
-            {
-                if (mode_info_context->mbmi.mode == B_PRED || mode_info_context->mbmi.mode == SPLITMV)
-                {
-                    int i, j;
-                    for (i=0; i<2; ++i)
-                        for (j=0; j<2; ++j)
-                            multiframe_quality_enhance_block(8,
-                                                             qcurr,
-                                                             qprev,
-                                                             y_ptr + 8*(i*show->y_stride+j),
-                                                             u_ptr + 4*(i*show->uv_stride+j),
-                                                             v_ptr + 4*(i*show->uv_stride+j),
-                                                             show->y_stride,
-                                                             show->uv_stride,
-                                                             yd_ptr + 8*(i*dest->y_stride+j),
-                                                             ud_ptr + 4*(i*dest->uv_stride+j),
-                                                             vd_ptr + 4*(i*dest->uv_stride+j),
-                                                             dest->y_stride,
-                                                             dest->uv_stride);
-                }
-                else
-                {
-                    multiframe_quality_enhance_block(16,
-                                                     qcurr,
-                                                     qprev,
-                                                     y_ptr,
-                                                     u_ptr,
-                                                     v_ptr,
-                                                     show->y_stride,
-                                                     show->uv_stride,
-                                                     yd_ptr,
-                                                     ud_ptr,
-                                                     vd_ptr,
-                                                     dest->y_stride,
-                                                     dest->uv_stride);
-
-                }
-            }
-            else
-            {
-                vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
-                vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
-                vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
-            }
-            y_ptr += 16;
-            u_ptr += 8;
-            v_ptr += 8;
-            yd_ptr += 16;
-            ud_ptr += 8;
-            vd_ptr += 8;
-            mode_info_context++;     /* step to next MB */
-        }
-
-        y_ptr += show->y_stride  * 16 - 16 * cm->mb_cols;
-        u_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
-        v_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
-        yd_ptr += dest->y_stride  * 16 - 16 * cm->mb_cols;
-        ud_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
-        vd_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
-
-        mode_info_context++;         /* Skip border mb */
-    }
-}
-
+#if CONFIG_POSTPROC
 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
 {
     int q = oci->filter_level * 10 / 6;
@@ -923,6 +716,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
         dest->y_height = oci->Height;
         dest->uv_height = dest->y_height / 2;
         oci->postproc_state.last_base_qindex = oci->base_qindex;
+        oci->postproc_state.last_frame_valid = 1;
         return 0;
     }
 
@@ -943,7 +737,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
 
             // insure that postproc is set to all 0's so that post proc
             // doesn't pull random data in from edge
-            vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,126,(&oci->post_proc_buffer)->frame_size);
+            vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
 
         }
     }
@@ -953,6 +747,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
 #endif
 
     if ((flags & VP8D_MFQE) &&
+         oci->postproc_state.last_frame_valid &&
          oci->current_video_frame >= 2 &&
          oci->base_qindex - oci->postproc_state.last_base_qindex >= 10)
     {
@@ -992,6 +787,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
         vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
         oci->postproc_state.last_base_qindex = oci->base_qindex;
     }
+    oci->postproc_state.last_frame_valid = 1;
 
     if (flags & VP8D_ADDNOISE)
     {
@@ -1378,3 +1174,4 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
     dest->uv_height = dest->y_height / 2;
     return 0;
 }
+#endif
diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h
index 1db74379f..6ac788cbd 100644
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -19,6 +19,7 @@ struct postproc_state
     int           last_noise;
     char          noise[3072];
     int           last_base_qindex;
+    int           last_frame_valid;
     DECLARE_ALIGNED(16, char, blackclamp[16]);
     DECLARE_ALIGNED(16, char, whiteclamp[16]);
     DECLARE_ALIGNED(16, char, bothclamp[16]);
@@ -40,4 +41,8 @@ void vp8_deblock(YV12_BUFFER_CONFIG         *source,
                  int                         q,
                  int                         low_var_thresh,
                  int                         flag);
+
+#define MFQE_PRECISION 4
+
+void vp8_multiframe_quality_enhance(struct VP8Common *cm);
 #endif
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index ff8e30c3f..0fdb4fa00 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -166,6 +166,15 @@ if [ "$CONFIG_POSTPROC" = "yes" ]; then
 
     prototype void vp8_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
     # no asm yet
+
+    prototype void vp8_filter_by_weight16x16 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+    specialize vp8_filter_by_weight16x16 sse2
+
+    prototype void vp8_filter_by_weight8x8 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+    specialize vp8_filter_by_weight8x8 sse2
+
+    prototype void vp8_filter_by_weight4x4 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+    # no asm yet
 fi
 
 #
diff --git a/vp8/common/sad_c.c b/vp8/common/sad_c.c
index f745bbd3d..6a3e889b1 100644
--- a/vp8/common/sad_c.c
+++ b/vp8/common/sad_c.c
@@ -13,7 +13,7 @@
 #include "vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-static __inline
+static
 unsigned int sad_mx_n_c(
     const unsigned char *src_ptr,
     int  src_stride,
diff --git a/vp8/common/x86/mfqe_sse2.asm b/vp8/common/x86/mfqe_sse2.asm
new file mode 100644
index 000000000..10d21f320
--- /dev/null
+++ b/vp8/common/x86/mfqe_sse2.asm
@@ -0,0 +1,281 @@
+;
+;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride,
+;                                    unsigned char *dst, int dst_stride,
+;                                    int src_weight)
+;
+; Blends a 16x16 block of src into dst in place, one row per iteration:
+;   dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
+; Rows of src and dst are accessed with movdqa, i.e. as 16-byte aligned data.
+;
+global sym(vp8_filter_by_weight16x16_sse2)
+sym(vp8_filter_by_weight16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]       ; 1 << MFQE_PRECISION in every word
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 16                     ; loop count
+    pxor        xmm6, xmm6                  ; zero, used to widen bytes to words
+
+.combine
+    movdqa      xmm2, [rax]                 ; 16 src pixels
+    movdqa      xmm4, [rdx]                 ; 16 dst pixels
+    add         rax, rsi                    ; next src row
+
+    ; src * src_weight
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm6
+    punpckhbw   xmm3, xmm6
+    pmullw      xmm2, xmm0
+    pmullw      xmm3, xmm0
+
+    ; dst * dst_weight
+    movdqa      xmm5, xmm4
+    punpcklbw   xmm4, xmm6
+    punpckhbw   xmm5, xmm6
+    pmullw      xmm4, xmm1
+    pmullw      xmm5, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    paddw       xmm3, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4                     ; >> MFQE_PRECISION
+    psrlw       xmm3, 4
+
+    packuswb    xmm2, xmm3                  ; back to 16 bytes
+    movdqa      [rdx], xmm2
+    add         rdx, rdi                    ; next dst row
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride,
+;                                  unsigned char *dst, int dst_stride,
+;                                  int src_weight)
+;
+; Blends an 8x8 block of src into dst in place, one row per iteration:
+;   dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
+; Rows are loaded and stored 8 bytes at a time with movq.
+;
+global sym(vp8_filter_by_weight8x8_sse2)
+sym(vp8_filter_by_weight8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]       ; 1 << MFQE_PRECISION in every word
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 8                      ; loop count
+    pxor        xmm4, xmm4                  ; zero, used to widen bytes to words
+
+.combine
+    movq        xmm2, [rax]                 ; 8 src pixels
+    movq        xmm3, [rdx]                 ; 8 dst pixels
+    add         rax, rsi                    ; next src row
+
+    ; src * src_weight
+    punpcklbw   xmm2, xmm4
+    pmullw      xmm2, xmm0
+
+    ; dst * dst_weight
+    punpcklbw   xmm3, xmm4
+    pmullw      xmm3, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm3
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4                     ; >> MFQE_PRECISION
+
+    packuswb    xmm2, xmm4                  ; back to 8 bytes in the low half
+    movq        [rdx], xmm2
+    add         rdx, rdi                    ; next dst row
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp8_variance_and_sad_16x16_sse2 | arg
+;(
+;    unsigned char *src1,          0
+;    int            stride1,       1
+;    unsigned char *src2,          2
+;    int            stride2,       3
+;    unsigned int  *variance,      4  out: (sum(src2^2) - (sum(src2)^2 >> 8) + 128) >> 8
+;    unsigned int  *sad,           5  out: (SAD(src1, src2) + 128) >> 8
+;)
+global sym(vp8_variance_and_sad_16x16_sse2)
+sym(vp8_variance_and_sad_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax,        arg(0)          ; src1
+    mov         rcx,        arg(1)          ; stride1
+    mov         rdx,        arg(2)          ; src2
+    mov         rdi,        arg(3)          ; stride2
+
+    mov         rsi,        16              ; block height
+
+    ; Prep accumulator registers
+    pxor        xmm3, xmm3                  ; SAD
+    pxor        xmm4, xmm4                  ; sum of src2
+    pxor        xmm5, xmm5                  ; sum of src2^2
+
+    ; NOTE(review): output frames were said to give no alignment guarantee, yet
+    ; movdqa below faults unless src1/src2 rows are 16-byte aligned -- confirm.
+.accumulate
+    movdqa      xmm0, [rax]                 ; src1
+    movdqa      xmm1, [rdx]                 ; src2
+    add         rax, rcx                    ; src1 + stride1
+    add         rdx, rdi                    ; src2 + stride2
+
+    ; SAD(src1, src2)
+    psadbw      xmm0, xmm1
+    paddusw     xmm3, xmm0
+
+    ; SUM(src2)
+    pxor        xmm2, xmm2
+    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
+    paddusw     xmm4, xmm2
+
+    ; pmaddubsw would be ideal if it took two unsigned values. instead,
+    ; it expects a signed and an unsigned value. so instead we zero extend
+    ; and operate on words.
+    pxor        xmm2, xmm2
+    movdqa      xmm0, xmm1
+    punpcklbw   xmm0, xmm2
+    punpckhbw   xmm1, xmm2
+    pmaddwd     xmm0, xmm0
+    pmaddwd     xmm1, xmm1
+    paddd       xmm5, xmm0
+    paddd       xmm5, xmm1
+
+    sub         rsi,        1
+    jnz         .accumulate
+
+    ; phaddd only operates on adjacent double words.
+    ; Finalize SAD and store
+    movdqa      xmm0, xmm3
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm3
+    paddd       xmm0, [GLOBAL(t128)]
+    psrld       xmm0, 8
+
+    mov         rax,  arg(5)
+    movd        [rax], xmm0
+
+    ; Accumulate sum of src2
+    movdqa      xmm0, xmm4
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm4
+    ; Square the src2 sum. Ignore the high dword of the product
+    pmuludq     xmm0, xmm0
+    psrld       xmm0, 8
+
+    ; phaddd could be used to sum adjacent values but we want
+    ; all the values summed. zero-extend the dwords to qwords,
+    ; accumulate, shift and sum
+    pxor        xmm2, xmm2
+    movdqa      xmm1, xmm5
+    punpckldq   xmm1, xmm2
+    punpckhdq   xmm5, xmm2
+    paddd       xmm1, xmm5
+    movdqa      xmm2, xmm1
+    psrldq      xmm1, 8
+    paddd       xmm1, xmm2
+
+    psubd       xmm1, xmm0
+
+    ; (variance + 128) >> 8
+    paddd       xmm1, [GLOBAL(t128)]
+    psrld       xmm1, 8
+    mov         rax,  arg(4)
+
+    movd        [rax], xmm1
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t128:
+    ddq 128
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+    times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+    times 8 dw 0x08
+
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index ba94c58bb..c5752ee0b 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -15,58 +15,6 @@
 #include "vpx_ports/mem.h"
 #include "detokenize.h"
 
-#define BOOL_DATA unsigned char
-
-#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
-DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) =
-{
-    0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
-    6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
-    6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-    6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X
-};
-#define EOB_CONTEXT_NODE            0
-#define ZERO_CONTEXT_NODE           1
-#define ONE_CONTEXT_NODE            2
-#define LOW_VAL_CONTEXT_NODE        3
-#define TWO_CONTEXT_NODE            4
-#define THREE_CONTEXT_NODE          5
-#define HIGH_LOW_CONTEXT_NODE       6
-#define CAT_ONE_CONTEXT_NODE        7
-#define CAT_THREEFOUR_CONTEXT_NODE  8
-#define CAT_THREE_CONTEXT_NODE      9
-#define CAT_FIVE_CONTEXT_NODE       10
-
-#define CAT1_MIN_VAL    5
-#define CAT2_MIN_VAL    7
-#define CAT3_MIN_VAL   11
-#define CAT4_MIN_VAL   19
-#define CAT5_MIN_VAL   35
-#define CAT6_MIN_VAL   67
-
-#define CAT1_PROB0    159
-#define CAT2_PROB0    145
-#define CAT2_PROB1    165
-
-#define CAT3_PROB0 140
-#define CAT3_PROB1 148
-#define CAT3_PROB2 173
-
-#define CAT4_PROB0 135
-#define CAT4_PROB1 140
-#define CAT4_PROB2 155
-#define CAT4_PROB3 176
-
-#define CAT5_PROB0 130
-#define CAT5_PROB1 134
-#define CAT5_PROB2 141
-#define CAT5_PROB3 157
-#define CAT5_PROB4 180
-
-static const unsigned char cat6_prob[12] =
-{ 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0 };
-
-
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
 {
     /* Clear entropy contexts for Y2 blocks */
@@ -83,302 +31,216 @@ void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
     }
 }
 
-DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
-#define FILL \
-    if(count < 0) \
-        VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+/*
+    ------------------------------------------------------------------------------
+    Residual decoding (Paragraph 13.2 / 13.3)
+*/
+static const uint8_t kBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  /* extra entry as sentinel */
+};
 
-#define NORMALIZE \
-    /*if(range < 0x80)*/                            \
-    { \
-        shift = vp8_norm[range]; \
-        range <<= shift; \
-        value <<= shift; \
-        count -= shift; \
-    }
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
+static const uint8_t kCat6[] =
+  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
+};
 
-#define DECODE_AND_APPLYSIGN(value_to_sign) \
-    split = (range + 1) >> 1; \
-    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
-    FILL \
-    if ( value < bigsplit ) \
-    { \
-        range = split; \
-        v= value_to_sign; \
-    } \
-    else \
-    { \
-        range = range-split; \
-        value = value-bigsplit; \
-        v = -value_to_sign; \
-    } \
-    range +=range;                   \
-    value +=value;                   \
-    count--;
+#define VP8GetBit vp8dx_decode_bool
+#define NUM_PROBAS  11
+#define NUM_CTX  3
 
-#define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \
-    { \
-        split = 1 +  ((( probability*(range-1) ) )>> 8); \
-        bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
-        FILL \
-        if ( value < bigsplit ) \
-        { \
-            range = split; \
-            NORMALIZE \
-            goto branch; \
-        } \
-        value -= bigsplit; \
-        range = range - split; \
-        NORMALIZE \
-    }
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
 
-#define DECODE_AND_LOOP_IF_ZERO(probability,branch) \
-    { \
-        split = 1 + ((( probability*(range-1) ) ) >> 8); \
-        bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
-        FILL \
-        if ( value < bigsplit ) \
-        { \
-            range = split; \
-            NORMALIZE \
-            Prob = coef_probs; \
-            if(c<15) {\
-            ++c; \
-            Prob += coef_bands_x[c]; \
-            goto branch; \
-            } goto BLOCK_FINISHED; /*for malformed input */\
-        } \
-        value -= bigsplit; \
-        range = range - split; \
-        NORMALIZE \
-    }
+static int GetSigned(BOOL_DECODER *br, int value_to_sign)
+{
+    int split = (br->range + 1) >> 1;
+    VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
+    int v;
 
-#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \
-    DECODE_AND_APPLYSIGN(val) \
-    Prob = coef_probs + (ENTROPY_NODES*2); \
-    if(c < 15){\
-        qcoeff_ptr [ scan[c] ] = (int16_t) v; \
-        ++c; \
-        goto DO_WHILE; }\
-    qcoeff_ptr [ 15 ] = (int16_t) v; \
-    goto BLOCK_FINISHED;
+    if(br->count < 0)
+        vp8dx_bool_decoder_fill(br);
 
+    if ( br->value < bigsplit )
+    {
+        br->range = split;
+        v= value_to_sign;
+    }
+    else
+    {
+        br->range = br->range-split;
+        br->value = br->value-bigsplit;
+        v = -value_to_sign;
+    }
+    br->range +=br->range;
+    br->value +=br->value;
+    br->count--;
 
-#define DECODE_EXTRABIT_AND_ADJUST_VAL(prob, bits_count)\
-    split = 1 +  (((range-1) * prob) >> 8); \
-    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
-    FILL \
-    if(value >= bigsplit)\
-    {\
-        range = range-split;\
-        value = value-bigsplit;\
-        val += ((uint16_t)1<<bits_count);\
-    }\
-    else\
-    {\
-        range = split;\
-    }\
-    NORMALIZE
+    return v;
+}
+/* Decodes one block's coefficient tokens starting at position n (context
+   ctx) and stores the signed values in out[] at de-zigzagged positions.
+   Returns the position of the last non-zero coeff plus one, 0 if there is
+   no coeff at all, and 16 when the scan runs to the end of the block. */
+static int GetCoeffs(BOOL_DECODER *br, ProbaArray prob,
+                     int ctx, int n, int16_t* out)
+{
+    const uint8_t* p = prob[n][ctx];
+    if (!VP8GetBit(br, p[0]))
+    {   /* first EOB is more a 'CBP' bit. */
+        return 0;
+    }
+    while (1)
+    {
+        ++n;
+        if (!VP8GetBit(br, p[1]))
+        {
+            p = prob[kBands[n]][0];  /* zero token: next context is 0 */
+        }
+        else
+        {  /* non zero coeff */
+            int v, j;
+            if (!VP8GetBit(br, p[2]))
+            {
+                p = prob[kBands[n]][1];  /* magnitude 1: next context is 1 */
+                v = 1;
+            }
+            else
+            {
+                if (!VP8GetBit(br, p[3]))
+                {
+                    if (!VP8GetBit(br, p[4]))
+                    {
+                        v = 2;
+                    }
+                    else
+                    {
+                        v = 3 + VP8GetBit(br, p[5]);
+                    }
+                }
+                else
+                {
+                    if (!VP8GetBit(br, p[6]))
+                    {
+                        if (!VP8GetBit(br, p[7]))
+                        {
+                            v = 5 + VP8GetBit(br, 159);  /* cat1: 5..6 */
+                        } else
+                        {
+                            v = 7 + 2 * VP8GetBit(br, 165);  /* cat2: 7..10 */
+                            v += VP8GetBit(br, 145);
+                        }
+                    }
+                    else
+                    {
+                        const uint8_t* tab;
+                        const int bit1 = VP8GetBit(br, p[8]);
+                        const int bit0 = VP8GetBit(br, p[9 + bit1]);
+                        const int cat = 2 * bit1 + bit0;  /* 0..3 -> cat3..cat6 */
+                        v = 0;
+                        /* extra bits, MSB first; a 0 entry ends the table */
+                        for (tab = kCat3456[cat]; *tab; ++tab)
+                        {
+                            v += v + VP8GetBit(br, *tab);
+                        }
+                        v += 3 + (8 << cat);  /* bases 11, 19, 35, 67 */
+                    }
+                }
+                p = prob[kBands[n]][2];  /* magnitude > 1: next context is 2 */
+            }
+            j = kZigzag[n - 1];  /* n was already advanced past this coeff */
+
+            out[j] = GetSigned(br, v);
+
+            if (n == 16 || !VP8GetBit(br, p[0]))
+            {   /* EOB */
+                return n;
+            }
+        }
+        if (n == 16)  /* trailing zero tokens reached the last position */
+        {
+            return 16;
+        }
+    }
+}
 
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
-    ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
-    ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
-    const FRAME_CONTEXT * const fc = &dx->common.fc;
-
     BOOL_DECODER *bc = x->current_bc;
-
+    const FRAME_CONTEXT * const fc = &dx->common.fc;
     char *eobs = x->eobs;
 
-    ENTROPY_CONTEXT *a;
-    ENTROPY_CONTEXT *l;
     int i;
-
+    int nonzeros;
     int eobtotal = 0;
 
-    register int count;
-
-    const BOOL_DATA *bufptr;
-    const BOOL_DATA *bufend;
-    register unsigned int range;
-    VP8_BD_VALUE value;
-    const int *scan;
-    register unsigned int shift;
-    unsigned int split;
-    VP8_BD_VALUE bigsplit;
     short *qcoeff_ptr;
+    ProbaArray coef_probs;
+    ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
+    ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
+    ENTROPY_CONTEXT *a;
+    ENTROPY_CONTEXT *l;
+    int skip_dc = 0;
 
-    const vp8_prob *coef_probs;
-    int stop;
-    int val, bits_count;
-    int c;
-    int v;
-    const vp8_prob *Prob;
-    int start_coeff;
-
-
-    i = 0;
-    stop = 16;
-
-    scan = vp8_default_zig_zag1d;
     qcoeff_ptr = &x->qcoeff[0];
-    coef_probs = fc->coef_probs [3] [ 0 ] [0];
 
     if (x->mode_info_context->mbmi.mode != B_PRED &&
         x->mode_info_context->mbmi.mode != SPLITMV)
     {
-        i = 24;
-        stop = 24;
-        qcoeff_ptr += 24*16;
-        eobtotal -= 16;
-        coef_probs = fc->coef_probs [1] [ 0 ] [0];
-    }
-
-    bufend  = bc->user_buffer_end;
-    bufptr  = bc->user_buffer;
-    value   = bc->value;
-    count   = bc->count;
-    range   = bc->range;
-
-    start_coeff = 0;
-
-BLOCK_LOOP:
-    a = A + vp8_block2above[i];
-    l = L + vp8_block2left[i];
-
-    c = start_coeff;
+        a = a_ctx + 8;
+        l = l_ctx + 8;
 
-    VP8_COMBINEENTROPYCONTEXTS(v, *a, *l);
+        coef_probs = fc->coef_probs [1];
 
-    Prob = coef_probs;
-    Prob += v * ENTROPY_NODES;
-    *a = *l = 0;
+        nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr + 24 * 16);
+        *a = *l = (nonzeros > 0);
 
-DO_WHILE:
-    Prob += coef_bands_x[c];
-    DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
-    *a = *l = 1;
+        eobs[24] = nonzeros;
+        eobtotal += nonzeros - 16;
 
-CHECK_0_:
-    DECODE_AND_LOOP_IF_ZERO(Prob[ZERO_CONTEXT_NODE], CHECK_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE],
-                              LOW_VAL_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE],
-                              HIGH_LOW_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE],
-                              CAT_THREEFOUR_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE],
-                              CAT_FIVE_CONTEXT_NODE_0_);
-
-    val = CAT6_MIN_VAL;
-    bits_count = 10;
-
-    do
+        coef_probs = fc->coef_probs [0];
+        skip_dc = 1;
+    }
+    else
     {
-        DECODE_EXTRABIT_AND_ADJUST_VAL(cat6_prob[bits_count], bits_count);
-        bits_count -- ;
+        coef_probs = fc->coef_probs [3];
+        skip_dc = 0;
     }
-    while (bits_count >= 0);
 
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_FIVE_CONTEXT_NODE_0_:
-    val = CAT5_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB4, 4);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB3, 3);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB2, 2);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB1, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_THREEFOUR_CONTEXT_NODE_0_:
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE],
-                              CAT_THREE_CONTEXT_NODE_0_);
-    val = CAT4_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT4_PROB3, 3);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT4_PROB2, 2);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT4_PROB1, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT4_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_THREE_CONTEXT_NODE_0_:
-    val = CAT3_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT3_PROB2, 2);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT3_PROB1, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT3_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-HIGH_LOW_CONTEXT_NODE_0_:
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE],
-                              CAT_ONE_CONTEXT_NODE_0_);
-
-    val = CAT2_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT2_PROB1, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT2_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_ONE_CONTEXT_NODE_0_:
-    val = CAT1_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT1_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-LOW_VAL_CONTEXT_NODE_0_:
-    DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4);
-
-THREE_CONTEXT_NODE_0_:
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3);
-
-TWO_CONTEXT_NODE_0_:
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2);
-
-ONE_CONTEXT_NODE_0_:
-    DECODE_AND_APPLYSIGN(1);
-    Prob = coef_probs + ENTROPY_NODES;
-
-    if (c < 15)
+    for (i = 0; i < 16; ++i)
     {
-        qcoeff_ptr [ scan[c] ] = (int16_t) v;
-        ++c;
-        goto DO_WHILE;
-    }
+        a = a_ctx + (i&3);
+        l = l_ctx + ((i&0xc)>>2);
 
-    qcoeff_ptr [ 15 ] = (int16_t) v;
-BLOCK_FINISHED:
-    eobs[i] = c;
-    eobtotal += c;
-    qcoeff_ptr += 16;
+        nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), skip_dc, qcoeff_ptr);
+        *a = *l = (nonzeros > 0);
 
-    i++;
+        nonzeros += skip_dc;
+        eobs[i] = nonzeros;
+        eobtotal += nonzeros;
+        qcoeff_ptr += 16;
+    }
 
-    if (i < stop)
-        goto BLOCK_LOOP;
+    coef_probs = fc->coef_probs [2];
 
-    if (i == 25)
+    a_ctx += 4;
+    l_ctx += 4;
+    for (i = 16; i < 24; ++i)
     {
-        start_coeff = 1;
-        i = 0;
-        stop = 16;
-        coef_probs = fc->coef_probs [0] [ 0 ] [0];
-        qcoeff_ptr -= (24*16 + 16);
-        goto BLOCK_LOOP;
-    }
+        a = a_ctx + ((i > 19)<<1) + (i&1);
+        l = l_ctx + ((i > 19)<<1) + ((i&3)>1);
 
-    if (i == 16)
-    {
-        start_coeff = 0;
-        coef_probs = fc->coef_probs [2] [ 0 ] [0];
-        stop = 24;
-        goto BLOCK_LOOP;
+        nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr);
+        *a = *l = (nonzeros > 0);
+
+        eobs[i] = nonzeros;
+        eobtotal += nonzeros;
+        qcoeff_ptr += 16;
     }
 
-    FILL
-    bc->user_buffer = bufptr;
-    bc->value = value;
-    bc->count = count;
-    bc->range = range;
     return eobtotal;
-
 }
+
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
new file mode 100644
index 000000000..d487065c0
--- /dev/null
+++ b/vp8/encoder/denoising.c
@@ -0,0 +1,246 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "denoising.h"
+
+#include <assert.h>
+
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+
+const unsigned int NOISE_MOTION_THRESHOLD = 20*20;
+const unsigned int NOISE_DIFF2_THRESHOLD = 75;
+// SSE_DIFF_THRESHOLD is selected as ~95% confidence assuming var(noise) ~= 100.
+const unsigned int SSE_DIFF_THRESHOLD = 16*16*20;
+const unsigned int SSE_THRESHOLD = 16*16*40;
+
+// Blends the filter state with a new sample using a Q8 weight:
+//   (factor_q8 * state + (256 - factor_q8) * sample + 128) >> 8
+// so factor_q8 is the share of |state| in 1/256 units.
+static uint8_t blend(uint8_t state, uint8_t sample, uint8_t factor_q8)
+{
+  return (uint8_t)(
+      (((uint16_t)factor_q8 * ((uint16_t)state) +  // Q8
+        (uint16_t)(256 - factor_q8) * ((uint16_t)sample)) + 128)  // Q8
+      >> 8);
+}
+
+// Motion compensates the running average for the current macroblock: reads
+// from |src| and writes the prediction to |dst| at the given luma/chroma
+// offsets, using the best-SSE inter mode and MV recorded in x->e_mbd.
+// Intra modes, and small motion that gains less than SSE_DIFF_THRESHOLD over
+// zero motion, are replaced by ZEROMV; in that case the best-SSE mode/MV in
+// x->e_mbd are reset as well.
+// Returns the SSE of the mode actually used: zero_mv_sse if ZEROMV was
+// substituted, best_sse otherwise.
+static unsigned int denoiser_motion_compensate(YV12_BUFFER_CONFIG* src,
+                                               YV12_BUFFER_CONFIG* dst,
+                                               MACROBLOCK* x,
+                                               unsigned int best_sse,
+                                               unsigned int zero_mv_sse,
+                                               int recon_yoffset,
+                                               int recon_uvoffset)
+{
+  MACROBLOCKD filter_xd = x->e_mbd;
+  int mv_col;
+  int mv_row;
+  int sse_diff = zero_mv_sse - best_sse;
+  // Compensate the running average.
+  filter_xd.pre.y_buffer = src->y_buffer + recon_yoffset;
+  filter_xd.pre.u_buffer = src->u_buffer + recon_uvoffset;
+  filter_xd.pre.v_buffer = src->v_buffer + recon_uvoffset;
+  // Write the compensated running average to the destination buffer.
+  filter_xd.dst.y_buffer = dst->y_buffer + recon_yoffset;
+  filter_xd.dst.u_buffer = dst->u_buffer + recon_uvoffset;
+  filter_xd.dst.v_buffer = dst->v_buffer + recon_uvoffset;
+  // Use the best MV for the compensation.
+  // NOTE(review): filter_xd is a copy, but mode_info_context is a pointer
+  // shared with x->e_mbd, so these mbmi writes also reach the encoder's mode
+  // info -- confirm the caller rewrites mbmi afterwards.
+  filter_xd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
+  filter_xd.mode_info_context->mbmi.mode = filter_xd.best_sse_inter_mode;
+  filter_xd.mode_info_context->mbmi.mv = filter_xd.best_sse_mv;
+  filter_xd.mode_info_context->mbmi.need_to_clamp_mvs =
+      filter_xd.need_to_clamp_best_mvs;
+  mv_col = filter_xd.best_sse_mv.as_mv.col;
+  mv_row = filter_xd.best_sse_mv.as_mv.row;
+  // sse_diff is signed and may be negative; compare it as an int so that a
+  // negative difference is not converted to a huge unsigned value.
+  if (filter_xd.mode_info_context->mbmi.mode <= B_PRED ||
+      (mv_row*mv_row + mv_col*mv_col <= NOISE_MOTION_THRESHOLD &&
+       sse_diff < (int)SSE_DIFF_THRESHOLD))
+  {
+    // Handle intra blocks as referring to last frame with zero motion and
+    // let the absolute pixel difference affect the filter factor.
+    // Also consider small amount of motion as being random walk due to noise,
+    // if it doesn't mean that we get a much bigger error.
+    filter_xd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
+    filter_xd.mode_info_context->mbmi.mode = ZEROMV;
+    filter_xd.mode_info_context->mbmi.mv.as_int = 0;
+    x->e_mbd.best_sse_inter_mode = ZEROMV;
+    x->e_mbd.best_sse_mv.as_int = 0;
+    best_sse = zero_mv_sse;
+  }
+  if (!x->skip)
+  {
+    vp8_build_inter_predictors_mb(&filter_xd);
+  }
+  else
+  {
+    vp8_build_inter16x16_predictors_mb(&filter_xd,
+                                       filter_xd.dst.y_buffer,
+                                       filter_xd.dst.u_buffer,
+                                       filter_xd.dst.v_buffer,
+                                       filter_xd.dst.y_stride,
+                                       filter_xd.dst.uv_stride);
+  }
+  return best_sse;
+}
+
+// Temporally filters the 16x16 luma block in signal->thismb against the motion
+// compensated running average. Per pixel, the weight of the running average
+// falls as the absolute difference grows and is raised for small
+// motion_magnitude2. A change whose square is below NOISE_DIFF2_THRESHOLD is
+// treated as noise and the source pixel takes the filtered value; otherwise
+// the filter state is reset to the source pixel. uv_offset is not used.
+static void denoiser_filter(YV12_BUFFER_CONFIG* mc_running_avg,
+                            YV12_BUFFER_CONFIG* running_avg,
+                            MACROBLOCK* signal,
+                            unsigned int motion_magnitude2,
+                            int y_offset,
+                            int uv_offset)
+{
+  unsigned char* sig = signal->thismb;
+  int sig_stride = 16;
+  unsigned char* mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
+  int mc_avg_y_stride = mc_running_avg->y_stride;
+  unsigned char* running_avg_y = running_avg->y_buffer + y_offset;
+  int avg_y_stride = running_avg->y_stride;
+  int r, c;
+  for (r = 0; r < 16; r++)
+  {
+    for (c = 0; c < 16; c++)
+    {
+      int diff;
+      int absdiff = 0;
+      unsigned int filter_coefficient;
+      absdiff = sig[c] - mc_running_avg_y[c];
+      absdiff = absdiff > 0 ? absdiff : -absdiff;
+      assert(absdiff >= 0 && absdiff < 256);
+      filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
+      // Allow some additional filtering of static blocks, or blocks with very
+      // small motion vectors.
+      filter_coefficient += filter_coefficient / (3 + (motion_magnitude2 >> 3));
+      filter_coefficient = filter_coefficient > 255 ? 255 : filter_coefficient;
+
+      running_avg_y[c] = blend(mc_running_avg_y[c], sig[c], filter_coefficient);
+      diff = sig[c] - running_avg_y[c];
+
+      if (diff * diff < NOISE_DIFF2_THRESHOLD)
+      {
+        // Replace with mean to suppress the noise.
+        sig[c] = running_avg_y[c];
+      }
+      else
+      {
+        // Replace the filter state with the signal since the change in this
+        // pixel isn't classified as noise.
+        running_avg_y[c] = sig[c];
+      }
+    }
+    sig += sig_stride;
+    mc_running_avg_y += mc_avg_y_stride;
+    running_avg_y += avg_y_stride;
+  }
+}
+
+// Allocates the running-average and motion-compensated running-average frame
+// buffers for a width x height frame (border VP8BORDERINPIXELS) and zero-fills
+// them. Returns 0 on success; on allocation failure frees the denoiser buffers
+// and returns 1.
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
+{
+  assert(denoiser);
+  denoiser->yv12_running_avg.flags = 0;
+  if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_running_avg), width,
+                                  height, VP8BORDERINPIXELS) < 0)
+  {
+      vp8_denoiser_free(denoiser);
+      return 1;
+  }
+  denoiser->yv12_mc_running_avg.flags = 0;
+  if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_mc_running_avg), width,
+                                  height, VP8BORDERINPIXELS) < 0)
+  {
+      vp8_denoiser_free(denoiser);
+      return 1;
+  }
+  vpx_memset(denoiser->yv12_running_avg.buffer_alloc, 0,
+             denoiser->yv12_running_avg.frame_size);
+  vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
+             denoiser->yv12_mc_running_avg.frame_size);
+  return 0;
+}
+
+// Releases both frame buffers owned by the denoiser.
+void vp8_denoiser_free(VP8_DENOISER *denoiser)
+{
+  assert(denoiser);
+  vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg);
+  vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+}
+
+// Denoises the luma of the current macroblock (x->thismb) in place and updates
+// the running average at recon_yoffset. The running average is first motion
+// compensated; if the resulting SSE exceeds SSE_THRESHOLD or the squared MV
+// magnitude exceeds 8 * NOISE_MOTION_THRESHOLD, the block is not filtered and
+// x->thismb is copied (vp8_copy_mem16x16) into the running average instead.
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+                             MACROBLOCK *x,
+                             unsigned int best_sse,
+                             unsigned int zero_mv_sse,
+                             int recon_yoffset,
+                             int recon_uvoffset)
+{
+  int mv_row;
+  int mv_col;
+  unsigned int motion_magnitude2;
+  // Motion compensate the running average.
+  best_sse = denoiser_motion_compensate(&denoiser->yv12_running_avg,
+                                        &denoiser->yv12_mc_running_avg,
+                                        x,
+                                        best_sse,
+                                        zero_mv_sse,
+                                        recon_yoffset,
+                                        recon_uvoffset);
+
+  mv_row = x->e_mbd.best_sse_mv.as_mv.row;
+  mv_col = x->e_mbd.best_sse_mv.as_mv.col;
+  motion_magnitude2 = mv_row*mv_row + mv_col*mv_col;
+  if (best_sse > SSE_THRESHOLD ||
+      motion_magnitude2 > 8 * NOISE_MOTION_THRESHOLD)
+  {
+    // No filtering of this block since it differs too much from the predictor,
+    // or the motion vector magnitude is considered too big.
+    vp8_copy_mem16x16(x->thismb, 16,
+                      denoiser->yv12_running_avg.y_buffer + recon_yoffset,
+                      denoiser->yv12_running_avg.y_stride);
+    return;
+  }
+  // Filter.
+  denoiser_filter(&denoiser->yv12_mc_running_avg,
+                  &denoiser->yv12_running_avg,
+                  x,
+                  motion_magnitude2,
+                  recon_yoffset,
+                  recon_uvoffset);
+}
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
new file mode 100644
index 000000000..343531bb1
--- /dev/null
+++ b/vp8/encoder/denoising.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_DENOISING_H_
+#define VP8_ENCODER_DENOISING_H_
+
+#include "block.h"
+
+typedef struct vp8_denoiser
+{  // Frame buffers used by the temporal denoiser.
+  YV12_BUFFER_CONFIG yv12_running_avg;     // Running average frame.
+  YV12_BUFFER_CONFIG yv12_mc_running_avg;  // Motion-compensated running average.
+} VP8_DENOISER;
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height);
+
+void vp8_denoiser_free(VP8_DENOISER *denoiser);
+
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+                             MACROBLOCK *x,
+                             unsigned int best_sse,
+                             unsigned int zero_mv_sse,
+                             int recon_yoffset,
+                             int recon_uvoffset);
+
+#endif  // VP8_ENCODER_DENOISING_H_
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 21757f8f0..962a719c8 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1179,6 +1179,13 @@ int vp8cx_encode_inter_macroblock
     else
         x->encode_breakout = cpi->oxcf.encode_breakout;
 
+#if CONFIG_TEMPORAL_DENOISING
+    // Reset the best sse mode/mv for each macroblock.
+    x->e_mbd.best_sse_inter_mode = 0;
+    x->e_mbd.best_sse_mv.as_int = 0;
+    x->e_mbd.need_to_clamp_best_mvs = 0;
+#endif
+
     if (cpi->sf.RD)
     {
         int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index c122d038d..0145f6d20 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -186,7 +186,7 @@ void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int m
 #define MV_PROB_UPDATE_CORRECTION   -1
 
 
-__inline static void calc_prob(vp8_prob *p, const unsigned int ct[2])
+static void calc_prob(vp8_prob *p, const unsigned int ct[2])
 {
     const unsigned int tot = ct[0] + ct[1];
 
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index cb8fd3e89..ac83622d5 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -387,7 +387,11 @@ void vp8_end_first_pass(VP8_COMP *cpi)
     output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
 }
 
-static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
+static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
+                              YV12_BUFFER_CONFIG * raw_buffer,
+                              int * raw_motion_err,
+                              YV12_BUFFER_CONFIG * recon_buffer,
+                              int * best_motion_err, int recon_yoffset)
 {
     MACROBLOCKD * const xd = & x->e_mbd;
     BLOCK *b = &x->block[0];
@@ -395,15 +399,22 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG
 
     unsigned char *src_ptr = (*(b->base_src) + b->src);
     int src_stride = b->src_stride;
+    unsigned char *raw_ptr;
+    int raw_stride = raw_buffer->y_stride;
     unsigned char *ref_ptr;
     int ref_stride = x->e_mbd.pre.y_stride;
 
+    // Set up pointers for this macro block raw buffer
+    raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
+                                + d->offset);
+    vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
+                   (unsigned int *)(raw_motion_err));
+
     // Set up pointers for this macro block recon buffer
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
-
     ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
-
-    vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
+    vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
+                   (unsigned int *)(best_motion_err));
 }
 
 static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
@@ -595,12 +606,18 @@ void vp8_first_pass(VP8_COMP *cpi)
                 MV tmp_mv = {0, 0};
                 int tmp_err;
                 int motion_error = INT_MAX;
+                int raw_motion_error = INT_MAX;
 
                 // Simple 0,0 motion with no mv overhead
-                zz_motion_search( cpi, x, lst_yv12, &motion_error, recon_yoffset );
+                zz_motion_search( cpi, x, cpi->last_frame_unscaled_source,
+                                  &raw_motion_error, lst_yv12, &motion_error,
+                                  recon_yoffset );
                 d->bmi.mv.as_mv.row = 0;
                 d->bmi.mv.as_mv.col = 0;
 
+                if (raw_motion_error < cpi->oxcf.encode_breakout)
+                    goto skip_motion_search;
+
                 // Test last reference frame using the previous best mv as the
                 // starting point (best reference) for the search
                 first_pass_motion_search(cpi, x, &best_ref_mv,
@@ -648,6 +665,7 @@ void vp8_first_pass(VP8_COMP *cpi)
                     xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
                 }
 
+skip_motion_search:
                 /* Intra assumed best */
                 best_ref_mv.as_int = 0;
 
diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c
index 3e582e369..4c9228186 100644
--- a/vp8/encoder/lookahead.c
+++ b/vp8/encoder/lookahead.c
@@ -73,6 +73,9 @@ vp8_lookahead_init(unsigned int width,
     else if(depth > MAX_LAG_BUFFERS)
         depth = MAX_LAG_BUFFERS;
 
+    /* Keep last frame in lookahead buffer by increasing depth by 1.*/
+    depth += 1;
+
     /* Align the buffer dimensions */
     width = (width + 15) & ~15;
     height = (height + 15) & ~15;
@@ -110,7 +113,7 @@ vp8_lookahead_push(struct lookahead_ctx *ctx,
     int mb_rows = (src->y_height + 15) >> 4;
     int mb_cols = (src->y_width + 15) >> 4;
 
-    if(ctx->sz + 1 > ctx->max_sz)
+    if(ctx->sz + 2 > ctx->max_sz)
         return 1;
     ctx->sz++;
     buf = pop(ctx, &ctx->write_idx);
@@ -177,7 +180,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
 {
     struct lookahead_entry* buf = NULL;
 
-    if(ctx->sz && (drain || ctx->sz == ctx->max_sz))
+    if(ctx->sz && (drain || ctx->sz == ctx->max_sz - 1))
     {
         buf = pop(ctx, &ctx->read_idx);
         ctx->sz--;
@@ -188,18 +191,33 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
 
 struct lookahead_entry*
 vp8_lookahead_peek(struct lookahead_ctx *ctx,
-                   unsigned int          index)
+                   unsigned int          index,
+                   int                   direction)
 {
     struct lookahead_entry* buf = NULL;
 
-    assert(index < ctx->max_sz);
-    if(index < ctx->sz)
+    if (direction == PEEK_FORWARD)
+    {
+        assert(index < ctx->max_sz - 1);
+        if(index < ctx->sz)
+        {
+            index += ctx->read_idx;
+            if(index >= ctx->max_sz)
+                index -= ctx->max_sz;
+            buf = ctx->buf + index;
+        }
+    }
+    else if (direction == PEEK_BACKWARD)
     {
-        index += ctx->read_idx;
-        if(index >= ctx->max_sz)
-            index -= ctx->max_sz;
+        assert(index == 1);
+
+        if(ctx->read_idx == 0)
+            index = ctx->max_sz - 1;
+        else
+            index = ctx->read_idx - index;
         buf = ctx->buf + index;
     }
+
     return buf;
 }
 
diff --git a/vp8/encoder/lookahead.h b/vp8/encoder/lookahead.h
index 32bafcd63..cf56b75b7 100644
--- a/vp8/encoder/lookahead.h
+++ b/vp8/encoder/lookahead.h
@@ -82,6 +82,8 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
                   int                   drain);
 
 
+#define PEEK_FORWARD   1
+#define PEEK_BACKWARD -1
 /**\brief Get a future source buffer to encode
  *
  * \param[in] ctx       Pointer to the lookahead context
@@ -92,7 +94,8 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
  */
 struct lookahead_entry*
 vp8_lookahead_peek(struct lookahead_ctx *ctx,
-                   unsigned int          index);
+                   unsigned int          index,
+                   int                   direction);
 
 
 /**\brief Get the number of frames currently in the lookahead queue
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 5ad51e846..cd62c9c17 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -144,7 +144,7 @@ extern void vp8cx_init_quantizer(VP8_COMP *cpi);
 extern const int vp8cx_base_skip_false_prob[128];
 
 // Tables relating active max Q to active min Q
-static const int kf_low_motion_minq[QINDEX_RANGE] =
+static const unsigned char kf_low_motion_minq[QINDEX_RANGE] =
 {
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -155,7 +155,7 @@ static const int kf_low_motion_minq[QINDEX_RANGE] =
     11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
     16,16,17,17,18,18,18,18,19,20,20,21,21,22,23,23
 };
-static const int kf_high_motion_minq[QINDEX_RANGE] =
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] =
 {
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -166,7 +166,7 @@ static const int kf_high_motion_minq[QINDEX_RANGE] =
     16,16,17,17,18,18,18,18,19,19,20,20,20,20,21,21,
     21,21,22,22,23,23,24,25,25,26,26,27,28,28,29,30
 };
-static const int gf_low_motion_minq[QINDEX_RANGE] =
+static const unsigned char gf_low_motion_minq[QINDEX_RANGE] =
 {
     0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
     3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,
@@ -177,7 +177,7 @@ static const int gf_low_motion_minq[QINDEX_RANGE] =
     35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,
     43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
 };
-static const int gf_mid_motion_minq[QINDEX_RANGE] =
+static const unsigned char gf_mid_motion_minq[QINDEX_RANGE] =
 {
     0,0,0,0,1,1,1,1,1,1,2,2,3,3,3,4,
     4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
@@ -188,7 +188,7 @@ static const int gf_mid_motion_minq[QINDEX_RANGE] =
     38,39,39,40,40,41,41,42,42,43,43,44,45,46,47,48,
     49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
 };
-static const int gf_high_motion_minq[QINDEX_RANGE] =
+static const unsigned char gf_high_motion_minq[QINDEX_RANGE] =
 {
     0,0,0,0,1,1,1,1,1,2,2,2,3,3,3,4,
     4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
@@ -199,7 +199,7 @@ static const int gf_high_motion_minq[QINDEX_RANGE] =
     41,41,42,42,43,44,45,46,47,48,49,50,51,52,53,54,
     55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80
 };
-static const int inter_minq[QINDEX_RANGE] =
+static const unsigned char inter_minq[QINDEX_RANGE] =
 {
     0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
     9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
@@ -1107,8 +1107,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate scaled source buffer");
 
-
-        vpx_free(cpi->tok);
+    vpx_free(cpi->tok);
 
     {
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -1680,6 +1679,17 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
     cpi->alt_ref_source = NULL;
     cpi->is_src_frame_alt_ref = 0;
 
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity)
+    {
+      if (!cpi->denoiser.yv12_mc_running_avg.buffer_alloc)
+      {
+        int width = (cpi->oxcf.Width + 15) & ~15;
+        int height = (cpi->oxcf.Height + 15) & ~15;
+        vp8_denoiser_allocate(&cpi->denoiser, width, height);
+      }
+    }
+#endif
 
 #if 0
     // Experimental RD Code
@@ -2314,6 +2324,9 @@ void vp8_remove_compressor(VP8_COMP **ptr)
     vp8cx_remove_encoder_threads(cpi);
 #endif
 
+#if CONFIG_TEMPORAL_DENOISING
+    vp8_denoiser_free(&cpi->denoiser);
+#endif
     dealloc_compressor_data(cpi);
     vpx_free(cpi->mb.ss);
     vpx_free(cpi->tok);
@@ -2920,7 +2933,6 @@ static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
     (void) frame_flags;
     vp8_set_quantizer(cpi, 26);
 
-    scale_and_extend_source(cpi->un_scaled_source, cpi);
     vp8_first_pass(cpi);
 }
 #endif
@@ -3133,7 +3145,12 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
     }
 
     vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity)
+    {
+      vp8_yv12_extend_frame_borders(&cpi->denoiser.yv12_running_avg);
+    }
+#endif
 }
 
 static void encode_frame_to_data_rate
@@ -3589,7 +3606,7 @@ static void encode_frame_to_data_rate
 
 
     scale_and_extend_source(cpi->un_scaled_source, cpi);
-#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC
+#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC && !(CONFIG_TEMPORAL_DENOISING)
 
     if (cpi->oxcf.noise_sensitivity > 0)
     {
@@ -4702,7 +4719,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
         cpi->source_alt_ref_pending)
     {
         if ((cpi->source = vp8_lookahead_peek(cpi->lookahead,
-                                              cpi->frames_till_gf_update_due)))
+                                              cpi->frames_till_gf_update_due,
+                                              PEEK_FORWARD)))
         {
             cpi->alt_ref_source = cpi->source;
             if (cpi->oxcf.arnr_max_frames > 0)
@@ -4724,6 +4742,15 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
 
     if (!cpi->source)
     {
+        /* Read last frame source if we are encoding first pass. */
+        if (cpi->pass == 1 && cm->current_video_frame > 0)
+        {
+            if((cpi->last_source = vp8_lookahead_peek(cpi->lookahead, 1,
+                                                      PEEK_BACKWARD)) == NULL)
+              return -1;
+        }
+
+
         if ((cpi->source = vp8_lookahead_pop(cpi->lookahead, flush)))
         {
             cm->show_frame = 1;
@@ -4743,6 +4770,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
         *time_stamp = cpi->source->ts_start;
         *time_end = cpi->source->ts_end;
         *frame_flags = cpi->source->flags;
+
+        if (cpi->pass == 1 && cm->current_video_frame > 0)
+        {
+            cpi->last_frame_unscaled_source = &cpi->last_source->img;
+        }
     }
     else
     {
@@ -5026,7 +5058,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                 double frame_psnr;
                 YV12_BUFFER_CONFIG      *orig = cpi->Source;
                 YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-                YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
                 int y_samples = orig->y_height * orig->y_width ;
                 int uv_samples = orig->uv_height * orig->uv_width ;
                 int t_samples = y_samples + 2 * uv_samples;
@@ -5050,7 +5081,9 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                 cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve);
                 cpi->total_sq_error += sq_error;
                 cpi->total  += frame_psnr;
+#if CONFIG_POSTPROC
                 {
+                    YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
                     double frame_psnr2, frame_ssim2 = 0;
                     double weight = 0;
 
@@ -5101,6 +5134,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                          }
                     }
                 }
+#endif
             }
 
             if (cpi->b_calculate_ssimg)
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index e9e2ee52e..c7a1de8e8 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -28,6 +28,9 @@
 #include "mcomp.h"
 #include "vp8/common/findnearmv.h"
 #include "lookahead.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "vp8/encoder/denoising.h"
+#endif
 
 //#define SPEEDSTATS 1
 #define MIN_GF_INTERVAL             4
@@ -313,10 +316,12 @@ typedef struct VP8_COMP
     struct lookahead_ctx    *lookahead;
     struct lookahead_entry  *source;
     struct lookahead_entry  *alt_ref_source;
+    struct lookahead_entry  *last_source;
 
     YV12_BUFFER_CONFIG *Source;
     YV12_BUFFER_CONFIG *un_scaled_source;
     YV12_BUFFER_CONFIG scaled_source;
+    YV12_BUFFER_CONFIG *last_frame_unscaled_source;
 
     int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
     int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
@@ -661,6 +666,10 @@ typedef struct VP8_COMP
 
     int droppable;
 
+#if CONFIG_TEMPORAL_DENOISING
+    VP8_DENOISER denoiser;
+#endif
+
     // Coding layer state variables
     unsigned int current_layer;
     LAYER_CONTEXT layer_context[MAX_LAYERS];
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 65e6c1294..24e041f8d 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -24,6 +24,9 @@
 #include "mcomp.h"
 #include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
 
 extern int VP8_UVSSE(MACROBLOCK *x);
 
@@ -450,6 +453,48 @@ void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, int *dissim,
 }
 #endif
 
+static void check_for_encode_breakout(unsigned int sse, MACROBLOCK* x)
+{   // Updates x->skip only when sse is below x->encode_breakout.
+    if (sse < x->encode_breakout)
+    {
+        // Check u and v to make sure skip is ok
+        int sse2 = 0;
+
+        sse2 = VP8_UVSSE(x);
+        // Skip only if twice the U/V error is also below the breakout threshold.
+        if (sse2 * 2 < x->encode_breakout)
+            x->skip = 1;
+        else
+            x->skip = 0;
+    }
+}
+
+static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2, VP8_COMP *cpi, MACROBLOCK *x)
+{   /* Returns the RD cost of the mode currently set in x, or INT_MAX for an inactive macroblock. */
+    MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+    int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
+    int this_rd;
+    /* Exit early and don't compute the distortion if this macroblock
+     * is marked inactive. */
+    if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+    {
+        *sse = 0;
+        *distortion2 = 0;
+        x->skip = 1;
+        return INT_MAX;
+    }
+    /* Recompute *sse and *distortion2 unless this is NEWMV with half-pixel search on and full_pixel != 1. */
+    if((this_mode != NEWMV) ||
+        !(cpi->sf.half_pixel_search) || cpi->common.full_pixel==1)
+        *distortion2 = get_inter_mbpred_error(x,
+                                              &cpi->fn_ptr[BLOCK_16X16],
+                                              sse, mv);
+
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
+    /* x->skip is updated when *sse falls below x->encode_breakout. */
+    check_for_encode_breakout(*sse, x);
+    return this_rd;
+}
 
 void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                          int recon_uvoffset, int *returnrate,
@@ -476,7 +521,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     int distortion2;
     int bestsme = INT_MAX;
     int best_mode_index = 0;
-    unsigned int sse = INT_MAX, best_sse = INT_MAX;
+    unsigned int sse = INT_MAX, best_rd_sse = INT_MAX;
+#if CONFIG_TEMPORAL_DENOISING
+    unsigned int zero_mv_sse = 0, best_sse = INT_MAX;
+#endif
 
     int_mv mvp;
 
@@ -488,9 +536,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     int ref_frame_map[4];
     int sign_bias = 0;
 
-    int have_subp_search = cpi->sf.half_pixel_search;  /* In real-time mode,
-                                       when Speed >= 15, no sub-pixel search. */
-
 #if CONFIG_MULTI_RES_ENCODING
     int dissim = INT_MAX;
     int parent_ref_frame = 0;
@@ -657,7 +702,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         {
         case B_PRED:
             /* Pass best so far to pick_intra4x4mby_modes to use as breakout */
-            distortion2 = best_sse;
+            distortion2 = best_rd_sse;
             pick_intra4x4mby_modes(x, &rate, &distortion2);
 
             if (distortion2 == INT_MAX)
@@ -905,43 +950,38 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
             x->e_mbd.mode_info_context->mbmi.mv.as_int =
                                                     mode_mv[this_mode].as_int;
-
-            /* Exit early and don't compute the distortion if this macroblock
-             * is marked inactive. */
-            if (cpi->active_map_enabled && x->active_ptr[0] == 0)
-            {
-                sse = 0;
-                distortion2 = 0;
-                x->skip = 1;
-                break;
-            }
-
-            if((this_mode != NEWMV) ||
-                !(have_subp_search) || cpi->common.full_pixel==1)
-                distortion2 = get_inter_mbpred_error(x,
-                                                     &cpi->fn_ptr[BLOCK_16X16],
-                                                     &sse, mode_mv[this_mode]);
-
-            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-
-            if (sse < x->encode_breakout)
-            {
-                // Check u and v to make sure skip is ok
-                int sse2 = 0;
-
-                sse2 = VP8_UVSSE(x);
-
-                if (sse2 * 2 < x->encode_breakout)
-                    x->skip = 1;
-                else
-                    x->skip = 0;
-            }
+            this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x);
 
             break;
         default:
             break;
         }
 
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity)
+        {
+          // Store for later use by denoiser.
+          if (this_mode == ZEROMV &&
+              x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
+          {
+            zero_mv_sse = sse;
+          }
+
+          // Store the best NEWMV in x for later use in the denoiser.
+          // We are restricted to the LAST_FRAME since the denoiser only keeps
+          // one filter state.
+          if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+              x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
+          {
+            best_sse = sse;
+            x->e_mbd.best_sse_inter_mode = NEWMV;
+            x->e_mbd.best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+            x->e_mbd.need_to_clamp_best_mvs =
+                x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+          }
+        }
+#endif
+
         if (this_rd < best_rd || x->skip)
         {
             // Note index of best mode
@@ -949,7 +989,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 
             *returnrate = rate2;
             *returndistortion = distortion2;
-            best_sse = sse;
+            best_rd_sse = sse;
             best_rd = this_rd;
             vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
                        sizeof(MB_MODE_INFO));
@@ -1011,6 +1051,43 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         cpi->error_bins[this_rdbin] ++;
     }
 
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity)
+    {
+      if (x->e_mbd.best_sse_inter_mode == DC_PRED) {
+        // No best MV found.
+        x->e_mbd.best_sse_inter_mode = best_mbmode.mode;
+        x->e_mbd.best_sse_mv = best_mbmode.mv;
+        x->e_mbd.need_to_clamp_best_mvs = best_mbmode.need_to_clamp_mvs;
+        best_sse = best_rd_sse;
+      }
+      vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+                              recon_yoffset, recon_uvoffset);
+
+      // Reevaluate ZEROMV after denoising.
+      if (best_mbmode.ref_frame == INTRA_FRAME)
+      {
+        int this_rd = 0;
+        rate2 = 0;
+        distortion2 = 0;
+        x->e_mbd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
+        rate2 += x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+        this_mode = ZEROMV;
+        rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+        x->e_mbd.mode_info_context->mbmi.mode = this_mode;
+        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+        x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+        this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x);
+
+        if (this_rd < best_rd || x->skip)
+        {
+            vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+                       sizeof(MB_MODE_INFO));
+        }
+      }
+    }
+#endif
+
     if (cpi->is_src_frame_alt_ref &&
         (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
     {
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index bb2b0ca71..8f575e498 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -33,11 +33,33 @@
 #include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/systemdependent.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
 
 extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
 
 #define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
 
+typedef struct rate_distortion_struct
+{
+    int rate2;          // Running total rate for the mode being evaluated.
+    int rate_y;         // Y rate, as written by macro_block_yrd().
+    int rate_uv;        // UV rate; reset to 0 on the skip paths.
+    int distortion2;    // Running total distortion.
+    int distortion_uv;  // UV distortion.
+} RATE_DISTORTION;
+
+typedef struct best_mode_struct
+{
+  int yrd;       // RD estimate with UV and side-info costs removed (set in update_best_mode).
+  int rd;        // RD cost of the best mode so far.
+  int intra_rd;  // Presumably the best intra-only RD cost - confirm against the caller.
+  MB_MODE_INFO mbmode;           // Copy of the best macroblock mode info.
+  union b_mode_info bmodes[16];  // Per-block modes, saved for B_PRED and SPLITMV.
+  PARTITION_INFO partition;      // Copy of *x->partition_info.
+} BEST_MODE;
+
 static const int auto_speed_thresh[17] =
 {
     1000,
@@ -741,7 +763,7 @@ static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
         return INT_MAX;
 
     *Rate = cost;
-    *rate_y += tot_rate_y;
+    *rate_y = tot_rate_y;
     *Distortion = distortion;
 
     return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
@@ -1327,7 +1349,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
     }
 }
 
-static __inline
+static
 void vp8_cal_step_param(int sr, int *sp)
 {
     int step = 0;
@@ -1711,6 +1733,181 @@ static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
     }
 }
 
+static int evaluate_inter_mode_rd(int mdcounts[4],
+                                  RATE_DISTORTION* rd,
+                                  int* disable_skip,
+                                  VP8_COMP *cpi, MACROBLOCK *x)
+{   /* Returns an RD cost on an encode-breakout skip (with *disable_skip = 1); otherwise INT_MAX. */
+    MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+    BLOCK *b = &x->block[0];
+    MACROBLOCKD *xd = &x->e_mbd;
+    int distortion;
+    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);
+    /* An inactive macroblock is only flagged as skipped; costs are still accumulated below. */
+    if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
+        x->skip = 1;
+    }
+    else if (x->encode_breakout)
+    {
+        unsigned int sse;
+        unsigned int var;
+        int threshold = (xd->block[0].dequant[1]
+                    * xd->block[0].dequant[1] >>4);
+        /* The breakout threshold is never lower than x->encode_breakout. */
+        if(threshold < x->encode_breakout)
+            threshold = x->encode_breakout;
+
+        var = vp8_variance16x16
+                (*(b->base_src), b->src_stride,
+                x->e_mbd.predictor, 16, &sse);
+
+        if (sse < threshold)
+        {
+             unsigned int q2dc = xd->block[24].dequant[0];
+            /* If there is no codeable 2nd order dc
+               or a very small uniform pixel change */
+            if ((sse - var < q2dc * q2dc >>4) ||
+                (sse /2 > var && sse-var < 64))
+            {
+                // Check u and v to make sure skip is ok
+                int sse2=  VP8_UVSSE(x);
+                if (sse2 * 2 < threshold)
+                {
+                    x->skip = 1;
+                    rd->distortion2 = sse + sse2;
+                    rd->rate2 = 500;
+
+                    /* for best_yrd calculation */
+                    rd->rate_uv = 0;
+                    rd->distortion_uv = sse2;
+
+                    *disable_skip = 1;
+                    return RDCOST(x->rdmult, x->rddiv, rd->rate2,
+                                  rd->distortion2);
+                }
+            }
+        }
+    }
+    /* No early exit: add this mode's costs on top of what rd already holds. */
+
+    //intermodecost[mode_index] = vp8_cost_mv_ref(this_mode, mdcounts);   // Experimental debug code
+
+    // Add in the Mv/mode cost
+    rd->rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+
+    // Y cost and distortion
+    macro_block_yrd(x, &rd->rate_y, &distortion);
+    rd->rate2 += rd->rate_y;
+    rd->distortion2 += distortion;
+
+    // UV cost and distortion
+    rd_inter16x16_uv(cpi, x, &rd->rate_uv, &rd->distortion_uv,
+                     cpi->common.full_pixel);
+    rd->rate2 += rd->rate_uv;
+    rd->distortion2 += rd->distortion_uv;
+    return INT_MAX;  // No RD cost is computed on this path.
+}
+
+static int calculate_final_rd_costs(int this_rd,
+                                    RATE_DISTORTION* rd,
+                                    int* other_cost,
+                                    int disable_skip,
+                                    int uv_intra_tteob,
+                                    int intra_rd_penalty,
+                                    VP8_COMP *cpi, MACROBLOCK *x)
+{   /* Returns this_rd unchanged when disable_skip is set; otherwise the RD cost recomputed from rd. */
+    MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+    // Where skip is allowable add in the default per mb cost for the no skip case.
+    // Where we then decide to skip we have to delete this and replace it with the
+    // cost of signalling a skip.
+    if (cpi->common.mb_no_coeff_skip)
+    {
+        *other_cost += vp8_cost_bit(cpi->prob_skip_false, 0);
+        rd->rate2 += *other_cost;
+    }
+
+    /* Estimate the reference frame signaling cost and add it
+     * to the rolling cost variable.
+     */
+    rd->rate2 +=
+        x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+    if (!disable_skip)
+    {
+        // Test for the condition where skip block will be activated because there are no non zero coefficients and make any necessary adjustment for rate
+        if (cpi->common.mb_no_coeff_skip)
+        {
+            int i;
+            int tteob;
+            int has_y2_block = (this_mode!=SPLITMV && this_mode!=B_PRED);
+            // tteob is built up from the eobs entries; zero means the skip adjustment below applies.
+            tteob = 0;
+            if(has_y2_block)
+                tteob += x->e_mbd.eobs[24];
+            // Blocks 0-15 count only when eobs[i] exceeds has_y2_block (0 or 1).
+            for (i = 0; i < 16; i++)
+                tteob += (x->e_mbd.eobs[i] > has_y2_block);
+
+            if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+            {
+                for (i = 16; i < 24; i++)
+                    tteob += x->e_mbd.eobs[i];
+            }
+            else
+                tteob += uv_intra_tteob;
+            // Nothing to code: drop the Y/UV rates and charge for signalling a skip instead.
+            if (tteob == 0)
+            {
+                rd->rate2 -= (rd->rate_y + rd->rate_uv);
+                //for best_yrd calculation
+                rd->rate_uv = 0;
+
+                // Back out no skip flag costing and add in skip flag costing
+                if (cpi->prob_skip_false)
+                {
+                    int prob_skip_cost;
+
+                    prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
+                    prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
+                    rd->rate2 += prob_skip_cost;
+                    *other_cost += prob_skip_cost;
+                }
+            }
+        }
+        // Calculate the final RD estimate for this mode
+        this_rd = RDCOST(x->rdmult, x->rddiv, rd->rate2, rd->distortion2);
+        if (this_rd < INT_MAX && x->e_mbd.mode_info_context->mbmi.ref_frame
+                                 == INTRA_FRAME)
+            this_rd += intra_rd_penalty;
+    }
+    return this_rd;
+}
+
+static void update_best_mode(BEST_MODE* best_mode, int this_rd,
+                             RATE_DISTORTION* rd, int other_cost, MACROBLOCK *x)
+{   /* Records the mode currently set in x, with its RD figures, into *best_mode. */
+    MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+    /* Add the reference-frame cost to other_cost so it is excluded from the Y estimate. */
+    other_cost +=
+    x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+    /* Calculate the final y RD estimate for this mode */
+    best_mode->yrd = RDCOST(x->rdmult, x->rddiv, (rd->rate2-rd->rate_uv-other_cost),
+                      (rd->distortion2-rd->distortion_uv));
+    /* Snapshot the RD cost, mode info and partition info. */
+    best_mode->rd = this_rd;
+    vpx_memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+    vpx_memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
+    /* Per-block modes are saved only for B_PRED and SPLITMV. */
+    if ((this_mode == B_PRED) || (this_mode == SPLITMV))
+    {
+        int i;
+        for (i = 0; i < 16; i++)
+        {
+            best_mode->bmodes[i] = x->e_mbd.block[i].bmi;
+        }
+    }
+}
 
 void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                             int recon_uvoffset, int *returnrate,
@@ -1719,9 +1916,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     BLOCK *b = &x->block[0];
     BLOCKD *d = &x->e_mbd.block[0];
     MACROBLOCKD *xd = &x->e_mbd;
-    union b_mode_info best_bmodes[16];
-    MB_MODE_INFO best_mbmode;
-    PARTITION_INFO best_partition;
     int_mv best_ref_mv_sb[2];
     int_mv mode_mv_sb[2][MB_MODE_COUNT];
     int_mv best_ref_mv;
@@ -1729,21 +1923,16 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     MB_PREDICTION_MODE this_mode;
     int num00;
     int best_mode_index = 0;
+    BEST_MODE best_mode;
 
     int i;
     int mode_index;
     int mdcounts[4];
     int rate;
-    int distortion;
-    int best_rd = INT_MAX;
-    int best_intra_rd = INT_MAX;
-    int rate2, distortion2;
+    RATE_DISTORTION rd;
     int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
     int uv_intra_tteob = 0;
     int uv_intra_done = 0;
-    int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
-    int distortion_uv;
-    int best_yrd = INT_MAX;
 
     MB_PREDICTION_MODE uv_intra_mode = 0;
     int_mv mvp;
@@ -1760,9 +1949,12 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 
     mode_mv = mode_mv_sb[sign_bias];
     best_ref_mv.as_int = 0;
+    best_mode.rd = INT_MAX;
+    best_mode.yrd = INT_MAX;
+    best_mode.intra_rd = INT_MAX;
     vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
-    vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-    vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
+    vpx_memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
+    vpx_memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
 
     /* Setup search priorities */
     get_reference_search_order(cpi, ref_frame_map);
@@ -1799,15 +1991,15 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
 
         // Test best rd so far against threshold for trying this mode.
-        if (best_rd <= cpi->rd_threshes[mode_index])
+        if (best_mode.rd <= cpi->rd_threshes[mode_index])
             continue;
 
         if (this_ref_frame < 0)
             continue;
 
         // These variables hold are rolling total cost and distortion for this mode
-        rate2 = 0;
-        distortion2 = 0;
+        rd.rate2 = 0;
+        rd.distortion2 = 0;
 
         this_mode = vp8_mode_order[mode_index];
 
@@ -1907,16 +2099,17 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             int tmp_rd;
 
             // Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];
-            tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd);
-            rate2 += rate;
-            distortion2 += distortion;
+            int distortion;
+            tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
+            rd.rate2 += rate;
+            rd.distortion2 += distortion;
 
-            if(tmp_rd < best_yrd)
+            if(tmp_rd < best_mode.yrd)
             {
-                rate2 += uv_intra_rate;
-                rate_uv = uv_intra_rate_tokenonly;
-                distortion2 += uv_intra_distortion;
-                distortion_uv = uv_intra_distortion;
+                rd.rate2 += uv_intra_rate;
+                rd.rate_uv = uv_intra_rate_tokenonly;
+                rd.distortion2 += uv_intra_distortion;
+                rd.distortion_uv = uv_intra_distortion;
             }
             else
             {
@@ -1930,24 +2123,25 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         {
             int tmp_rd;
             int this_rd_thresh;
+            int distortion;
 
             this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ? cpi->rd_threshes[THR_NEW1] : cpi->rd_threshes[THR_NEW3];
             this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ? cpi->rd_threshes[THR_NEW2] : this_rd_thresh;
 
             tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                                     best_yrd, mdcounts,
-                                                     &rate, &rate_y, &distortion, this_rd_thresh) ;
+                                                     best_mode.yrd, mdcounts,
+                                                     &rate, &rd.rate_y, &distortion, this_rd_thresh) ;
 
-            rate2 += rate;
-            distortion2 += distortion;
+            rd.rate2 += rate;
+            rd.distortion2 += distortion;
 
             // If even the 'Y' rd value of split is higher than best so far then dont bother looking at UV
-            if (tmp_rd < best_yrd)
+            if (tmp_rd < best_mode.yrd)
             {
                 // Now work out UV cost and add it in
-                rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel);
-                rate2 += rate_uv;
-                distortion2 += distortion_uv;
+                rd_inter4x4_uv(cpi, x, &rd.rate_uv, &rd.distortion_uv, cpi->common.full_pixel);
+                rd.rate2 += rd.rate_uv;
+                rd.distortion2 += rd.distortion_uv;
             }
             else
             {
@@ -1960,18 +2154,21 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         case V_PRED:
         case H_PRED:
         case TM_PRED:
+        {
+            int distortion;
             x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
             vp8_build_intra_predictors_mby
                 (&x->e_mbd);
-            macro_block_yrd(x, &rate_y, &distortion) ;
-            rate2 += rate_y;
-            distortion2 += distortion;
-            rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-            break;
+            macro_block_yrd(x, &rd.rate_y, &distortion) ;
+            rd.rate2 += rd.rate_y;
+            rd.distortion2 += distortion;
+            rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+            rd.rate2 += uv_intra_rate;
+            rd.rate_uv = uv_intra_rate_tokenonly;
+            rd.distortion2 += uv_intra_distortion;
+            rd.distortion_uv = uv_intra_distortion;
+        }
+        break;
 
         case NEWMV:
         {
@@ -2114,7 +2311,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
 
             // Add the new motion vector cost to our rolling cost variable
-            rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
+            rd.rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
         }
 
         case NEARESTMV:
@@ -2136,177 +2333,57 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                 continue;
 
             vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
-            vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);
-
-            if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
-                x->skip = 1;
-            }
-            else if (x->encode_breakout)
-            {
-                unsigned int sse;
-                unsigned int var;
-                int threshold = (xd->block[0].dequant[1]
-                            * xd->block[0].dequant[1] >>4);
-
-                if(threshold < x->encode_breakout)
-                    threshold = x->encode_breakout;
-
-                var = vp8_variance16x16
-                        (*(b->base_src), b->src_stride,
-                        x->e_mbd.predictor, 16, &sse);
-
-                if (sse < threshold)
-                {
-                     unsigned int q2dc = xd->block[24].dequant[0];
-                    /* If theres is no codeable 2nd order dc
-                       or a very small uniform pixel change change */
-                    if ((sse - var < q2dc * q2dc >>4) ||
-                        (sse /2 > var && sse-var < 64))
-                    {
-                        // Check u and v to make sure skip is ok
-                        int sse2=  VP8_UVSSE(x);
-                        if (sse2 * 2 < threshold)
-                        {
-                            x->skip = 1;
-                            distortion2 = sse + sse2;
-                            rate2 = 500;
-
-                            /* for best_yrd calculation */
-                            rate_uv = 0;
-                            distortion_uv = sse2;
-
-                            disable_skip = 1;
-                            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-
-                            break;
-                        }
-                    }
-                }
-            }
-
-
-            //intermodecost[mode_index] = vp8_cost_mv_ref(this_mode, mdcounts);   // Experimental debug code
-
-            // Add in the Mv/mode cost
-            rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
-
-            // Y cost and distortion
-            macro_block_yrd(x, &rate_y, &distortion);
-            rate2 += rate_y;
-            distortion2 += distortion;
-
-            // UV cost and distortion
-            rd_inter16x16_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel);
-            rate2 += rate_uv;
-            distortion2 += distortion_uv;
+            this_rd = evaluate_inter_mode_rd(mdcounts, &rd,
+                                             &disable_skip, cpi, x);
             break;
 
         default:
             break;
         }
 
-        // Where skip is allowable add in the default per mb cost for the no skip case.
-        // where we then decide to skip we have to delete this and replace it with the
-        // cost of signallying a skip
-        if (cpi->common.mb_no_coeff_skip)
-        {
-            other_cost += vp8_cost_bit(cpi->prob_skip_false, 0);
-            rate2 += other_cost;
-        }
+        this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+                                           disable_skip, uv_intra_tteob,
+                                           intra_rd_penalty, cpi, x);
 
-        /* Estimate the reference frame signaling cost and add it
-         * to the rolling cost variable.
-         */
-        rate2 +=
-            x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
-
-        if (!disable_skip)
+        // Keep record of best intra distortion
+        if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+            (this_rd < best_mode.intra_rd) )
         {
-            // Test for the condition where skip block will be activated because there are no non zero coefficients and make any necessary adjustment for rate
-            if (cpi->common.mb_no_coeff_skip)
-            {
-                int tteob;
-                int has_y2_block = (this_mode!=SPLITMV && this_mode!=B_PRED);
-
-                tteob = 0;
-                if(has_y2_block)
-                    tteob += x->e_mbd.eobs[24];
-
-                for (i = 0; i < 16; i++)
-                    tteob += (x->e_mbd.eobs[i] > has_y2_block);
-
-                if (x->e_mbd.mode_info_context->mbmi.ref_frame)
-                {
-                    for (i = 16; i < 24; i++)
-                        tteob += x->e_mbd.eobs[i];
-                }
-                else
-                    tteob += uv_intra_tteob;
-
-                if (tteob == 0)
-                {
-                    rate2 -= (rate_y + rate_uv);
-                    //for best_yrd calculation
-                    rate_uv = 0;
-
-                    // Back out no skip flag costing and add in skip flag costing
-                    if (cpi->prob_skip_false)
-                    {
-                        int prob_skip_cost;
-
-                        prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
-                        prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
-                        rate2 += prob_skip_cost;
-                        other_cost += prob_skip_cost;
-                    }
-                }
-            }
-            // Calculate the final RD estimate for this mode
-            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-            if (this_rd < INT_MAX && x->e_mbd.mode_info_context->mbmi.ref_frame
-                                     == INTRA_FRAME)
-                this_rd += intra_rd_penalty;
+          best_mode.intra_rd = this_rd;
+            *returnintra = rd.distortion2 ;
         }
 
-        // Keep record of best intra distortion
-        if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
-            (this_rd < best_intra_rd) )
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity)
         {
-            best_intra_rd = this_rd;
-            *returnintra = distortion2 ;
+          // Store the best NEWMV in x for later use in the denoiser.
+          // We are restricted to the LAST_FRAME since the denoiser only keeps
+          // one filter state.
+          if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+              x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
+          {
+            x->e_mbd.best_sse_inter_mode = NEWMV;
+            x->e_mbd.best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+            x->e_mbd.need_to_clamp_best_mvs =
+                x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+          }
         }
+#endif
 
         // Did this mode help.. i.i is it the new best mode
-        if (this_rd < best_rd || x->skip)
+        if (this_rd < best_mode.rd || x->skip)
         {
             // Note index of best mode so far
             best_mode_index = mode_index;
-
+            *returnrate = rd.rate2;
+            *returndistortion = rd.distortion2;
             if (this_mode <= B_PRED)
             {
                 x->e_mbd.mode_info_context->mbmi.uv_mode = uv_intra_mode;
                 /* required for left and above block mv */
                 x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
             }
-
-            other_cost +=
-            x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
-
-            /* Calculate the final y RD estimate for this mode */
-            best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2-rate_uv-other_cost),
-                              (distortion2-distortion_uv));
-
-            *returnrate = rate2;
-            *returndistortion = distortion2;
-            best_rd = this_rd;
-            vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
-            vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
-            if ((this_mode == B_PRED) || (this_mode == SPLITMV))
-                for (i = 0; i < 16; i++)
-                {
-                    best_bmodes[i] = x->e_mbd.block[i].bmi;
-                }
+            update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
 
 
             // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
@@ -2359,9 +2436,50 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     // Note how often each mode chosen as best
     cpi->mode_chosen_counts[best_mode_index] ++;
 
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity)
+    {
+      if (x->e_mbd.best_sse_inter_mode == DC_PRED) {
+        // No best MV found.
+        x->e_mbd.best_sse_inter_mode = best_mode.mbmode.mode;
+        x->e_mbd.best_sse_mv = best_mode.mbmode.mv;
+        x->e_mbd.need_to_clamp_best_mvs = best_mode.mbmode.need_to_clamp_mvs;
+      }
+
+      // TODO(holmer): No SSEs are calculated in rdopt.c. What else can be used?
+      vp8_denoiser_denoise_mb(&cpi->denoiser, x, 0, 0,
+                              recon_yoffset, recon_uvoffset);
+      // Reevaluate ZEROMV if the current mode is INTRA.
+      if (best_mode.mbmode.ref_frame == INTRA_FRAME)
+      {
+        int this_rd = INT_MAX;
+        int disable_skip = 0;
+        int other_cost = 0;
+        vpx_memset(&rd, 0, sizeof(rd));
+        x->e_mbd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
+        rd.rate2 += x->ref_frame_cost[LAST_FRAME];
+        rd.rate2 += vp8_cost_mv_ref(ZEROMV, mdcounts);
+        x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+        x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+        this_rd = evaluate_inter_mode_rd(mdcounts, &rd, &disable_skip, cpi, x);
+        this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+                                           disable_skip, uv_intra_tteob,
+                                           intra_rd_penalty, cpi, x);
+        if (this_rd < best_mode.rd || x->skip)
+        {
+            // Note index of best mode so far
+            best_mode_index = mode_index;
+            *returnrate = rd.rate2;
+            *returndistortion = rd.distortion2;
+            update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+        }
+      }
+    }
+#endif
 
     if (cpi->is_src_frame_alt_ref &&
-        (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
+        (best_mode.mbmode.mode != ZEROMV || best_mode.mbmode.ref_frame != ALTREF_FRAME))
     {
         x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
         x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
@@ -2370,26 +2488,25 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
                                         (cpi->common.mb_no_coeff_skip);
         x->e_mbd.mode_info_context->mbmi.partitioning = 0;
-
         return;
     }
 
 
     // macroblock modes
-    vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+    vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
 
-    if (best_mbmode.mode == B_PRED)
+    if (best_mode.mbmode.mode == B_PRED)
     {
         for (i = 0; i < 16; i++)
-            xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+            xd->mode_info_context->bmi[i].as_mode = best_mode.bmodes[i].as_mode;
     }
 
-    if (best_mbmode.mode == SPLITMV)
+    if (best_mode.mbmode.mode == SPLITMV)
     {
         for (i = 0; i < 16; i++)
-            xd->mode_info_context->bmi[i].mv.as_int = best_bmodes[i].mv.as_int;
+            xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int;
 
-        vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
+        vpx_memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
 
         x->e_mbd.mode_info_context->mbmi.mv.as_int =
                                       x->partition_info->bmi[15].mv.as_int;
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 709f6e2b4..7e7def462 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -525,7 +525,8 @@ void vp8_temporal_filter_prepare_c
     {
         int which_buffer =  start_frame - frame;
         struct lookahead_entry* buf = vp8_lookahead_peek(cpi->lookahead,
-                                                         which_buffer);
+                                                         which_buffer,
+                                                         PEEK_FORWARD);
         cpi->frames[frames_to_blur-1-frame] = &buf->img;
     }
 
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index 967b6026a..efe2b4826 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -482,7 +482,7 @@ void vp8_tokenize_initialize()
 }
 
 
-static __inline void stuff2nd_order_b
+static void stuff2nd_order_b
 (
     TOKENEXTRA **tp,
     ENTROPY_CONTEXT *a,
@@ -506,7 +506,7 @@ static __inline void stuff2nd_order_b
 
 }
 
-static __inline void stuff1st_order_b
+static void stuff1st_order_b
 (
     TOKENEXTRA **tp,
     ENTROPY_CONTEXT *a,
@@ -530,7 +530,7 @@ static __inline void stuff1st_order_b
     *a = *l = pt;
 
 }
-static __inline
+static
 void stuff1st_order_buv
 (
     TOKENEXTRA **tp,
diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h
index 0aa19431c..48574f33c 100644
--- a/vp8/encoder/treewriter.h
+++ b/vp8/encoder/treewriter.h
@@ -42,7 +42,7 @@ typedef BOOL_CODER vp8_writer;
 
 /* Both of these return bits, not scaled bits. */
 
-static __inline unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p)
+static unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p)
 {
     /* Imitate existing calculation */
 
@@ -53,7 +53,7 @@ static __inline unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob
 /* Small functions to write explicit values and tokens, as well as
    estimate their lengths. */
 
-static __inline void vp8_treed_write
+static void vp8_treed_write
 (
     vp8_writer *const w,
     vp8_tree t,
@@ -72,7 +72,7 @@ static __inline void vp8_treed_write
     }
     while (n);
 }
-static __inline void vp8_write_token
+static void vp8_write_token
 (
     vp8_writer *const w,
     vp8_tree t,
@@ -83,7 +83,7 @@ static __inline void vp8_write_token
     vp8_treed_write(w, t, p, x->value, x->Len);
 }
 
-static __inline int vp8_treed_cost(
+static int vp8_treed_cost(
     vp8_tree t,
     const vp8_prob *const p,
     int v,
@@ -103,7 +103,7 @@ static __inline int vp8_treed_cost(
 
     return c;
 }
-static __inline int vp8_cost_token
+static int vp8_cost_token
 (
     vp8_tree t,
     const vp8_prob *const p,
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index f68d007c1..3403557e9 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -79,6 +79,7 @@ VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
@@ -112,6 +113,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
 ifeq ($(CONFIG_POSTPROC),yes)
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.c
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
 endif
 
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index fa78ec31c..683194a1d 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -183,14 +183,20 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 
     RANGE_CHECK_BOOL(vp8_cfg,               enable_auto_alt_ref);
     RANGE_CHECK(vp8_cfg, cpu_used,           -16, 16);
-
+#if CONFIG_TEMPORAL_DENOISING
+    RANGE_CHECK(vp8_cfg, noise_sensitivity, 0, 1);
+#endif
 #if !(CONFIG_REALTIME_ONLY)
     RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
+#if !(CONFIG_TEMPORAL_DENOISING)
     RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
+#endif
 #else
     RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_REAL_TIME_ENCODING, VP8_REAL_TIME_ENCODING);
+#if !(CONFIG_TEMPORAL_DENOISING)
     RANGE_CHECK(vp8_cfg, noise_sensitivity,  0, 0);
 #endif
+#endif
 
     RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
     RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index aad847a72..2e940d787 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -47,6 +47,8 @@ VP8_CX_SRCS-yes += encoder/firstpass.c
 VP8_CX_SRCS-yes += encoder/block.h
 VP8_CX_SRCS-yes += encoder/boolhuff.h
 VP8_CX_SRCS-yes += encoder/bitstream.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.c
 VP8_CX_SRCS-yes += encoder/encodeintra.h
 VP8_CX_SRCS-yes += encoder/encodemb.h
 VP8_CX_SRCS-yes += encoder/encodemv.h