56 files changed, 1323 insertions, 1406 deletions
diff --git a/README b/README
index dddc5eae4..d5ef59823 100644
--- a/README
+++ b/README
@@ -44,15 +44,9 @@ COMPILING THE APPLICATIONS/LIBRARIES:
 
     armv5te-linux-rvct
     armv5te-linux-gcc
-    armv5te-symbian-gcc
     armv6-darwin-gcc
     armv6-linux-rvct
     armv6-linux-gcc
-    armv6-symbian-gcc
-    iwmmxt-linux-rvct
-    iwmmxt-linux-gcc
-    iwmmxt2-linux-rvct
-    iwmmxt2-linux-gcc
     armv7-linux-rvct
     armv7-linux-gcc
     mips32-linux-gcc
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index cea967f93..c55ed0fe4 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -129,11 +129,14 @@ while (<STDIN>)
     # ARM code
     s/\sARM/.arm/g;
 
+    # eabi_attributes numerical equivalents can be found in the
+    # "ARM IHI 0045C" document.
+
     # REQUIRE8 Stack is required to be 8-byte aligned
-    s/\sREQUIRE8/.eabi_attribute Tag_ABI_align_needed, 1/g;
+    s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
 
     # PRESERVE8 Stack 8-byte align is preserved
-    s/\sPRESERVE8/.eabi_attribute Tag_ABI_align_preserved, 1/g;
+    s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
 
     # Use PROC and ENDP to give the symbols a .size directive.
     # This makes them show up properly in debugging tools like gdb and valgrind.
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 0426f9220..6039a5066 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -603,8 +603,8 @@ process_common_toolchain() {
 
     # Enable the architecture family
     case ${tgt_isa} in
-        arm*|iwmmxt*) enable arm;;
-    mips*)        enable mips;;
+        arm*) enable arm;;
+        mips*) enable mips;;
     esac
 
     # PIC is probably what we want when building shared libs
@@ -665,37 +665,25 @@ process_common_toolchain() {
 
     # Process ARM architecture variants
     case ${toolchain} in
-    arm*|iwmmxt*)
-    # on arm, isa versions are supersets
-    enabled armv7a && soft_enable armv7 ### DEBUG
-    enabled armv7 && soft_enable armv6
-    enabled armv7 || enabled armv6 && soft_enable armv5te
-    enabled armv7 || enabled armv6 && soft_enable fast_unaligned
-    enabled iwmmxt2 && soft_enable iwmmxt
-    enabled iwmmxt && soft_enable armv5te
+    arm*)
+        # on arm, isa versions are supersets
+        enabled armv7a && soft_enable armv7 ### DEBUG
+        enabled armv7 && soft_enable armv6
+        enabled armv7 || enabled armv6 && soft_enable armv5te
+        enabled armv7 || enabled armv6 && soft_enable fast_unaligned
 
-    asm_conversion_cmd="cat"
+        asm_conversion_cmd="cat"
 
         case ${tgt_cc} in
         gcc)
-        if enabled iwmmxt || enabled iwmmxt2
-            then
-                CROSS=${CROSS:-arm-iwmmxt-linux-gnueabi-}
-            elif enabled symbian; then
-                CROSS=${CROSS:-arm-none-symbianelf-}
-            else
-                CROSS=${CROSS:-arm-none-linux-gnueabi-}
-            fi
+            CROSS=${CROSS:-arm-none-linux-gnueabi-}
             link_with_cc=gcc
             setup_gnu_toolchain
             arch_int=${tgt_isa##armv}
             arch_int=${arch_int%%te}
             check_add_asflags --defsym ARCHITECTURE=${arch_int}
             tune_cflags="-mtune="
-        if enabled iwmmxt || enabled iwmmxt2
-            then
-                check_add_asflags -mcpu=${tgt_isa}
-            elif enabled armv7
+            if enabled armv7
             then
                 check_add_cflags -march=armv7-a -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-ftree-vectorize
                 check_add_asflags -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-march=armv7-a
@@ -802,19 +790,6 @@ process_common_toolchain() {
             fi
         ;;
 
-        symbian*)
-            enable symbian
-            # Add the paths for the alternate libc
-            for d in include/libc; do
-                try_dir="${alt_libc}/${d}"
-                [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
-            done
-            for d in release/armv5/urel; do
-                try_dir="${alt_libc}/${d}"
-                [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
-            done
-            add_cflags -DIMPORT_C=
-
         esac
     ;;
     mips*)
diff --git a/configure b/configure
index 6f20c6b77..68708907b 100755
--- a/configure
+++ b/configure
@@ -83,16 +83,10 @@ EOF
 all_platforms="${all_platforms} armv5te-linux-rvct"
 all_platforms="${all_platforms} armv5te-linux-gcc"
 all_platforms="${all_platforms} armv5te-none-rvct"
-all_platforms="${all_platforms} armv5te-symbian-gcc"
 all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
-all_platforms="${all_platforms} armv6-symbian-gcc"
-all_platforms="${all_platforms} iwmmxt-linux-rvct"
-all_platforms="${all_platforms} iwmmxt-linux-gcc"
-all_platforms="${all_platforms} iwmmxt2-linux-rvct"
-all_platforms="${all_platforms} iwmmxt2-linux-gcc"
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
@@ -198,8 +192,6 @@ ARCH_EXT_LIST="
     armv5te
     armv6
     armv7
-    iwmmxt
-    iwmmxt2
 
     mips32
 
diff --git a/examples/postproc.txt b/examples/postproc.txt
index 0940ea24c..51b251a04 100644
--- a/examples/postproc.txt
+++ b/examples/postproc.txt
@@ -58,7 +58,7 @@ if(frame_cnt%30 == 1) {
     if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
         die_codec(&codec, "Failed to turn off postproc");
 } else if(frame_cnt%30 == 16) {
-    vp8_postproc_cfg_t  pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0};
+    vp8_postproc_cfg_t  pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, 0};
 
     if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
         die_codec(&codec, "Failed to turn on postproc");
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
index 97a3559a4..b606aaca0 100644
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -43,6 +43,8 @@ void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
 
     vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
     vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+    if (oci->post_proc_buffer_int_used)
+        vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
 
     vpx_free(oci->above_context);
     vpx_free(oci->mip);
@@ -101,6 +103,8 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
         return 1;
     }
 
+    oci->post_proc_buffer_int_used = 0;
+
     oci->mb_rows = height >> 4;
     oci->mb_cols = width >> 4;
     oci->MBs = oci->mb_rows * oci->mb_cols;
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index 1e2467411..89a2be825 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -11,7 +11,6 @@
 
 #include "vpx_config.h"
 #include "vpx_ports/arm.h"
-#include "vp8/common/g_common.h"
 #include "vp8/common/pragmas.h"
 #include "vp8/common/subpixel.h"
 #include "vp8/common/loopfilter.h"
diff --git a/vp8/common/arm/dequantize_arm.c b/vp8/common/arm/dequantize_arm.c
index 20a8ac4fc..7cf4bf943 100644
--- a/vp8/common/arm/dequantize_arm.c
+++ b/vp8/common/arm/dequantize_arm.c
@@ -23,22 +23,20 @@ extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
 
 #if HAVE_ARMV7
 
-void vp8_dequantize_b_neon(BLOCKD *d)
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
 {
     short *DQ  = d->dqcoeff;
     short *Q   = d->qcoeff;
-    short *DQC = d->dequant;
 
     vp8_dequantize_b_loop_neon(Q, DQC, DQ);
 }
 #endif
 
 #if HAVE_ARMV6
-void vp8_dequantize_b_v6(BLOCKD *d)
+void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
 {
     short *DQ  = d->dqcoeff;
     short *Q   = d->qcoeff;
-    short *DQC = d->dequant;
 
     vp8_dequantize_b_loop_v6(Q, DQC, DQ);
 }
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 99b731c78..b237206e6 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -184,7 +184,6 @@ typedef struct
     short *qcoeff;
     short *dqcoeff;
     unsigned char  *predictor;
-    short *diff;
     short *dequant;
 
     /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
@@ -203,12 +202,16 @@ typedef struct
 
 typedef struct MacroBlockD
 {
-    DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
     DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
     DECLARE_ALIGNED(16, short, qcoeff[400]);
     DECLARE_ALIGNED(16, short, dqcoeff[400]);
     DECLARE_ALIGNED(16, char,  eobs[25]);
 
+    DECLARE_ALIGNED(16, short,  dequant_y1[16]);
+    DECLARE_ALIGNED(16, short,  dequant_y1_dc[16]);
+    DECLARE_ALIGNED(16, short,  dequant_y2[16]);
+    DECLARE_ALIGNED(16, short,  dequant_uv[16]);
+
     /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
     BLOCKD block[25];
     int fullpixel_mask;
diff --git a/vp8/common/dequantize.c b/vp8/common/dequantize.c
index 4a48a3192..96245162f 100644
--- a/vp8/common/dequantize.c
+++ b/vp8/common/dequantize.c
@@ -14,12 +14,11 @@
 #include "vp8/common/idct.h"
 #include "vpx_mem/vpx_mem.h"
 
-void vp8_dequantize_b_c(BLOCKD *d)
+void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
 {
     int i;
     short *DQ  = d->dqcoeff;
     short *Q   = d->qcoeff;
-    short *DQC = d->dequant;
 
     for (i = 0; i < 16; i++)
     {
diff --git a/vp8/common/dequantize.h b/vp8/common/dequantize.h
index f66cf2bac..429359190 100644
--- a/vp8/common/dequantize.h
+++ b/vp8/common/dequantize.h
@@ -14,7 +14,7 @@
 #include "vp8/common/blockd.h"
 
 #define prototype_dequant_block(sym) \
-    void sym(BLOCKD *x)
+    void sym(BLOCKD *x, short *DQC)
 
 #define prototype_dequant_idct_add(sym) \
     void sym(short *input, short *dq, \
diff --git a/vp8/common/g_common.h b/vp8/common/g_common.h
deleted file mode 100644
index 5f523980b..000000000
--- a/vp8/common/g_common.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-extern void (*vp8_clear_system_state)(void);
-extern void (*vp8_plane_add_noise)(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int DPitch, int q);
-extern void (*de_interlace)
-(
-    unsigned char *src_ptr,
-    unsigned char *dst_ptr,
-    int Width,
-    int Height,
-    int Stride
-);
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index dbf8d6504..01d76206d 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -10,7 +10,6 @@
 
 
 #include "vpx_config.h"
-#include "vp8/common/g_common.h"
 #include "vp8/common/subpixel.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/recon.h"
diff --git a/vp8/common/invtrans.h b/vp8/common/invtrans.h
index 7eec58e26..f49e2e577 100644
--- a/vp8/common/invtrans.h
+++ b/vp8/common/invtrans.h
@@ -17,6 +17,10 @@
 #include "blockd.h"
 #include "onyxc_int.h"
 
+#if CONFIG_MULTITHREAD
+#include "vpx_mem/vpx_mem.h"
+#endif
+
 static void eob_adjust(char *eobs, short *diff)
 {
     /* eob adjust.... the idct can only skip if both the dc and eob are zero */
@@ -32,9 +36,7 @@ static void eob_adjust(char *eobs, short *diff)
 static void vp8_inverse_transform_mby(MACROBLOCKD *xd,
                                       const VP8_COMMON_RTCD *rtcd)
 {
-    short *DQC = xd->block[0].dequant;
-    /* save the dc dequant constant in case it is overridden */
-    short dc_dequant_temp = DQC[0];
+    short *DQC = xd->dequant_y1;
 
     if (xd->mode_info_context->mbmi.mode != SPLITMV)
     {
@@ -51,15 +53,11 @@ static void vp8_inverse_transform_mby(MACROBLOCKD *xd,
         }
         eob_adjust(xd->eobs, xd->qcoeff);
 
-        /* override the dc dequant constant */
-        DQC[0] = 1;
+        DQC = xd->dequant_y1_dc;
     }
     DEQUANT_INVOKE (&rtcd->dequant, idct_add_y_block)
-                    (xd->qcoeff, xd->block[0].dequant,
+                    (xd->qcoeff, DQC,
                      xd->dst.y_buffer,
                      xd->dst.y_stride, xd->eobs);
-
-    /* restore the dc dequant constant */
-    DQC[0] = dc_dequant_temp;
 }
 #endif
diff --git a/vp8/common/mbpitch.c b/vp8/common/mbpitch.c
index 11fa3ffa7..f8971d754 100644
--- a/vp8/common/mbpitch.c
+++ b/vp8/common/mbpitch.c
@@ -87,7 +87,6 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
     {
         for (c = 0; c < 4; c++)
         {
-            x->block[r*4+c].diff      = &x->diff[r * 4 * 16 + c * 4];
             x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
         }
     }
@@ -96,7 +95,6 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
     {
         for (c = 0; c < 2; c++)
         {
-            x->block[16+r*2+c].diff      = &x->diff[256 + r * 4 * 8 + c * 4];
             x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
 
         }
@@ -106,14 +104,11 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
     {
         for (c = 0; c < 2; c++)
         {
-            x->block[20+r*2+c].diff      = &x->diff[320+ r * 4 * 8 + c * 4];
             x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
 
         }
     }
 
-    x->block[24].diff = &x->diff[384];
-
     for (r = 0; r < 25; r++)
     {
         x->block[r].qcoeff  = x->qcoeff  + r * 16;
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index d17a32b82..eb7d5458d 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -147,10 +147,14 @@ extern "C"
         int over_shoot_pct;
 
         // buffering parameters
-        int64_t starting_buffer_level;  // in seconds
+        int64_t starting_buffer_level;  // in bytes
         int64_t optimal_buffer_level;
         int64_t maximum_buffer_size;
 
+        int64_t starting_buffer_level_in_ms;  // in milli-seconds
+        int64_t optimal_buffer_level_in_ms;
+        int64_t maximum_buffer_size_in_ms;
+
         // controlling quality
         int fixed_q;
         int worst_allowed_q;
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index f733ff774..f91383de8 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -93,9 +93,9 @@ typedef struct VP8Common
 {
     struct vpx_internal_error_info  error;
 
-    DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
+    DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
+    DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);
 
     int Width;
     int Height;
@@ -114,6 +114,8 @@ typedef struct VP8Common
     YV12_BUFFER_CONFIG post_proc_buffer;
     YV12_BUFFER_CONFIG temp_scale_frame;
 
+    YV12_BUFFER_CONFIG post_proc_buffer_int;
+    int post_proc_buffer_int_used;
 
     FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
     FRAME_TYPE frame_type;
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index ace4c113c..cb81cb52a 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -12,9 +12,12 @@
 #include "vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "postproc.h"
+#include "common.h"
+#include "recon.h"
 #include "vpx_scale/yv12extend.h"
 #include "vpx_scale/vpxscale.h"
 #include "systemdependent.h"
+#include "../encoder/variance.h"
 
 #include <math.h>
 #include <stdlib.h>
@@ -26,6 +29,7 @@
     ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
 
 /* global constants */
+#define MFQE_PRECISION 4
 #if CONFIG_POSTPROC_VISUALIZER
 static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
 {
@@ -121,7 +125,6 @@ const short vp8_rv[] =
     0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
 };
 
-
 extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
 extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
 /***********************************************************************************************************
@@ -323,11 +326,11 @@ static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG         *source,
 }
 
 void vp8_deblock(YV12_BUFFER_CONFIG         *source,
-                        YV12_BUFFER_CONFIG         *post,
-                        int                         q,
-                        int                         low_var_thresh,
-                        int                         flag,
-                        vp8_postproc_rtcd_vtable_t *rtcd)
+                 YV12_BUFFER_CONFIG         *post,
+                 int                         q,
+                 int                         low_var_thresh,
+                 int                         flag,
+                 vp8_postproc_rtcd_vtable_t *rtcd)
 {
     double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
     int ppl = (int)(level + .5);
@@ -671,6 +674,128 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
     }
 }
 
+int vp8_references_buffer( VP8_COMMON *oci, int ref_frame )
+{
+    const MODE_INFO *mi = oci->mi;
+    int mb_row, mb_col;
+
+    for (mb_row = 0; mb_row < oci->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < oci->mb_cols; mb_col++,mi++)
+        {
+            if( mi->mbmi.ref_frame == ref_frame)
+              return 1;
+        }
+        mi++;
+    }
+    return 0;
+
+}
+
+static void multiframe_quality_enhance_block
+(
+    int blksize, /* Currently only values supported are 16, 8, 4 */
+    int qcurr,
+    int qprev,
+    unsigned char *y,
+    unsigned char *u,
+    unsigned char *v,
+    int y_stride,
+    int uv_stride,
+    unsigned char *yd,
+    unsigned char *ud,
+    unsigned char *vd,
+    int yd_stride,
+    int uvd_stride
+)
+{
+    static const unsigned char VP8_ZEROS[16]=
+    {
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+    };
+    int blksizeby2 = blksize >> 1;
+    int blksizesq = blksize * blksize;
+
+    int i, j;
+    unsigned char *yp;
+    unsigned char *ydp;
+    unsigned char *up;
+    unsigned char *udp;
+    unsigned char *vp;
+    unsigned char *vdp;
+
+    unsigned int act, sse, sad, thr;
+    if (blksize == 16)
+    {
+        act = vp8_variance_var16x16(y, y_stride, VP8_ZEROS, 0, &sse);
+        sad = vp8_variance_sad16x16(y, y_stride, yd, yd_stride, 0);
+    }
+    else if (blksize == 8)
+    {
+        act = vp8_variance_var8x8(y, y_stride, VP8_ZEROS, 0, &sse);
+        sad = vp8_variance_sad8x8(y, y_stride, yd, yd_stride, 0);
+    }
+    else
+    {
+        act = vp8_variance_var4x4(y, y_stride, VP8_ZEROS, 0, &sse);
+        sad = vp8_variance_sad4x4(y, y_stride, yd, yd_stride, 0);
+    }
+
+    thr = 6 * blksizesq + (act >> 3);
+    if (thr > 12 * blksizesq) thr = 12 * blksizesq;
+    // These thresholds should be adapted later based on qcurr and qprev
+    if (sad < thr)
+    {
+        static const int roundoff = (1 << (MFQE_PRECISION - 1));
+        int ifactor = (sad << MFQE_PRECISION) / thr;
+        // TODO: SIMD optimize this section
+        if (ifactor)
+        {
+            int icfactor = (1 << MFQE_PRECISION) - ifactor;
+            for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
+            {
+                for (j = 0; j < blksize; ++j)
+                    ydp[j] = (int)((yp[j] * ifactor + ydp[j] * icfactor + roundoff) >> MFQE_PRECISION);
+            }
+            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
+            {
+                for (j = 0; j < blksizeby2; ++j)
+                    udp[j] = (int)((up[j] * ifactor + udp[j] * icfactor + roundoff) >> MFQE_PRECISION);
+            }
+            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
+            {
+                for (j = 0; j < blksizeby2; ++j)
+                    vdp[j] = (int)((vp[j] * ifactor + vdp[j] * icfactor + roundoff) >> MFQE_PRECISION);
+            }
+        }
+    }
+    else
+    {
+        if (blksize == 16)
+        {
+            vp8_recon_copy16x16(y, y_stride, yd, yd_stride);
+            vp8_recon_copy8x8(u, uv_stride, ud, uvd_stride);
+            vp8_recon_copy8x8(v, uv_stride, vd, uvd_stride);
+        }
+        else if (blksize == 8)
+        {
+            vp8_recon_copy8x8(y, y_stride, yd, yd_stride);
+            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
+                vpx_memcpy(udp, up, blksizeby2);
+            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
+                vpx_memcpy(vdp, vp, blksizeby2);
+        }
+        else
+        {
+            for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
+                vpx_memcpy(ydp, yp, blksize);
+            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
+                vpx_memcpy(udp, up, blksizeby2);
+            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
+                vpx_memcpy(vdp, vp, blksizeby2);
+        }
+    }
+}
 
 #if CONFIG_RUNTIME_CPU_DETECT
 #define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
@@ -678,6 +803,104 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
 #define RTCD_VTABLE(oci) NULL
 #endif
 
+void vp8_multiframe_quality_enhance
+(
+    VP8_COMMON *cm
+)
+{
+    YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+    YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+    FRAME_TYPE frame_type = cm->frame_type;
+    /* Point at base of Mb MODE_INFO list has motion vectors etc */
+    const MODE_INFO *mode_info_context = cm->mi;
+    int mb_row;
+    int mb_col;
+    int qcurr = cm->base_qindex;
+    int qprev = cm->postproc_state.last_base_qindex;
+
+    unsigned char *y_ptr, *u_ptr, *v_ptr;
+    unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
+
+    /* Set up the buffer pointers */
+    y_ptr = show->y_buffer;
+    u_ptr = show->u_buffer;
+    v_ptr = show->v_buffer;
+    yd_ptr = dest->y_buffer;
+    ud_ptr = dest->u_buffer;
+    vd_ptr = dest->v_buffer;
+
+    /* postprocess each macro block */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            /* if motion is high there will likely be no benefit */
+            if (((frame_type == INTER_FRAME &&
+                  abs(mode_info_context->mbmi.mv.as_mv.row) <= 10 &&
+                  abs(mode_info_context->mbmi.mv.as_mv.col) <= 10) ||
+                 (frame_type == KEY_FRAME)) &&
+                mode_info_context->mbmi.mode != B_PRED)
+            {
+                multiframe_quality_enhance_block(16,
+                                                 qcurr,
+                                                 qprev,
+                                                 y_ptr,
+                                                 u_ptr,
+                                                 v_ptr,
+                                                 show->y_stride,
+                                                 show->uv_stride,
+                                                 yd_ptr,
+                                                 ud_ptr,
+                                                 vd_ptr,
+                                                 dest->y_stride,
+                                                 dest->uv_stride);
+            }
+            else if (mode_info_context->mbmi.mode == B_PRED)
+            {
+                int i, j;
+                for (i=0; i<2; ++i)
+                    for (j=0; j<2; ++j)
+                        multiframe_quality_enhance_block(8,
+                                                         qcurr,
+                                                         qprev,
+                                                         y_ptr + 8*(i*show->y_stride+j),
+                                                         u_ptr + 4*(i*show->uv_stride+j),
+                                                         v_ptr + 4*(i*show->uv_stride+j),
+                                                         show->y_stride,
+                                                         show->uv_stride,
+                                                         yd_ptr + 8*(i*dest->y_stride+j),
+                                                         ud_ptr + 4*(i*dest->uv_stride+j),
+                                                         vd_ptr + 4*(i*dest->uv_stride+j),
+                                                         dest->y_stride,
+                                                         dest->uv_stride);
+            }
+            else
+            {
+                vp8_recon_copy16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+                vp8_recon_copy8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+                vp8_recon_copy8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+            }
+            y_ptr += 16;
+            u_ptr += 8;
+            v_ptr += 8;
+            yd_ptr += 16;
+            ud_ptr += 8;
+            vd_ptr += 8;
+            mode_info_context++;     /* step to next MB */
+        }
+
+        y_ptr += show->y_stride  * 16 - 16 * cm->mb_cols;
+        u_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
+        v_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
+        yd_ptr += dest->y_stride  * 16 - 16 * cm->mb_cols;
+        ud_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
+        vd_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
+
+        mode_info_context++;         /* Skip border mb */
+    }
+}
+
 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
 {
     int q = oci->filter_level * 10 / 6;
@@ -699,27 +922,65 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
         dest->y_width = oci->Width;
         dest->y_height = oci->Height;
         dest->uv_height = dest->y_height / 2;
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
         return 0;
+    }
 
+    /* Allocate post_proc_buffer_int if needed */
+    if ((flags & VP8D_MFQE) && !oci->post_proc_buffer_int_used)
+    {
+        if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK))
+        {
+            if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int, oci->Width, oci->Height, VP8BORDERINPIXELS) >= 0)
+            {
+                oci->post_proc_buffer_int_used = 1;
+            }
+        }
     }
 
 #if ARCH_X86||ARCH_X86_64
     vpx_reset_mmx_state();
 #endif
 
-    if (flags & VP8D_DEMACROBLOCK)
+    if ((flags & VP8D_MFQE) &&
+         oci->current_video_frame >= 2 &&
+         oci->base_qindex - oci->postproc_state.last_base_qindex >= 10)
+    {
+        vp8_multiframe_quality_enhance(oci);
+        if (((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) &&
+            oci->post_proc_buffer_int_used)
+        {
+            vp8_yv12_copy_frame_ptr(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
+            if (flags & VP8D_DEMACROBLOCK)
+            {
+                vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
+                                               q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci));
+            }
+            else if (flags & VP8D_DEBLOCK)
+            {
+                vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
+                            q, 1, 0, RTCD_VTABLE(oci));
+            }
+        }
+        /* Move partially towards the base q of the previous frame */
+        oci->postproc_state.last_base_qindex = (3*oci->postproc_state.last_base_qindex + oci->base_qindex)>>2;
+    }
+    else if (flags & VP8D_DEMACROBLOCK)
     {
         vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
                                        q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci));
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
     }
     else if (flags & VP8D_DEBLOCK)
     {
         vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
                     q, 1, 0, RTCD_VTABLE(oci));
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
     }
     else
     {
         vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
     }
 
     if (flags & VP8D_ADDNOISE)
diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h
index c641b9ca5..d5aaf6216 100644
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -104,6 +104,7 @@ struct postproc_state
     int           last_q;
     int           last_noise;
     char          noise[3072];
+    int           last_base_qindex;
     DECLARE_ALIGNED(16, char, blackclamp[16]);
     DECLARE_ALIGNED(16, char, whiteclamp[16]);
     DECLARE_ALIGNED(16, char, bothclamp[16]);
diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c
index 1f5d79068..7046a63e8 100644
--- a/vp8/common/ppc/systemdependent.c
+++ b/vp8/common/ppc/systemdependent.c
@@ -9,7 +9,6 @@
  */
 
 
-#include "g_common.h"
 #include "subpixel.h"
 #include "loopfilter.h"
 #include "recon.h"
diff --git a/vp8/common/ppflags.h b/vp8/common/ppflags.h
index 65b0cab6a..665e21fd9 100644
--- a/vp8/common/ppflags.h
+++ b/vp8/common/ppflags.h
@@ -23,7 +23,8 @@ enum
     VP8D_DEBUG_TXT_RATE_INFO    = 1<<6,
     VP8D_DEBUG_DRAW_MV          = 1<<7,
     VP8D_DEBUG_CLR_BLK_MODES    = 1<<8,
-    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
+    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,
+    VP8D_MFQE                   = 1<<10
 };
 
 typedef struct
diff --git a/vp8/common/x86/idct_blk_mmx.c b/vp8/common/x86/idct_blk_mmx.c
index 49cebd6f5..8ff483708 100644
--- a/vp8/common/x86/idct_blk_mmx.c
+++ b/vp8/common/x86/idct_blk_mmx.c
@@ -14,12 +14,12 @@
 
 extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
 
-void vp8_dequantize_b_mmx(BLOCKD *d)
+void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
 {
     short *sq = (short *) d->qcoeff;
     short *dq = (short *) d->dqcoeff;
-    short *q = (short *) d->dequant;
-    vp8_dequantize_b_impl_mmx(sq, dq, q);
+
+    vp8_dequantize_b_impl_mmx(sq, dq, DQC);
 }
 
 void vp8_dequant_idct_add_y_block_mmx
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 86927d9f1..2ad010adb 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -1385,52 +1385,54 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
     SHADOW_ARGS_TO_STACK 3
     SAVE_XMM 7
     GET_GOT     rbx
-    push        rsi
-    push        rdi
     ; end prolog
 
-        mov         rsi, arg(0)             ;src_ptr
+        mov         rcx, arg(0)             ;src_ptr
         movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
-        mov         rdx, arg(2)             ;blimit
-        movdqa      xmm3, XMMWORD PTR [rdx]
 
-        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
+        lea         rdx, [rcx + rax]
         neg         rax
 
         ; calculate mask
-        movdqa      xmm1, [rsi+2*rax]       ; p1
-        movdqa      xmm0, [rdi]             ; q1
+        movdqa      xmm0, [rdx]             ; q1
+        mov         rdx, arg(2)             ;blimit
+        movdqa      xmm1, [rcx+2*rax]       ; p1
+
         movdqa      xmm2, xmm1
         movdqa      xmm7, xmm0
-        movdqa      xmm4, xmm0
+
         psubusb     xmm0, xmm1              ; q1-=p1
-        psubusb     xmm1, xmm4              ; p1-=q1
+        psubusb     xmm1, xmm7              ; p1-=q1
         por         xmm1, xmm0              ; abs(p1-q1)
         pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
         psrlw       xmm1, 1                 ; abs(p1-q1)/2
 
-        movdqa      xmm5, [rsi+rax]         ; p0
-        movdqa      xmm4, [rsi]             ; q0
+        movdqa      xmm3, XMMWORD PTR [rdx]
+
+        movdqa      xmm5, [rcx+rax]         ; p0
+        movdqa      xmm4, [rcx]             ; q0
         movdqa      xmm0, xmm4              ; q0
         movdqa      xmm6, xmm5              ; p0
         psubusb     xmm5, xmm4              ; p0-=q0
         psubusb     xmm4, xmm6              ; q0-=p0
         por         xmm5, xmm4              ; abs(p0 - q0)
+
+        movdqa      xmm4, [GLOBAL(t80)]
+
         paddusb     xmm5, xmm5              ; abs(p0-q0)*2
         paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
         psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        xmm3, xmm3
         pcmpeqb     xmm5, xmm3
 
+
         ; start work on filters
-        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
+        pxor        xmm2, xmm4     ; p1 offset to convert to signed values
+        pxor        xmm7, xmm4     ; q1 offset to convert to signed values
         psubsb      xmm2, xmm7              ; p1 - q1
 
-        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
-        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
+        pxor        xmm6, xmm4     ; offset to convert to signed values
+        pxor        xmm0, xmm4     ; offset to convert to signed values
         movdqa      xmm3, xmm0              ; q0
         psubsb      xmm0, xmm6              ; q0 - p0
         paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
@@ -1438,42 +1440,36 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
         paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
         pand        xmm5, xmm2              ; mask filter values we don't care about
 
-        ; do + 4 side
-        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        movdqa      xmm1, xmm5              ; get a copy of filters
-        psraw       xmm1, 11                ; arithmetic shift right 11
-        psllw       xmm1, 8                 ; shift left 8 to put it back
-
-        por         xmm0, xmm1              ; put the two together to get result
+        paddsb      xmm5, [GLOBAL(t4)]      ;  3* (q0 - p0) + (p1 - q1) + 4
+        movdqa      xmm0, xmm5
+        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
 
-        psubsb      xmm3, xmm0              ; q0-= q0 add
-        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi], xmm3             ; write back
+        movdqa      xmm1, [GLOBAL(te0)]
+        movdqa      xmm2, [GLOBAL(t1f)]
 
-        ; now do +3 side
-        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm0              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm0, 3
+        pand        xmm0, xmm2              ;clear out upper 3 bits
+        por         xmm0, xmm7              ;add sign
+        psubsb      xmm3, xmm0              ; q0-= q0sz add
 
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        psraw       xmm5, 11                ; arithmetic shift right 11
-        psllw       xmm5, 8                 ; shift left 8 to put it back
-        por         xmm0, xmm5              ; put the two together to get result
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm5              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm5, 3
+        pand        xmm5, xmm2              ;clear out upper 3 bits
+        por         xmm5, xmm7              ;add sign
+        paddsb      xmm6, xmm5              ; p0+= p0 add
 
+        pxor        xmm3, xmm4     ; unoffset
+        movdqa      [rcx], xmm3             ; write back
 
-        paddsb      xmm6, xmm0              ; p0+= p0 add
-        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi+rax], xmm6         ; write back
+        pxor        xmm6, xmm4     ; unoffset
+        movdqa      [rcx+rax], xmm6         ; write back
 
     ; begin epilog
-    pop rdi
-    pop rsi
     RESTORE_GOT
     RESTORE_XMM
     UNSHADOW_ARGS
@@ -1536,9 +1532,6 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
         punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
         punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
 
-        movdqa      t0,         xmm0                    ; save to t0
-        movdqa      t1,         xmm2                    ; save to t1
-
         lea         rsi,        [rsi + rax*8]
         lea         rdi,        [rsi + rax]
         lea         rdx,        [rsi + rax*4]
@@ -1551,26 +1544,24 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
         punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
         punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
 
-        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
+        movd        xmm1,       [rsi + rax*2]           ; a3 a2 a1 a0
         movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
-        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
+        movd        xmm3,       [rdi + rax*2]           ; b3 b2 b1 b0
         movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
-        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
-        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
+        punpckldq   xmm1,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
+        punpckldq   xmm3,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
 
         punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
-        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+        punpcklbw   xmm1,       xmm3                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
 
-        movdqa      xmm1,       xmm4
-        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+        movdqa      xmm7,       xmm4
+        punpcklwd   xmm4,       xmm1                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+        punpckhwd   xmm7,       xmm1                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
 
         movdqa      xmm6,       xmm4
-        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+        punpckldq   xmm4,       xmm7                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+        punpckhdq   xmm6,       xmm7                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
 
-        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
         movdqa      xmm1,       xmm0
         movdqa      xmm3,       xmm2
 
@@ -1579,6 +1570,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
         punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
         punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
 
+        mov         rdx,        arg(2)                          ;blimit
+
         ; calculate mask
         movdqa      xmm6,       xmm0                            ; p1
         movdqa      xmm7,       xmm3                            ; q1
@@ -1588,6 +1581,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
         pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
         psrlw       xmm6,       1                               ; abs(p1-q1)/2
 
+        movdqa      xmm7, [rdx]
+
         movdqa      xmm5,       xmm1                            ; p0
         movdqa      xmm4,       xmm2                            ; q0
         psubusb     xmm5,       xmm2                            ; p0-=q0
@@ -1596,8 +1591,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
         paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
         paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        mov         rdx,        arg(2)                          ;blimit
-        movdqa      xmm7, XMMWORD PTR [rdx]
+        movdqa      xmm4, [GLOBAL(t80)]
 
         psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        xmm7,        xmm7
@@ -1607,59 +1601,48 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
         movdqa        t0,        xmm0
         movdqa        t1,        xmm3
 
-        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
-        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
-
+        pxor        xmm0,        xmm4                  ; p1 offset to convert to signed values
+        pxor        xmm3,        xmm4                  ; q1 offset to convert to signed values
         psubsb      xmm0,        xmm3                           ; p1 - q1
-        movdqa      xmm6,        xmm1                           ; p0
-
-        movdqa      xmm7,        xmm2                           ; q0
-        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
 
-        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
-        movdqa      xmm3,        xmm7                           ; offseted ; q0
-
-        psubsb      xmm7,        xmm6                           ; q0 - p0
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
+        movdqa      xmm6,        xmm1                           ; p0
+;        movdqa      xmm7,        xmm2                           ; q0
 
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
+        pxor        xmm6,        xmm4                  ; offset to convert to signed values
+        pxor        xmm2,        xmm4                  ; offset to convert to signed values
 
+        movdqa      xmm3,        xmm2                           ; offseted ; q0
+        psubsb      xmm2,        xmm6                           ; q0 - p0
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 3 * (q0 - p0)
         pand        xmm5,        xmm0                           ; mask filter values we don't care about
 
-
         paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-        psllw       xmm0,        8                              ; shift left 8
-
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-        psrlw       xmm0,        8
-
-        movdqa      xmm7,        xmm5                           ; get a copy of filters
-        psraw       xmm7,        11                             ; arithmetic shift right 11
-
-        psllw       xmm7,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm7                           ; put the two together to get result
-
-        psubsb      xmm3,        xmm0                           ; q0-= q0sz add
-        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
-
-        ; now do +3 side
+        movdqa      xmm0, xmm5
         psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
 
-        psllw       xmm0,        8                              ; shift left 8
-        psraw       xmm0,        3                              ; arithmetic shift right 11
+        movdqa  xmm1, [GLOBAL(te0)]
+        movdqa  xmm2, [GLOBAL(t1f)]
 
-        psrlw       xmm0,        8
-        psraw       xmm5,        11                             ; arithmetic shift right 11
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm0              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm0, 3
+        pand        xmm0, xmm2              ;clear out upper 3 bits
+        por         xmm0, xmm7              ;add sign
+        psubsb      xmm3, xmm0              ; q0-= q0sz add
 
-        psllw       xmm5,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm5                           ; put the two together to get result
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm5              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm5, 3
+        pand        xmm5, xmm2              ;clear out upper 3 bits
+        por         xmm5, xmm7              ;add sign
+        paddsb      xmm6, xmm5              ; p0+= p0 add
 
-        paddsb      xmm6,        xmm0                           ; p0+= p0 add
-        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
+        pxor        xmm3,        xmm4                  ; unoffset   q0
+        pxor        xmm6,        xmm4                  ; unoffset   p0
 
         movdqa      xmm0,        t0                             ; p1
         movdqa      xmm4,        t1                             ; q1
@@ -1763,3 +1746,9 @@ s9:
 align 16
 s63:
     times 8 dw 0x003f
+align 16
+te0:
+    times 16 db 0xe0
+align 16
+t1f:
+    times 16 db 0x1f
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index a82c1b4fd..4b68ef5f2 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -559,12 +559,492 @@ sym(vp8_intra_pred_uv_ho_%1):
 vp8_intra_pred_uv_ho mmx2
 vp8_intra_pred_uv_ho ssse3
 
+;void vp8_intra_pred_y_dc_sse2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_y_dc_sse2)
+sym(vp8_intra_pred_y_dc_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        xmm0,       xmm0
+    movdqa      xmm1,       [rsi]
+    psadbw      xmm1,       xmm0
+    movq        xmm2,       xmm1
+    punpckhqdq  xmm1,       xmm1
+    paddw       xmm1,       xmm2
+
+    ; from left
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi+rax]
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*4]
+    add         ecx,        edx
+
+    ; add up
+    pextrw      edx,        xmm1, 0x0
+    lea         edx,        [edx+ecx+16]
+    sar         edx,        5
+    movd        xmm1,       edx
+    ; FIXME use pshufb for ssse3 version
+    pshuflw     xmm1,       xmm1, 0x0
+    punpcklqdq  xmm1,       xmm1
+    packuswb    xmm1,       xmm1
+
+    ; write out
+    mov         rsi,        2
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+.label
+    movdqa [rdi      ],     xmm1
+    movdqa [rdi+rcx  ],     xmm1
+    movdqa [rdi+rcx*2],     xmm1
+    movdqa [rdi+rax  ],     xmm1
+    lea         rdi,        [rdi+rcx*4]
+    movdqa [rdi      ],     xmm1
+    movdqa [rdi+rcx  ],     xmm1
+    movdqa [rdi+rcx*2],     xmm1
+    movdqa [rdi+rax  ],     xmm1
+    lea         rdi,        [rdi+rcx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_dctop_sse2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_y_dctop_sse2)
+sym(vp8_intra_pred_y_dctop_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    GET_GOT     rbx
+    ; end prolog
+
+    ; from top
+    mov         rcx,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rcx,        rax
+    pxor        xmm0,       xmm0
+    movdqa      xmm1,       [rcx]
+    psadbw      xmm1,       xmm0
+    movdqa      xmm2,       xmm1
+    punpckhqdq  xmm1,       xmm1
+    paddw       xmm1,       xmm2
+
+    ; add up
+    paddw       xmm1,       [GLOBAL(dc_8)]
+    psraw       xmm1,       4
+    ; FIXME use pshufb for ssse3 version
+    pshuflw     xmm1,       xmm1, 0x0
+    punpcklqdq  xmm1,       xmm1
+    packuswb    xmm1,       xmm1
+
+    ; write out
+    mov         rsi,        2
+    mov         rdx,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+.label
+    movdqa [rdx      ],     xmm1
+    movdqa [rdx+rcx  ],     xmm1
+    movdqa [rdx+rcx*2],     xmm1
+    movdqa [rdx+rax  ],     xmm1
+    lea         rdx,        [rdx+rcx*4]
+    movdqa [rdx      ],     xmm1
+    movdqa [rdx+rcx  ],     xmm1
+    movdqa [rdx+rcx*2],     xmm1
+    movdqa [rdx+rax  ],     xmm1
+    lea         rdx,        [rdx+rcx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    RESTORE_GOT
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_dcleft_sse2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_y_dcleft_sse2)
+sym(vp8_intra_pred_y_dcleft_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from left
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi]
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    lea         edx,        [ecx+edx+8]
+
+    ; add up
+    shr         edx,        4
+    movd        xmm1,       edx
+    ; FIXME use pshufb for ssse3 version
+    pshuflw     xmm1,       xmm1, 0x0
+    punpcklqdq  xmm1,       xmm1
+    packuswb    xmm1,       xmm1
+
+    ; write out
+    mov         rsi,        2
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+.label
+    movdqa [rdi      ],     xmm1
+    movdqa [rdi+rcx  ],     xmm1
+    movdqa [rdi+rcx*2],     xmm1
+    movdqa [rdi+rax  ],     xmm1
+    lea         rdi,        [rdi+rcx*4]
+    movdqa [rdi      ],     xmm1
+    movdqa [rdi+rcx  ],     xmm1
+    movdqa [rdi+rcx*2],     xmm1
+    movdqa [rdi+rax  ],     xmm1
+    lea         rdi,        [rdi+rcx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_dc128_sse2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_y_dc128_sse2)
+sym(vp8_intra_pred_y_dc128_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    GET_GOT     rbx
+    ; end prolog
+
+    ; write out
+    mov         rsi,        2
+    movdqa      xmm1,       [GLOBAL(dc_128)]
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+.label
+    movdqa [rax      ],     xmm1
+    movdqa [rax+rdx  ],     xmm1
+    movdqa [rax+rdx*2],     xmm1
+    movdqa [rax+rcx  ],     xmm1
+    lea         rax,        [rax+rdx*4]
+    movdqa [rax      ],     xmm1
+    movdqa [rax+rdx  ],     xmm1
+    movdqa [rax+rdx*2],     xmm1
+    movdqa [rax+rcx  ],     xmm1
+    lea         rax,        [rax+rdx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    RESTORE_GOT
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_tm_sse2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+%macro vp8_intra_pred_y_tm 1
+global sym(vp8_intra_pred_y_tm_%1)
+sym(vp8_intra_pred_y_tm_%1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    GET_GOT     rbx
+    ; end prolog
+
+    ; read top row
+    mov         edx,        8
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        xmm0,       xmm0
+%ifidn %1, ssse3
+    movdqa      xmm3,       [GLOBAL(dc_1024)]
+%endif
+    movdqa      xmm1,       [rsi]
+    movdqa      xmm2,       xmm1
+    punpcklbw   xmm1,       xmm0
+    punpckhbw   xmm2,       xmm0
+
+    ; set up left ptrs ans subtract topleft
+    movd        xmm4,       [rsi-1]
+    lea         rsi,        [rsi+rax-1]
+%ifidn %1, sse2
+    punpcklbw   xmm4,       xmm0
+    pshuflw     xmm4,       xmm4, 0x0
+    punpcklqdq  xmm4,       xmm4
+%else
+    pshufb      xmm4,       xmm3
+%endif
+    psubw       xmm1,       xmm4
+    psubw       xmm2,       xmm4
+
+    ; set up dest ptrs
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+vp8_intra_pred_y_tm_%1_loop:
+    movd        xmm4,       [rsi]
+    movd        xmm5,       [rsi+rax]
+%ifidn %1, sse2
+    punpcklbw   xmm4,       xmm0
+    punpcklbw   xmm5,       xmm0
+    pshuflw     xmm4,       xmm4, 0x0
+    pshuflw     xmm5,       xmm5, 0x0
+    punpcklqdq  xmm4,       xmm4
+    punpcklqdq  xmm5,       xmm5
+%else
+    pshufb      xmm4,       xmm3
+    pshufb      xmm5,       xmm3
+%endif
+    movdqa      xmm6,       xmm4
+    movdqa      xmm7,       xmm5
+    paddw       xmm4,       xmm1
+    paddw       xmm6,       xmm2
+    paddw       xmm5,       xmm1
+    paddw       xmm7,       xmm2
+    packuswb    xmm4,       xmm6
+    packuswb    xmm5,       xmm7
+    movdqa [rdi    ],       xmm4
+    movdqa [rdi+rcx],       xmm5
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz vp8_intra_pred_y_tm_%1_loop
+
+    ; begin epilog
+    RESTORE_GOT
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endmacro
+
+vp8_intra_pred_y_tm sse2
+vp8_intra_pred_y_tm ssse3
+
+;void vp8_intra_pred_y_ve_sse2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_y_ve_sse2)
+sym(vp8_intra_pred_y_ve_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    ; end prolog
+
+    ; read from top
+    mov         rax,        arg(2) ;src;
+    movsxd      rdx,        dword ptr arg(3) ;src_stride;
+    sub         rax,        rdx
+    movdqa      xmm1,       [rax]
+
+    ; write out
+    mov         rsi,        2
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+.label
+    movdqa [rax      ],     xmm1
+    movdqa [rax+rdx  ],     xmm1
+    movdqa [rax+rdx*2],     xmm1
+    movdqa [rax+rcx  ],     xmm1
+    lea         rax,        [rax+rdx*4]
+    movdqa [rax      ],     xmm1
+    movdqa [rax+rdx  ],     xmm1
+    movdqa [rax+rdx*2],     xmm1
+    movdqa [rax+rcx  ],     xmm1
+    lea         rax,        [rax+rdx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_ho_sse2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_y_ho_sse2)
+sym(vp8_intra_pred_y_ho_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; read from left and write out
+    mov         edx,        8
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    dec         rsi
+
+vp8_intra_pred_y_ho_sse2_loop:
+    movd        xmm0,       [rsi]
+    movd        xmm1,       [rsi+rax]
+    ; FIXME use pshufb for ssse3 version
+    punpcklbw   xmm0,       xmm0
+    punpcklbw   xmm1,       xmm1
+    pshuflw     xmm0,       xmm0, 0x0
+    pshuflw     xmm1,       xmm1, 0x0
+    punpcklqdq  xmm0,       xmm0
+    punpcklqdq  xmm1,       xmm1
+    movdqa [rdi    ],       xmm0
+    movdqa [rdi+rcx],       xmm1
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz vp8_intra_pred_y_ho_sse2_loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
 SECTION_RODATA
+align 16
 dc_128:
-    times 8 db 128
+    times 16 db 128
 dc_4:
     times 4 dw 4
 align 16
+dc_8:
+    times 8 dw 8
+align 16
 dc_1024:
     times 8 dw 0x400
 align 16
diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c
index fcc75a901..44221cd0b 100644
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -94,3 +94,69 @@ void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
                                         vp8_intra_pred_uv_tm_ssse3,
                                         vp8_intra_pred_uv_ho_ssse3);
 }
+
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dctop_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dcleft_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc128_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ho_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ve_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_ssse3);
+
+static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
+                                               unsigned char *dst_y,
+                                               int dst_stride,
+                                               build_intra_predictors_mbuv_fn_t tm_func)
+{
+    int mode = x->mode_info_context->mbmi.mode;
+    build_intra_predictors_mbuv_fn_t fn;
+    int src_stride = x->dst.y_stride;
+    switch (mode) {
+        case  V_PRED: fn = vp8_intra_pred_y_ve_sse2; break;
+        case  H_PRED: fn = vp8_intra_pred_y_ho_sse2; break;
+        case TM_PRED: fn = tm_func; break;
+        case DC_PRED:
+            if (x->up_available) {
+                if (x->left_available) {
+                    fn = vp8_intra_pred_y_dc_sse2; break;
+                } else {
+                    fn = vp8_intra_pred_y_dctop_sse2; break;
+                }
+            } else if (x->left_available) {
+                fn = vp8_intra_pred_y_dcleft_sse2; break;
+            } else {
+                fn = vp8_intra_pred_y_dc128_sse2; break;
+            }
+            break;
+        default: return;
+    }
+
+    fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
+    return;
+}
+
+void vp8_build_intra_predictors_mby_sse2(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
+                                       vp8_intra_pred_y_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mby_ssse3(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
+                                       vp8_intra_pred_y_tm_ssse3);
+}
+
+void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
+                                       vp8_intra_pred_y_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
+                                       vp8_intra_pred_y_tm_ssse3);
+
+}
diff --git a/vp8/common/x86/recon_x86.h b/vp8/common/x86/recon_x86.h
index fbb3dcb63..afacc60d1 100644
--- a/vp8/common/x86/recon_x86.h
+++ b/vp8/common/x86/recon_x86.h
@@ -42,6 +42,8 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
 extern prototype_copy_block(vp8_copy_mem16x16_sse2);
 extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
 extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_copy16x16
@@ -53,12 +55,20 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
 #undef  vp8_recon_build_intra_predictors_mbuv_s
 #define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2
 
+#undef  vp8_recon_build_intra_predictors_mby
+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_sse2
+
+#undef  vp8_recon_build_intra_predictors_mby_s
+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_sse2
+
 #endif
 #endif
 
 #if HAVE_SSSE3
 extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3);
 extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_ssse3);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_ssse3);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_build_intra_predictors_mbuv
@@ -67,6 +77,12 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3)
 #undef  vp8_recon_build_intra_predictors_mbuv_s
 #define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
 
+#undef  vp8_recon_build_intra_predictors_mby
+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_ssse3
+
+#undef  vp8_recon_build_intra_predictors_mby_s
+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_ssse3
+
 #endif
 #endif
 #endif
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index ad3a1f76b..e1e1b7987 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -11,7 +11,6 @@
 
 #include "vpx_config.h"
 #include "vpx_ports/x86.h"
-#include "vp8/common/g_common.h"
 #include "vp8/common/subpixel.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/recon.h"
@@ -86,6 +85,10 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
             vp8_build_intra_predictors_mbuv_sse2;
         rtcd->recon.build_intra_predictors_mbuv_s =
             vp8_build_intra_predictors_mbuv_s_sse2;
+        rtcd->recon.build_intra_predictors_mby =
+            vp8_build_intra_predictors_mby_sse2;
+        rtcd->recon.build_intra_predictors_mby_s =
+            vp8_build_intra_predictors_mby_s_sse2;
 
         rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;
         rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;
@@ -132,6 +135,10 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
             vp8_build_intra_predictors_mbuv_ssse3;
         rtcd->recon.build_intra_predictors_mbuv_s =
             vp8_build_intra_predictors_mbuv_s_ssse3;
+        rtcd->recon.build_intra_predictors_mby =
+            vp8_build_intra_predictors_mby_ssse3;
+        rtcd->recon.build_intra_predictors_mby_s =
+            vp8_build_intra_predictors_mby_s_ssse3;
     }
 #endif
 
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 11d0e38f5..917aeceb6 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -42,7 +42,6 @@
 
 void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
 {
-    int i;
     int Q;
     VP8_COMMON *const pc = & pbi->common;
 
@@ -52,15 +51,9 @@ void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
         pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
         pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
 
-        /* all the ac values = ; */
-        for (i = 1; i < 16; i++)
-        {
-            int rc = vp8_default_zig_zag1d[i];
-
-            pc->Y1dequant[Q][rc] = (short)vp8_ac_yquant(Q);
-            pc->Y2dequant[Q][rc] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
-            pc->UVdequant[Q][rc] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
-        }
+        pc->Y1dequant[Q][1] = (short)vp8_ac_yquant(Q);
+        pc->Y2dequant[Q][1] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
+        pc->UVdequant[Q][1] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
     }
 }
 
@@ -88,19 +81,19 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
     else
         QIndex = pc->base_qindex;
 
-    /* Set up the block level dequant pointers */
-    for (i = 0; i < 16; i++)
-    {
-        xd->block[i].dequant = pc->Y1dequant[QIndex];
-    }
+    /* Set up the macroblock dequant constants */
+    xd->dequant_y1_dc[0] = 1;
+    xd->dequant_y1[0] = pc->Y1dequant[QIndex][0];
+    xd->dequant_y2[0] = pc->Y2dequant[QIndex][0];
+    xd->dequant_uv[0] = pc->UVdequant[QIndex][0];
 
-    for (i = 16; i < 24; i++)
+    for (i = 1; i < 16; i++)
     {
-        xd->block[i].dequant = pc->UVdequant[QIndex];
+        xd->dequant_y1_dc[i] =
+        xd->dequant_y1[i] = pc->Y1dequant[QIndex][1];
+        xd->dequant_y2[i] = pc->Y2dequant[QIndex][1];
+        xd->dequant_uv[i] = pc->UVdequant[QIndex][1];
     }
-
-    xd->block[24].dequant = pc->Y2dequant[QIndex];
-
 }
 
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -180,6 +173,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
         }
         else
         {
+            short *DQC = xd->dequant_y1;
+
             /* clear out residual eob info */
             if(xd->mode_info_context->mbmi.mb_skip_coeff)
                 vpx_memset(xd->eobs, 0, 25);
@@ -200,13 +195,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                     if (xd->eobs[i] > 1)
                     {
                         DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
-                            (b->qcoeff, b->dequant,
+                            (b->qcoeff, DQC,
                             *(b->base_dst) + b->dst, b->dst_stride);
                     }
                     else
                     {
                         IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-                            (b->qcoeff[0] * b->dequant[0],
+                            (b->qcoeff[0] * DQC[0],
                             *(b->base_dst) + b->dst, b->dst_stride,
                             *(b->base_dst) + b->dst, b->dst_stride);
                         ((int *)b->qcoeff)[0] = 0;
@@ -233,10 +228,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
         /* dequantization and idct */
         if (mode != B_PRED)
         {
-            short *DQC = xd->block[0].dequant;
-
-            /* save the dc dequant constant in case it is overridden */
-            short dc_dequant_temp = DQC[0];
+            short *DQC = xd->dequant_y1;
 
             if (mode != SPLITMV)
             {
@@ -245,7 +237,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                 /* do 2nd order transform on the dc block */
                 if (xd->eobs[24] > 1)
                 {
-                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b);
+                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b,
+                        xd->dequant_y2);
 
                     IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
                         xd->qcoeff);
@@ -260,7 +253,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                 }
                 else
                 {
-                    b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
+                    b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
                     IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
                         xd->qcoeff);
                     ((int *)b->qcoeff)[0] = 0;
@@ -269,20 +262,17 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                 /* override the dc dequant constant in order to preserve the
                  * dc components
                  */
-                DQC[0] = 1;
+                DQC = xd->dequant_y1_dc;
             }
 
             DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
-                            (xd->qcoeff, xd->block[0].dequant,
+                            (xd->qcoeff, DQC,
                              xd->dst.y_buffer,
                              xd->dst.y_stride, xd->eobs);
-
-            /* restore the dc dequant constant */
-            DQC[0] = dc_dequant_temp;
         }
 
         DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
-                        (xd->qcoeff+16*16, xd->block[16].dequant,
+                        (xd->qcoeff+16*16, xd->dequant_uv,
                          xd->dst.u_buffer, xd->dst.v_buffer,
                          xd->dst.uv_stride, xd->eobs+16);
     }
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 9a45702cf..80648d39f 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -20,7 +20,6 @@
 #include "vpx_scale/yv12extend.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/swapyv12buffer.h"
-#include "vp8/common/g_common.h"
 #include "vp8/common/threading.h"
 #include "decoderthreading.h"
 #include <stdio.h>
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 947b3a1c6..2ce00f705 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -37,7 +37,7 @@ extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
     VP8_COMMON *const pc = & pbi->common;
-    int i, j;
+    int i;
 
     for (i = 0; i < count; i++)
     {
@@ -77,10 +77,10 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
 
         mbd->current_bc = &pbi->bc2;
 
-        for (j = 0; j < 25; j++)
-        {
-            mbd->block[j].dequant = xd->block[j].dequant;
-        }
+        vpx_memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+        vpx_memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+        vpx_memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+        vpx_memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
 
         mbd->fullpixel_mask = 0xffffffff;
         if(pc->full_pixel)
@@ -177,6 +177,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
     /* dequantization and idct */
     if (xd->mode_info_context->mbmi.mode == B_PRED)
     {
+        short *DQC = xd->dequant_y1;
+
         for (i = 0; i < 16; i++)
         {
             BLOCKD *b = &xd->block[i];
@@ -190,13 +192,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
                 if (xd->eobs[i] > 1)
                 {
                     DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
-                        (b->qcoeff, b->dequant,
+                        (b->qcoeff, DQC,
                         *(b->base_dst) + b->dst, b->dst_stride);
                 }
                 else
                 {
                     IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-                        (b->qcoeff[0] * b->dequant[0],
+                        (b->qcoeff[0] * DQC[0],
                         *(b->base_dst) + b->dst, b->dst_stride,
                         *(b->base_dst) + b->dst, b->dst_stride);
                     ((int *)b->qcoeff)[0] = 0;
@@ -206,9 +208,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
     }
     else
     {
-        short *DQC = xd->block[0].dequant;
-
-        DECLARE_ALIGNED(16, short, local_dequant[16]);
+        short *DQC = xd->dequant_y1;
 
         if (xd->mode_info_context->mbmi.mode != SPLITMV)
         {
@@ -217,7 +217,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
             /* do 2nd order transform on the dc block */
             if (xd->eobs[24] > 1)
             {
-                DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b);
+                DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b, xd->dequant_y2);
 
                 IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
                     xd->qcoeff);
@@ -232,20 +232,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
             }
             else
             {
-                b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
+                b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
                 IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff);
                 ((int *)b->qcoeff)[0] = 0;
             }
 
-            /* make a local copy of the dequant constants */
-            vpx_memcpy(local_dequant, xd->block[0].dequant,
-                       sizeof(local_dequant));
-
             /* override the dc dequant constant */
-            local_dequant[0] = 1;
-
-            /* use the new dequant constants */
-            DQC = local_dequant;
+            DQC = xd->dequant_y1_dc;
         }
 
         DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
@@ -255,7 +248,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
     }
 
     DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
-                    (xd->qcoeff+16*16, xd->block[16].dequant,
+                    (xd->qcoeff+16*16, xd->dequant_uv,
                      xd->dst.u_buffer, xd->dst.v_buffer,
                      xd->dst.uv_stride, xd->eobs+16);
 }
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 5e5a60db7..0a74ca46d 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -45,10 +45,6 @@ typedef struct
     unsigned char **base_src;
     int src;
     int src_stride;
-
-//  MV  enc_mv;
-    int force_empty;
-
 } BLOCK;
 
 typedef struct
@@ -107,7 +103,6 @@ typedef struct
     int mv_row_min;
     int mv_row_max;
 
-    int vector_range;    // Used to monitor limiting range of recent vectors to guide search.
     int skip;
 
     int encode_breakout;
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 88868d684..b5c5c7445 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -595,8 +595,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)
     // Activity map pointer
     x->mb_activity_ptr = cpi->mb_activity_map;
 
-    x->vector_range = 32;
-
     x->act_zbin_adj = 0;
 
     x->partition_info = x->pi;
@@ -1122,7 +1120,7 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
         vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
 
     DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
-                    (xd->qcoeff+16*16, xd->block[16].dequant,
+                    (xd->qcoeff+16*16, xd->dequant_uv,
                      xd->dst.u_buffer, xd->dst.v_buffer,
                      xd->dst.uv_stride, xd->eobs+16);
     return rate;
@@ -1307,7 +1305,7 @@ int vp8cx_encode_inter_macroblock
             vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
 
         DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
-                        (xd->qcoeff+16*16, xd->block[16].dequant,
+                        (xd->qcoeff+16*16, xd->dequant_uv,
                          xd->dst.u_buffer, xd->dst.v_buffer,
                          xd->dst.uv_stride, xd->eobs+16);
     }
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 4378b634e..16393a1ff 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -18,7 +18,6 @@
 #include "vp8/common/invtrans.h"
 #include "vp8/common/recon.h"
 #include "dct.h"
-#include "vp8/common/g_common.h"
 #include "encodeintra.h"
 
 
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index ef8ead588..24339a5e0 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -302,7 +302,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
     z->mv_col_max    = x->mv_col_max;
     z->mv_row_min    = x->mv_row_min;
     z->mv_row_max    = x->mv_row_max;
-    z->vector_range = x->vector_range ;
     */
 
     z->vp8_short_fdct4x4     = x->vp8_short_fdct4x4;
@@ -350,8 +349,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
         z->block[i].src             = x->block[i].src;
         */
         z->block[i].src_stride       = x->block[i].src_stride;
-        z->block[i].force_empty      = x->block[i].force_empty;
-
     }
 
     {
@@ -387,10 +384,22 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
         zd->mb_segement_abs_delta      = xd->mb_segement_abs_delta;
         vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
 
-        for (i = 0; i < 25; i++)
-        {
-            zd->block[i].dequant = xd->block[i].dequant;
-        }
+        vpx_memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+        vpx_memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+        vpx_memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+        vpx_memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+
+#if 1
+        /*TODO:  Remove dequant from BLOCKD.  This is a temporary solution until
+         * the quantizer code uses a passed in pointer to the dequant constants.
+         * This will also require modifications to the x86 and neon assembly.
+         * */
+        for (i = 0; i < 16; i++)
+            zd->block[i].dequant = zd->dequant_y1;
+        for (i = 16; i < 24; i++)
+            zd->block[i].dequant = zd->dequant_uv;
+        zd->block[24].dequant = zd->dequant_y2;
+#endif
     }
 }
 
@@ -421,8 +430,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
 #endif
         mb->gf_active_ptr            = x->gf_active_ptr;
 
-        mb->vector_range             = 32;
-
         vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
         mbr_ei[i].totalrate = 0;
 
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 6e0254644..9223781af 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -23,7 +23,6 @@
 #include "ratectrl.h"
 #include "vp8/common/quant_common.h"
 #include "segmentation.h"
-#include "vp8/common/g_common.h"
 #include "vpx_scale/yv12extend.h"
 #if CONFIG_POSTPROC
 #include "vp8/common/postproc.h"
@@ -251,6 +250,9 @@ static void save_layer_context(VP8_COMP *cpi)
     lc->starting_buffer_level            = cpi->oxcf.starting_buffer_level;
     lc->optimal_buffer_level             = cpi->oxcf.optimal_buffer_level;
     lc->maximum_buffer_size              = cpi->oxcf.maximum_buffer_size;
+    lc->starting_buffer_level_in_ms      = cpi->oxcf.starting_buffer_level_in_ms;
+    lc->optimal_buffer_level_in_ms       = cpi->oxcf.optimal_buffer_level_in_ms;
+    lc->maximum_buffer_size_in_ms        = cpi->oxcf.maximum_buffer_size_in_ms;
     lc->buffer_level                     = cpi->buffer_level;
     lc->bits_off_target                  = cpi->bits_off_target;
     lc->total_actual_bits                = cpi->total_actual_bits;
@@ -288,6 +290,9 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer)
     cpi->oxcf.starting_buffer_level       = lc->starting_buffer_level;
     cpi->oxcf.optimal_buffer_level        = lc->optimal_buffer_level;
     cpi->oxcf.maximum_buffer_size         = lc->maximum_buffer_size;
+    cpi->oxcf.starting_buffer_level_in_ms = lc->starting_buffer_level_in_ms;
+    cpi->oxcf.optimal_buffer_level_in_ms  = lc->optimal_buffer_level_in_ms;
+    cpi->oxcf.maximum_buffer_size_in_ms   = lc->maximum_buffer_size_in_ms;
     cpi->buffer_level                     = lc->buffer_level;
     cpi->bits_off_target                  = lc->bits_off_target;
     cpi->total_actual_bits                = lc->total_actual_bits;
@@ -1255,6 +1260,8 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
     if (cpi->frame_rate > 180)
         cpi->frame_rate = 30;
 
+    cpi->ref_frame_rate = cpi->frame_rate;
+
     // change includes all joint functionality
     vp8_change_config(cpi, oxcf);
 
@@ -1290,6 +1297,10 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
                         cpi->output_frame_rate / cpi->oxcf.rate_decimator[i];
             lc->target_bandwidth = cpi->oxcf.target_bitrate[i] * 1000;
 
+            lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
+            lc->optimal_buffer_level_in_ms  = oxcf->optimal_buffer_level;
+            lc->maximum_buffer_size_in_ms   = oxcf->maximum_buffer_size;
+
             lc->starting_buffer_level =
               rescale(oxcf->starting_buffer_level,
                           lc->target_bandwidth, 1000);
@@ -1346,6 +1357,56 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
 #endif
 }
 
+void update_layer_contexts (VP8_COMP *cpi)
+{
+    VP8_CONFIG *oxcf = &cpi->oxcf;
+
+    /* Update snapshots of the layer contexts to reflect new parameters */
+    if (oxcf->number_of_layers > 1)
+    {
+        unsigned int i;
+        double prev_layer_frame_rate=0;
+
+        for (i=0; i<oxcf->number_of_layers; i++)
+        {
+            LAYER_CONTEXT *lc = &cpi->layer_context[i];
+
+            lc->frame_rate =
+                cpi->ref_frame_rate / oxcf->rate_decimator[i];
+            lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
+
+            lc->starting_buffer_level = rescale(
+                          oxcf->starting_buffer_level_in_ms,
+                          lc->target_bandwidth, 1000);
+
+            if (oxcf->optimal_buffer_level == 0)
+                lc->optimal_buffer_level = lc->target_bandwidth / 8;
+            else
+                lc->optimal_buffer_level = rescale(
+                          oxcf->optimal_buffer_level_in_ms,
+                          lc->target_bandwidth, 1000);
+
+            if (oxcf->maximum_buffer_size == 0)
+                lc->maximum_buffer_size = lc->target_bandwidth / 8;
+            else
+                lc->maximum_buffer_size = rescale(
+                          oxcf->maximum_buffer_size_in_ms,
+                          lc->target_bandwidth, 1000);
+
+            // Work out the average size of a frame within this layer
+            if (i > 0)
+                lc->avg_frame_size_for_layer = (oxcf->target_bitrate[i] -
+                    oxcf->target_bitrate[i-1]) * 1000 /
+                    (lc->frame_rate - prev_layer_frame_rate);
+
+            lc->active_worst_quality         = oxcf->worst_allowed_q;
+            lc->active_best_quality          = oxcf->best_allowed_q;
+            lc->avg_frame_qindex             = oxcf->worst_allowed_q;
+
+            prev_layer_frame_rate = lc->frame_rate;
+        }
+    }
+}
 
 void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
 {
@@ -1486,9 +1547,12 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
     // local file playback mode == really big buffer
     if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
     {
-        cpi->oxcf.starting_buffer_level   = 60000;
-        cpi->oxcf.optimal_buffer_level    = 60000;
-        cpi->oxcf.maximum_buffer_size     = 240000;
+        cpi->oxcf.starting_buffer_level       = 60000;
+        cpi->oxcf.optimal_buffer_level        = 60000;
+        cpi->oxcf.maximum_buffer_size         = 240000;
+        cpi->oxcf.starting_buffer_level_in_ms = 60000;
+        cpi->oxcf.optimal_buffer_level_in_ms  = 60000;
+        cpi->oxcf.maximum_buffer_size_in_ms   = 240000;
     }
 
     // Convert target bandwidth from Kbit/s to Bit/s
@@ -4257,14 +4321,15 @@ static void encode_frame_to_data_rate
 
         vp8_clear_system_state();  //__asm emms;
 
-        if (cpi->twopass.total_left_stats->coded_error != 0.0)
-            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+        if (cpi->twopass.total_left_stats.coded_error != 0.0)
+            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
                        "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
                        "%10.3f %8d\n",
                        cpi->common.current_video_frame, cpi->this_frame_target,
                        cpi->projected_frame_size,
                        (cpi->projected_frame_size - cpi->this_frame_target),
                        (int)cpi->total_target_vs_actual,
+                       cpi->buffer_level,
                        (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
                        (int)cpi->total_actual_bits, cm->base_qindex,
                        cpi->active_best_quality, cpi->active_worst_quality,
@@ -4275,18 +4340,19 @@ static void encode_frame_to_data_rate
                        cm->frame_type, cpi->gfu_boost,
                        cpi->twopass.est_max_qcorrection_factor,
                        (int)cpi->twopass.bits_left,
-                       cpi->twopass.total_left_stats->coded_error,
+                       cpi->twopass.total_left_stats.coded_error,
                        (double)cpi->twopass.bits_left /
-                           cpi->twopass.total_left_stats->coded_error,
+                           cpi->twopass.total_left_stats.coded_error,
                        cpi->tot_recode_hits);
         else
-            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
                        "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
                        "%8d\n",
                        cpi->common.current_video_frame,
                        cpi->this_frame_target, cpi->projected_frame_size,
                        (cpi->projected_frame_size - cpi->this_frame_target),
                        (int)cpi->total_target_vs_actual,
+                       cpi->buffer_level,
                        (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
                        (int)cpi->total_actual_bits, cm->base_qindex,
                        cpi->active_best_quality, cpi->active_worst_quality,
@@ -4297,7 +4363,7 @@ static void encode_frame_to_data_rate
                        cm->frame_type, cpi->gfu_boost,
                        cpi->twopass.est_max_qcorrection_factor,
                        (int)cpi->twopass.bits_left,
-                       cpi->twopass.total_left_stats->coded_error,
+                       cpi->twopass.total_left_stats.coded_error,
                        cpi->tot_recode_hits);
 
         fclose(f);
@@ -4670,13 +4736,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
         return -1;
     }
 
-    // Restore layer specific context if necessary
-    if (cpi->oxcf.number_of_layers > 1)
-    {
-        restore_layer_context (cpi,
-           cpi->oxcf.layer_id[cm->current_video_frame % cpi->oxcf.periodicity]);
-    }
-
     if (cpi->source->ts_start < cpi->first_time_stamp_ever)
     {
         cpi->first_time_stamp_ever = cpi->source->ts_start;
@@ -4684,16 +4743,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
     }
 
     // adjust frame rates based on timestamps given
-    if (cpi->oxcf.number_of_layers > 1 )
-    {
-        vp8_new_frame_rate (
-              cpi, cpi->layer_context[cpi->current_layer].frame_rate);
-
-        cpi->last_time_stamp_seen = cpi->source->ts_start;
-        cpi->last_end_time_stamp_seen = cpi->source->ts_end;
-
-    }
-    else if (!cm->refresh_alt_ref_frame)
+    if (!cm->refresh_alt_ref_frame || (cpi->oxcf.number_of_layers > 1))
     {
         int64_t this_duration;
         int step = 0;
@@ -4718,7 +4768,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
         if (this_duration)
         {
             if (step)
-                vp8_new_frame_rate(cpi, 10000000.0 / this_duration);
+                cpi->ref_frame_rate = 10000000.0 / this_duration;
             else
             {
                 double avg_duration, interval;
@@ -4731,18 +4781,46 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                 if(interval > 10000000.0)
                     interval = 10000000;
 
-                avg_duration = 10000000.0 / cpi->frame_rate;
+                avg_duration = 10000000.0 / cpi->ref_frame_rate;
                 avg_duration *= (interval - avg_duration + this_duration);
                 avg_duration /= interval;
 
-                vp8_new_frame_rate(cpi, 10000000.0 / avg_duration);
+                cpi->ref_frame_rate = 10000000.0 / avg_duration;
+            }
+
+            if (cpi->oxcf.number_of_layers > 1)
+            {
+                int i;
+
+                // Update frame rates for each layer
+                for (i=0; i<cpi->oxcf.number_of_layers; i++)
+                {
+                    LAYER_CONTEXT *lc = &cpi->layer_context[i];
+                    lc->frame_rate = cpi->ref_frame_rate /
+                                  cpi->oxcf.rate_decimator[i];
+                }
             }
+            else
+                vp8_new_frame_rate(cpi, cpi->ref_frame_rate);
         }
 
         cpi->last_time_stamp_seen = cpi->source->ts_start;
         cpi->last_end_time_stamp_seen = cpi->source->ts_end;
     }
 
+    if (cpi->oxcf.number_of_layers > 1)
+    {
+        int layer;
+
+        update_layer_contexts (cpi);
+
+        // Restore layer specific context & set frame rate
+        layer = cpi->oxcf.layer_id[
+                            cm->current_video_frame % cpi->oxcf.periodicity];
+        restore_layer_context (cpi, layer);
+        vp8_new_frame_rate (cpi, cpi->layer_context[layer].frame_rate);
+    }
+
     if (cpi->compressor_speed == 2)
     {
         if (cpi->oxcf.number_of_layers == 1)
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 46951e3b9..35efd3a00 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -253,6 +253,9 @@ typedef struct
     int starting_buffer_level;
     int optimal_buffer_level;
     int maximum_buffer_size;
+    int starting_buffer_level_in_ms;
+    int optimal_buffer_level_in_ms;
+    int maximum_buffer_size_in_ms;
 
     int avg_frame_size_for_layer;
 
@@ -421,6 +424,7 @@ typedef struct VP8_COMP
     int buffered_mode;
 
     double frame_rate;
+    double ref_frame_rate;
     int64_t buffer_level;
     int bits_off_target;
 
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 592da9dbb..405c72dbd 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -21,7 +21,6 @@
 #include "vp8/common/reconinter.h"
 #include "vp8/common/reconintra.h"
 #include "vp8/common/reconintra4x4.h"
-#include "vp8/common/g_common.h"
 #include "variance.h"
 #include "mcomp.h"
 #include "rdopt.h"
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 3ca8758ef..d2aa7fe72 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -436,7 +436,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
     int quant_val;
     int Q;
 
-    int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
+    int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44,
+                          44, 44};
 
     for (Q = 0; Q < QINDEX_RANGE; Q++)
     {
@@ -469,36 +470,58 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
         cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
 
         // all the ac values = ;
-        for (i = 1; i < 16; i++)
+        quant_val = vp8_ac_yquant(Q);
+        cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1,
+                     cpi->Y1quant_shift[Q] + 1, quant_val);
+        cpi->Y1zbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.Y1dequant[Q][1] = quant_val;
+        cpi->zrun_zbin_boost_y1[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+        quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+        cpi->Y2quant_fast[Q][1] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 1,
+                     cpi->Y2quant_shift[Q] + 1, quant_val);
+        cpi->Y2zbin[Q][1] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+        cpi->Y2round[Q][1] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+        cpi->common.Y2dequant[Q][1] = quant_val;
+        cpi->zrun_zbin_boost_y2[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+        quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+        cpi->UVquant_fast[Q][1] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 1,
+                     cpi->UVquant_shift[Q] + 1, quant_val);
+        cpi->UVzbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->UVround[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.UVdequant[Q][1] = quant_val;
+        cpi->zrun_zbin_boost_uv[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+        for (i = 2; i < 16; i++)
         {
-            int rc = vp8_default_zig_zag1d[i];
-
-            quant_val = vp8_ac_yquant(Q);
-            cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,
-                         cpi->Y1quant_shift[Q] + rc, quant_val);
-            cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.Y1dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
-                         cpi->Y2quant_shift[Q] + rc, quant_val);
-            cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
-            cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
-            cpi->common.Y2dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
-                         cpi->UVquant_shift[Q] + rc, quant_val);
-            cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.UVdequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+            cpi->Y1quant_fast[Q][i] = cpi->Y1quant_fast[Q][1];
+            cpi->Y1quant[Q][i] = cpi->Y1quant[Q][1];
+            cpi->Y1quant_shift[Q][i] = cpi->Y1quant_shift[Q][1];
+            cpi->Y1zbin[Q][i] = cpi->Y1zbin[Q][1];
+            cpi->Y1round[Q][i] = cpi->Y1round[Q][1];
+            cpi->zrun_zbin_boost_y1[Q][i] = (cpi->common.Y1dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
+
+            cpi->Y2quant_fast[Q][i] = cpi->Y2quant_fast[Q][1];
+            cpi->Y2quant[Q][i] = cpi->Y2quant[Q][1];
+            cpi->Y2quant_shift[Q][i] = cpi->Y2quant_shift[Q][1];
+            cpi->Y2zbin[Q][i] = cpi->Y2zbin[Q][1];
+            cpi->Y2round[Q][i] = cpi->Y2round[Q][1];
+            cpi->zrun_zbin_boost_y2[Q][i] = (cpi->common.Y2dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
+
+            cpi->UVquant_fast[Q][i] = cpi->UVquant_fast[Q][1];
+            cpi->UVquant[Q][i] = cpi->UVquant[Q][1];
+            cpi->UVquant_shift[Q][i] = cpi->UVquant_shift[Q][1];
+            cpi->UVzbin[Q][i] = cpi->UVzbin[Q][1];
+            cpi->UVround[Q][i] = cpi->UVround[Q][1];
+            cpi->zrun_zbin_boost_uv[Q][i] = (cpi->common.UVdequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
         }
     }
 }
@@ -615,6 +638,31 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
      */
     if (!ok_to_skip || QIndex != x->q_index)
     {
+
+        xd->dequant_y1_dc[0] = 1;
+        xd->dequant_y1[0] = cpi->common.Y1dequant[QIndex][0];
+        xd->dequant_y2[0] = cpi->common.Y2dequant[QIndex][0];
+        xd->dequant_uv[0] = cpi->common.UVdequant[QIndex][0];
+
+        for (i = 1; i < 16; i++)
+        {
+            xd->dequant_y1_dc[i] =
+            xd->dequant_y1[i] = cpi->common.Y1dequant[QIndex][1];
+            xd->dequant_y2[i] = cpi->common.Y2dequant[QIndex][1];
+            xd->dequant_uv[i] = cpi->common.UVdequant[QIndex][1];
+        }
+#if 1
+        /*TODO:  Remove dequant from BLOCKD.  This is a temporary solution until
+         * the quantizer code uses a passed in pointer to the dequant constants.
+         * This will also require modifications to the x86 and neon assembly.
+         * */
+        for (i = 0; i < 16; i++)
+            x->e_mbd.block[i].dequant = xd->dequant_y1; //cpi->common.Y1dequant[QIndex];
+        for (i = 16; i < 24; i++)
+            x->e_mbd.block[i].dequant = xd->dequant_uv; //cpi->common.UVdequant[QIndex];
+        x->e_mbd.block[24].dequant = xd->dequant_y2; //cpi->common.Y2dequant[QIndex];
+#endif
+
         // Y
         zbin_extra = ZBIN_EXTRA_Y;
 
@@ -625,7 +673,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
             x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
             x->block[i].zbin = cpi->Y1zbin[QIndex];
             x->block[i].round = cpi->Y1round[QIndex];
-            x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
             x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
             x->block[i].zbin_extra = (short)zbin_extra;
         }
@@ -640,7 +687,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
             x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
             x->block[i].zbin = cpi->UVzbin[QIndex];
             x->block[i].round = cpi->UVround[QIndex];
-            x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
             x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
             x->block[i].zbin_extra = (short)zbin_extra;
         }
@@ -653,7 +699,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
         x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
         x->block[24].zbin = cpi->Y2zbin[QIndex];
         x->block[24].round = cpi->Y2round[QIndex];
-        x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
         x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
         x->block[24].zbin_extra = (short)zbin_extra;
 
@@ -663,6 +708,9 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
         cpi->last_zbin_over_quant = cpi->zbin_over_quant;
         cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
         x->last_act_zbin_adj = x->act_zbin_adj;
+
+
+
     }
     else if(cpi->last_zbin_over_quant != cpi->zbin_over_quant
             || cpi->last_zbin_mode_boost != cpi->zbin_mode_boost
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 726d3c4eb..d29aa75fe 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -28,7 +28,6 @@
 #include "encodemb.h"
 #include "quantize.h"
 #include "vp8/common/idct.h"
-#include "vp8/common/g_common.h"
 #include "variance.h"
 #include "mcomp.h"
 #include "rdopt.h"
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index a773d4391..545e4f205 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -22,7 +22,6 @@
 #include "ratectrl.h"
 #include "vp8/common/quant_common.h"
 #include "segmentation.h"
-#include "vp8/common/g_common.h"
 #include "vpx_scale/yv12extend.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/swapyv12buffer.h"
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
index 34cc9c3bb..e698e904c 100644
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -80,6 +80,9 @@ sym(vp8_fast_quantize_b_ssse3):
     mov         rdi, [rsi + vp8_blockd_dequant]
     mov         rcx, [rsi + vp8_blockd_dqcoeff]
 
+    movdqa      xmm2, xmm1                  ;store y for getting eob
+    movdqa      xmm3, xmm5
+
     pxor        xmm1, xmm0
     pxor        xmm5, xmm4
     psubw       xmm1, xmm0
@@ -88,35 +91,30 @@ sym(vp8_fast_quantize_b_ssse3):
     movdqa      [rax], xmm1
     movdqa      [rax + 16], xmm5
 
-    movdqa      xmm2, [rdi]
-    movdqa      xmm3, [rdi + 16]
-
-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
-
-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [GLOBAL(zz_shuf)]
+    movdqa      xmm0, [rdi]
+    movdqa      xmm4, [rdi + 16]
 
-    pmovmskb    edx, xmm1
+    pmullw      xmm0, xmm1
+    pmullw      xmm4, xmm5
+    pxor        xmm1, xmm1
 
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
+    pcmpgtw     xmm2, xmm1                  ;calculate eob
+    pcmpgtw     xmm3, xmm1
+    packsswb    xmm2, xmm3
+    pshufb      xmm2, [GLOBAL(zz_shuf)]
 
-    movdqa      [rcx], xmm2                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm3            ;store dqcoeff
+    pmovmskb    edx, xmm2
 
+    movdqa      [rcx], xmm0                 ;store dqcoeff
+    movdqa      [rcx + 16], xmm4            ;store dqcoeff
     mov         rcx, [rsi + vp8_blockd_eob]
 
-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
+    bsr         eax, edx                    ;count 0
     add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
+
+    cmp         edx, 0                      ;if all 0, eob=0
+    cmove       eax, edx
+
     mov         BYTE PTR [rcx], al          ;store eob
 
     ; begin epilog
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index eb483d235..0e564320f 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -39,7 +39,6 @@ VP8_COMMON_SRCS-yes += common/entropymode.h
 VP8_COMMON_SRCS-yes += common/entropymv.h
 VP8_COMMON_SRCS-yes += common/extend.h
 VP8_COMMON_SRCS-yes += common/findnearmv.h
-VP8_COMMON_SRCS-yes += common/g_common.h
 VP8_COMMON_SRCS-yes += common/header.h
 VP8_COMMON_SRCS-yes += common/idct.h
 VP8_COMMON_SRCS-yes += common/invtrans.h
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 6181ee8ee..f2f376a7c 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -335,6 +335,10 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
     oxcf->under_shoot_pct          = cfg.rc_undershoot_pct;
     oxcf->over_shoot_pct           = cfg.rc_overshoot_pct;
 
+    oxcf->maximum_buffer_size_in_ms   = cfg.rc_buf_sz;
+    oxcf->starting_buffer_level_in_ms = cfg.rc_buf_initial_sz;
+    oxcf->optimal_buffer_level_in_ms  = cfg.rc_buf_optimal_sz;
+
     oxcf->maximum_buffer_size      = cfg.rc_buf_sz;
     oxcf->starting_buffer_level    = cfg.rc_buf_initial_sz;
     oxcf->optimal_buffer_level     = cfg.rc_buf_optimal_sz;
@@ -1237,7 +1241,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
         /* keyframing settings (kf) */
         VPX_KF_AUTO,        /* g_kfmode*/
         0,                  /* kf_min_dist */
-        9999,               /* kf_max_dist */
+        128,                /* kf_max_dist */
 
 #if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
         1,                  /* g_delete_first_pass_file */
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index fbe58171c..de2714317 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -412,7 +412,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
                 && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
             {
                 ctx->postproc_cfg.post_proc_flag =
-                    VP8_DEBLOCK | VP8_DEMACROBLOCK;
+                    VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE;
                 ctx->postproc_cfg.deblocking_level = 4;
                 ctx->postproc_cfg.noise_level = 0;
             }
@@ -700,6 +700,27 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
         return VPX_CODEC_INVALID_PARAM;
 }
 
+extern int vp8_references_buffer( VP8_COMMON *oci, int ref_frame );
+static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
+                                              int ctrl_id,
+                                              va_list args)
+{
+    int *ref_info = va_arg(args, int *);
+    VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+    VP8_COMMON *oci = &pbi->common;
+
+    if (ref_info)
+    {
+        *ref_info =
+            (vp8_references_buffer( oci, ALTREF_FRAME )?VP8_ALTR_FRAME:0) |
+            (vp8_references_buffer( oci, GOLDEN_FRAME )?VP8_GOLD_FRAME:0) |
+            (vp8_references_buffer( oci, LAST_FRAME )?VP8_LAST_FRAME:0);
+
+        return VPX_CODEC_OK;
+    }
+    else
+        return VPX_CODEC_INVALID_PARAM;
+}
 
 static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
                                                int ctrl_id,
diff --git a/vp8_scalable_patterns.c b/vp8_scalable_patterns.c
index 41ecaa78b..65883ff80 100644
--- a/vp8_scalable_patterns.c
+++ b/vp8_scalable_patterns.c
@@ -129,27 +129,29 @@ int main(int argc, char **argv) {
     int                  got_data;
     int                  flags = 0;
     int                  i;
+    int                  pts = 0;              // PTS starts at 0
+    int                  frame_duration = 1;   // 1 timebase tick per frame
 
     int                  layering_mode = 0;
     int                  frames_in_layer[MAX_LAYERS] = {0};
     int                  layer_flags[MAX_PERIODICITY] = {0};
 
     // Check usage and arguments
-    if (argc < 7)
-        die("Usage: %s <infile> <outfile> <width> <height> <mode> "
-            "<Rate_0> ... <Rate_nlayers-1>\n", argv[0]);
+    if (argc < 9)
+        die("Usage: %s <infile> <outfile> <width> <height> <rate_num> "
+            " <rate_den> <mode> <Rate_0> ... <Rate_nlayers-1>\n", argv[0]);
 
     width  = strtol (argv[3], NULL, 0);
     height = strtol (argv[4], NULL, 0);
     if (width < 16 || width%2 || height <16 || height%2)
         die ("Invalid resolution: %d x %d", width, height);
 
-    if (!sscanf(argv[5], "%d", &layering_mode))
-        die ("Invalid mode %s", argv[5]);
+    if (!sscanf(argv[7], "%d", &layering_mode))
+        die ("Invalid mode %s", argv[7]);
     if (layering_mode<0 || layering_mode>6)
-        die ("Invalid mode (0..6) %s", argv[5]);
+        die ("Invalid mode (0..6) %s", argv[7]);
 
-    if (argc != 6+mode_to_num_layers[layering_mode])
+    if (argc != 8+mode_to_num_layers[layering_mode])
         die ("Invalid number of arguments");
 
     if (!vpx_img_alloc (&raw, VPX_IMG_FMT_I420, width, height, 1))
@@ -168,8 +170,14 @@ int main(int argc, char **argv) {
     cfg.g_w = width;
     cfg.g_h = height;
 
-    for (i=6; i<6+mode_to_num_layers[layering_mode]; i++)
-        if (!sscanf(argv[i], "%d", &cfg.ts_target_bitrate[i-6]))
+    // Timebase format e.g. 30fps: numerator=1, demoninator=30
+    if (!sscanf (argv[5], "%d", &cfg.g_timebase.num ))
+        die ("Invalid timebase numerator %s", argv[5]);
+    if (!sscanf (argv[6], "%d", &cfg.g_timebase.den ))
+        die ("Invalid timebase denominator %s", argv[6]);
+
+    for (i=8; i<8+mode_to_num_layers[layering_mode]; i++)
+        if (!sscanf(argv[i], "%d", &cfg.ts_target_bitrate[i-8]))
             die ("Invalid data rate %s", argv[i]);
 
     // Real time parameters
@@ -193,7 +201,7 @@ int main(int argc, char **argv) {
     cfg.kf_min_dist = cfg.kf_max_dist = 1000;
 
     // Temporal scaling parameters:
-    // NOTE: The 3 prediction frames cannot be used interchangebly due to
+    // NOTE: The 3 prediction frames cannot be used interchangeably due to
     // differences in the way they are handled throughout the code. The
     // frames should be allocated to layers in the order LAST, GF, ARF.
     // Other combinations work, but may produce slightly inferior results.
@@ -210,14 +218,15 @@ int main(int argc, char **argv) {
         cfg.ts_rate_decimator[1] = 1;
         memcpy(cfg.ts_layer_id, ids, sizeof(ids));
 
+#if 1
         // 0=L, 1=GF, Intra-layer prediction enabled
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
         layer_flags[1] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
                          VP8_EFLAG_NO_REF_ARF;
-#if 0
-        // 0=L, 1=GF, Intra-layer 1 prediction disabled
+#else
+        // 0=L, 1=GF, Intra-layer prediction disabled
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
@@ -275,7 +284,7 @@ int main(int argc, char **argv) {
     case 3:
     {
         // 3-layers, 4-frame period
-        int ids[6] = {0,2,1,2};
+        int ids[4] = {0,2,1,2};
         cfg.ts_number_layers     = 3;
         cfg.ts_periodicity       = 4;
         cfg.ts_rate_decimator[0] = 4;
@@ -295,13 +304,12 @@ int main(int argc, char **argv) {
                          VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
                          VP8_EFLAG_NO_UPD_ARF;
         break;
-        cfg.ts_rate_decimator[2] = 1;
     }
 
     case 4:
     {
         // 3-layers, 4-frame period
-        int ids[6] = {0,2,1,2};
+        int ids[4] = {0,2,1,2};
         cfg.ts_number_layers     = 3;
         cfg.ts_periodicity       = 4;
         cfg.ts_rate_decimator[0] = 4;
@@ -326,7 +334,7 @@ int main(int argc, char **argv) {
     case 5:
     {
         // 3-layers, 4-frame period
-        int ids[6] = {0,2,1,2};
+        int ids[4] = {0,2,1,2};
         cfg.ts_number_layers     = 3;
         cfg.ts_periodicity       = 4;
         cfg.ts_rate_decimator[0] = 4;
@@ -417,7 +425,7 @@ int main(int argc, char **argv) {
         flags = layer_flags[frame_cnt % cfg.ts_periodicity];
 
         frame_avail = read_frame(infile, &raw);
-        if (vpx_codec_encode(&codec, frame_avail? &raw : NULL, frame_cnt,
+        if (vpx_codec_encode(&codec, frame_avail? &raw : NULL, pts,
                             1, flags, VPX_DL_REALTIME))
             die_codec(&codec, "Failed to encode frame");
 
@@ -446,6 +454,7 @@ int main(int argc, char **argv) {
             fflush (stdout);
         }
         frame_cnt++;
+        pts += frame_duration;
     }
     printf ("\n");
     fclose (infile);
diff --git a/vpx/vp8.h b/vpx/vp8.h
index 983cc4ad4..eec979763 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -63,6 +63,7 @@ enum vp8_postproc_level
     VP8_DEBUG_TXT_MBLK_MODES    = 1<<4, /**< print macro block modes over each macro block */
     VP8_DEBUG_TXT_DC_DIFF       = 1<<5, /**< print dc diff for each macro block */
     VP8_DEBUG_TXT_RATE_INFO     = 1<<6, /**< print video rate info (encoder only) */
+    VP8_MFQE                    = 1<<10,
 };
 
 /*!\brief post process flags
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 1d9d53165..86610358c 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -55,6 +55,11 @@ enum vp8_dec_control_id
     /** check if the indicated frame is corrupted */
     VP8D_GET_FRAME_CORRUPTED,
 
+    /** control function to get info on which reference frames were used
+     *  by the last decode
+     */
+    VP8D_GET_LAST_REF_USED,
+
     VP8_DECODER_CTRL_ID_MAX
 } ;
 
@@ -69,7 +74,7 @@ enum vp8_dec_control_id
 
 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES,   int *)
 VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED,    int *)
-
+VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED,      int *)
 
 /*! @} - end defgroup vp8_decoder */
 
diff --git a/vpx_ports/vpxtypes.h b/vpx_ports/vpxtypes.h
index c7ccc0510..f2fb08954 100644
--- a/vpx_ports/vpxtypes.h
+++ b/vpx_ports/vpxtypes.h
@@ -96,11 +96,6 @@ typedef unsigned __int64 vpxu64;
 # define PRId64 "lld"
 # define VPX64 PRId64
 typedef long vpxs64;
-#elif defined(__SYMBIAN32__)
-# undef  PRId64
-# define PRId64 "u"
-# define VPX64 PRId64
-typedef unsigned int vpxs64;
 #else
 # error "64 bit integer type undefined for this platform!"
 #endif
diff --git a/vpx_scale/arm/armv4/gen_scalers_armv4.asm b/vpx_scale/arm/armv4/gen_scalers_armv4.asm
deleted file mode 100644
index e495184e7..000000000
--- a/vpx_scale/arm/armv4/gen_scalers_armv4.asm
+++ /dev/null
@@ -1,774 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |horizontal_line_4_5_scale_armv4|
-    EXPORT  |vertical_band_4_5_scale_armv4|
-    EXPORT  |horizontal_line_2_3_scale_armv4|
-    EXPORT  |vertical_band_2_3_scale_armv4|
-    EXPORT  |horizontal_line_3_5_scale_armv4|
-    EXPORT  |vertical_band_3_5_scale_armv4|
-    EXPORT  |horizontal_line_3_4_scale_armv4|
-    EXPORT  |vertical_band_3_4_scale_armv4|
-    EXPORT  |horizontal_line_1_2_scale_armv4|
-    EXPORT  |vertical_band_1_2_scale_armv4|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-src         RN  r0
-srcw        RN  r1
-dest        RN  r2
-mask        RN  r12
-c51_205     RN  r10
-c102_154    RN  r11
-;/****************************************************************************
-; *
-; *  ROUTINE       : horizontal_line_4_5_scale_armv4
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 4 to 5.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; ****************************************************************************/
-;void horizontal_line_4_5_scale_armv4
-;(
-;   r0 = UINT8 *source
-;   r1 = UINT32 source_width
-;   r2 = UINT8 *dest
-;   r3 = UINT32 dest_width
-;)
-|horizontal_line_4_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    mov     mask, #255              ; mask for selection
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-    ldr     r3, [src], #4
-
-hl45_loop
-
-    and     r4, r3, mask            ; a = src[0]
-    and     r5, mask, r3, lsr #8    ; b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r5, lsl #16     ; b | a
-    and     r7, mask, r3, lsr #16   ; c = src[2]
-    mul     r6, c51_205, r6         ; a * 51 + 205 * b
-
-    orr     r5, r5, r7, lsl #16     ; c | b
-    mul     r5, c102_154, r5        ; b * 102 + 154 * c
-    add     r6, r6, #0x8000
-    and     r8, mask, r3, lsr #24   ; d = src[3]
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r7, lsl #16     ; c | d
-    mul     r7, c102_154, r7        ; c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    ldr     r3, [src], #4
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    and     r9, mask, r3            ; e = src[4]
-    orr     r9, r9, r8, lsl #16     ; d | e
-    mul     r9, c51_205, r9         ; d * 205 + 51 * e
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    add     r9, r9, #0x8000
-    subs    srcw, srcw, #4
-    mov     r9, r9, lsr #24
-    strb    r9, [dest], #1
-
-    bne     hl45_loop
-
-    and     r4, r3, mask
-    and     r5, mask, r3, lsl #8
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r5, lsl #16     ; b | a
-    mul     r6, c51_205, r6
-
-    and     r7, mask, r3, lsl #16
-    orr     r5, r5, r7, lsl #16     ; c | b
-    mul     r5, c102_154, r5
-    add     r6, r6, #0x8000
-    and     r8, mask, r3, lsl #24
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r7, lsl #16     ; c | d
-    mul     r7, c102_154, r7
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    ldrb    r3, [src]
-    strb    r3, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vp8cx_horizontal_line_4_5_scale_c|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vertical_band_4_5_scale_armv4
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
-; *                  height of the band scaled is 4-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_4_5_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_4_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-vl45_loop
-    mov     r3, src
-    ldrb    r4, [r3], r1            ; a = des [0]
-    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
-    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
-    add     lr, src, r1
-
-    orr     r6, r4, r5, lsl #16     ; b | a
-    mul     r6, c51_205, r6         ; a * 51 + 205 * b
-
-    ldrb    r8, [r3], r1            ; d = des[dest_pitch*3]
-    orr     r5, r5, r7, lsl #16     ; c | b
-    mul     r5, c102_154, r5        ; b * 102 + 154 * c
-    add     r6, r6, #0x8000
-    orr     r7, r8, r7, lsl #16     ; c | d
-    mov     r6, r6, lsr #24
-    strb    r6, [lr], r1
-
-    ldrb    r9, [r3, r1]            ; e = des [dest_pitch * 5]
-    mul     r7, c102_154, r7        ; c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    orr     r9, r9, r8, lsl #16     ; d | e
-    mov     r5, r5, lsr #24
-    strb    r5, [lr], r1
-
-    mul     r9, c51_205, r9         ; d * 205 + 51 * e
-    add     r7, r7, #0x8000
-    add     src, src, #1
-    mov     r7, r7, lsr #24
-    strb    r7, [lr], r1
-
-    add     r9, r9, #0x8000
-    subs    r2, r2, #1
-    mov     r9, r9, lsr #24
-    strb    r9, [lr], r1
-
-    bne     vl45_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vertical_band_4_5_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : horizontal_line_2_3_scale_armv4
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 2 to 3.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; *
-; ****************************************************************************/
-;void horizontal_line_2_3_scale_armv4
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_2_3_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-    ldr     lr,  =85
-    ldr     r12, =171
-
-hl23_loop
-
-    ldrb    r3, [src], #1           ; a
-    ldrb    r4, [src], #1           ; b
-    ldrb    r5, [src]               ; c
-
-    strb    r3, [dest], #1
-    mul     r4, r12, r4             ; b * 171
-    mla     r6, lr, r3, r4          ; a * 85
-    mla     r7, lr, r5, r4          ; c * 85
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [dest], #1
-
-    add     r7, r7, #128
-    mov     r7, r7, lsr #8
-    strb    r7, [dest], #1
-
-    subs    srcw, srcw, #2
-    bne     hl23_loop
-
-    ldrb    r4, [src, #1]           ; b
-    strb    r5, [dest], #1
-    strb    r4, [dest, #1]
-
-    mul     r4, r12, r4             ; b * 171
-    mla     r6, lr, r5, r4          ; a * 85 + b *171
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [dest]
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|horizontal_line_2_3_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vertical_band_2_3_scale_armv4
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
-; *                  height of the band scaled is 2-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_2_3_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_2_3_scale_armv4| PROC
-    stmdb   sp!, {r4 - r8, lr}
-    ldr     lr,  =85
-    ldr     r12, =171
-    add     r3, r1, r1, lsl #1      ; 3 * dest_pitch
-
-vl23_loop
-    ldrb    r4, [src]               ; a = des [0]
-    ldrb    r5, [src, r1]           ; b = des [dest_pitch]
-    ldrb    r7, [src, r3]           ; c = des [dest_pitch*3]
-    subs    r2, r2, #1
-
-    mul     r5, r12, r5             ; b * 171
-    mla     r6, lr, r4, r5          ; a * 85
-    mla     r8, lr, r7, r5          ; c * 85
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [src, r1]
-
-    add     r8, r8, #128
-    mov     r8, r8, lsr #8
-    strb    r8, [src, r1, lsl #1]
-
-    add     src, src, #1
-
-    bne     vl23_loop
-
-    ldmia   sp!, {r4 - r8, pc}
-    ENDP    ;|vertical_band_2_3_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 3 to 5.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; *
-; ****************************************************************************/
-;void vp8cx_horizontal_line_3_5_scale_c
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_3_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-    ldrb    r4, [src], #1           ; a = src[0]
-
-hl35_loop
-
-    ldrb    r8, [src], #1           ; b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r8, lsl #16     ; b | a
-    ldrb    r9, [src], #1           ; c = src[2]
-    mul     r6, c102_154, r6        ; a * 102 + 154 * b
-
-    orr     r5, r9, r8, lsl #16     ; b | c
-    mul     r5, c51_205, r5         ; b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    ldrb    r4, [src], #1           ; d = src[3]
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r9, lsl #16     ; c | b
-    mul     r7, c51_205, r7         ; c * 205 + 154 * b
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    orr     r9, r4, r9, lsl #16     ; c | d
-    mul     r9, c102_154, r9        ; c * 154 + 102 * d
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    add     r9, r9, #0x8000
-    subs    srcw, srcw, #3
-    mov     r9, r9, lsr #24
-    strb    r9, [dest], #1
-
-    bpl     hl35_loop
-
-    ldrb    r5, [src], #1           ; b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r8, lsl #16     ; b | a
-    ldrb    r9, [src], #1           ; c = src[2]
-    mul     r6, c102_154, r6        ; a * 102 + 154 * b
-
-    orr     r5, r9, r8, lsl #16     ; b | c
-    mul     r5, c51_205, r5         ; b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r9, lsl #16     ; c | b
-    mul     r7, c51_205, r7         ; c * 205 + 154 * b
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-    strb    r9, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vp8cx_horizontal_line_3_5_scale_c|
-
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
-; *                  height of the band scaled is 3-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_4_5_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_3_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-vl35_loop
-    mov     r3, src
-    ldrb    r4, [r3], r1            ; a = des [0]
-    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
-    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
-    add     lr, src, r1
-
-    orr     r8, r4, r5, lsl #16     ; b | a
-    mul     r6, c102_154, r8        ; a * 102 + 154 * b
-
-    ldrb    r8, [r3, r1, lsl #1]    ; d = des[dest_pitch*5]
-    orr     r3, r7, r5, lsl #16     ; b | c
-    mul     r9, c51_205, r3         ; b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    orr     r3, r5, r7, lsl #16     ; c | b
-    mov     r6, r6, lsr #24
-    strb    r6, [lr], r1
-
-    mul     r5, c51_205, r3         ; c * 205 + 154 * b
-    add     r9, r9, #0x8000
-    orr     r3, r8, r7, lsl #16     ; c | d
-    mov     r9, r9, lsr #24
-    strb    r9, [lr], r1
-
-    mul     r7, c102_154, r3        ; c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    add     src, src, #1
-    mov     r5, r5, lsr #24
-    strb    r5, [lr], r1
-
-    add     r7, r7, #0x8000
-    subs    r2, r2, #1
-    mov     r7, r7, lsr #24
-    strb    r7, [lr], r1
-
-
-    bne     vl35_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vertical_band_3_5_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : horizontal_line_3_4_scale_armv4
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 3 to 4.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; *
-; ****************************************************************************/
-;void horizontal_line_3_4_scale_armv4
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_3_4_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r10, =64
-    ldr     r11, =192
-    mov     r9, #128
-
-    ldrb    r4, [src], #1           ; a = src[0]
-
-hl34_loop
-
-    ldrb    r8, [src], #1           ; b = src[1]
-    ldrb    r7, [src], #1           ; c = src[2]
-    strb    r4, [dest], #1
-
-    mla     r4, r10, r4, r9         ; a*64 + 128
-    mla     r4, r11, r8, r4         ; a*64 + b*192 + 1
-
-    add     r8, r8, #1              ; b + 1
-    add     r8, r8, r7              ; b + c + 1
-    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
-
-    mov     r4, r4, asr #8          ; (a*64 + b*192 + 1) >> 8
-    strb    r4, [dest], #1
-
-    strb    r8, [dest], #1
-
-    ldrb    r4, [src], #1           ; [a+1]
-
-    mla     r7, r11, r7, r9         ; c*192 + 128
-    mla     r7, r4, r10, r7         ; a*64 + b*192 + 128
-
-    subs    srcw, srcw, #3
-
-    mov     r7, r7, asr #8          ; (a*64 + b*192 + 128) >> 8
-    strb    r7, [dest], #1
-
-    bpl     hl34_loop
-
-    ldrb    r8, [src], #1           ; b = src[1]
-    ldrb    r7, [src], #1           ; c = src[2]
-    strb    r4, [dest], #1
-
-    mla     r4, r10, r4, r9         ; a*64 + 128
-    mla     r4, r11, r8, r4         ; a*64 + b*192 + 1
-    mov     r4, r4, asr #8          ; (a*64 + b*192 + 1) >> 8
-    strb    r4, [dest], #1
-
-    add     r8, r8, #1              ; b + 1
-    add     r8, r8, r7              ; b + c + 1
-    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
-    strb    r8, [dest], #1
-    strb    r7, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vp8cx_horizontal_line_3_4_scale_c|
-
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vertical_band_3_4_scale_armv4
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
-; *                  height of the band scaled is 3-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_3_4_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_3_4_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r10, =64
-    ldr     r11, =192
-    mov     r9, #128
-
-;   ldr     r1,[r1]
-vl34_loop
-    mov     r3, src
-    ldrb    r4, [r3], r1            ; a = des [0]
-    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
-    ldrb    r7, [r3], r1            ; c = des [dest_pitch*2]
-    add     lr, src, r1
-
-    mla     r4, r10, r4, r9         ; a*64 + 128
-    mla     r4, r11, r5, r4         ; a*64 + b*192 + 1
-
-    add     r5, r5, #1              ; b + 1
-    add     r5, r5, r7              ; b + c + 1
-    mov     r5, r5, asr #1          ; (b + c + 1) >> 1
-
-    mov     r4, r4, asr #8          ; (a*64 + b*192 + 1) >> 8
-    strb    r4, [lr], r1
-
-    ldrb    r4, [r3, r1]            ; a = des [dest_pitch*4]
-
-    strb    r5, [lr], r1
-
-    mla     r7, r11, r7, r9         ; c*192 + 128
-    mla     r7, r4, r10, r7         ; a*64 + b*192 + 128
-    mov     r7, r7, asr #8          ; (a*64 + b*192 + 128) >> 8
-
-    add     src, src, #1
-    subs    r2, r2, #1
-
-    strb    r7, [lr]
-
-    bne     vl34_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vertical_band_3_4_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 1 to 2.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; ****************************************************************************/
-;void vp8cx_horizontal_line_1_2_scale_c
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_1_2_scale_armv4| PROC
-    stmdb   sp!, {r4 - r5, lr}
-
-    sub     srcw, srcw, #1
-
-    ldrb    r3, [src], #1
-    ldrb    r4, [src], #1
-hl12_loop
-    subs    srcw, srcw, #1
-
-    add     r5, r3, r4
-    add     r5, r5, #1
-    mov     r5, r5, lsr #1
-
-    orr     r5, r3, r5, lsl #8
-    strh    r5, [dest], #2
-
-    mov     r3, r4
-
-    ldrneb  r4, [src], #1
-    bne     hl12_loop
-
-    orr     r5, r4, r4, lsl #8
-    strh    r5, [dest]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ;|vertical_band_3_5_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
-; *                  height of the band scaled is 1-pixel.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vp8cx_vertical_band_1_2_scale_c
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_1_2_scale_armv4| PROC
-    stmdb   sp!, {r4 - r7, lr}
-
-    ldr     mask, =0xff00ff             ; mask for selection
-    ldr     lr, = 0x010001
-
-vl12_loop
-    mov     r3, src
-    ldr     r4, [r3], r1
-    ldr     r5, [r3, r1]
-
-    add     src, src, #4
-    subs    r2, r2, #4
-
-    and     r6, r4, mask
-    and     r7, r5, mask
-
-    add     r6, r7, r6
-    add     r6, r6, lr
-
-    and     r4, mask, r4, lsr #8
-    and     r5, mask, r5, lsr #8
-
-    mov     r6, r6, lsr #1
-    and     r6, r6, mask
-
-    add     r4, r5, r4
-    add     r4, r4, lr
-
-    mov     r4, r4, lsr #1
-    and     r4, r4, mask
-
-    orr     r5, r6, r4, lsl #8
-
-    str     r5, [r3]
-
-    bpl     vl12_loop
-
-    ldmia   sp!, {r4 - r7, pc}
-    ENDP    ;|vertical_band_3_5_scale_armv4|
-
-    END
diff --git a/vpx_scale/generic/vpxscale.c b/vpx_scale/generic/vpxscale.c
index 13c9122f0..29b130876 100644
--- a/vpx_scale/generic/vpxscale.c
+++ b/vpx_scale/generic/vpxscale.c
@@ -27,7 +27,6 @@
 /****************************************************************************
 *  Exports
 ****************************************************************************/
-#ifndef VPX_NO_GLOBALS
 void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
 void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
 void (*vp8_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0;
@@ -51,9 +50,6 @@ void (*vp8_vertical_band_2_1_scale_i)(unsigned char *source, unsigned int src_pi
 void (*vp8_horizontal_line_2_1_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
 void (*vp8_horizontal_line_5_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
 void (*vp8_horizontal_line_5_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0;
-#else
-# include "vpxscale_nofp.h"
-#endif
 
 typedef struct
 {
diff --git a/vpx_scale/include/arm/vpxscale_nofp.h b/vpx_scale/include/arm/vpxscale_nofp.h
deleted file mode 100644
index 3e1a9fa83..000000000
--- a/vpx_scale/include/arm/vpxscale_nofp.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-void  vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-
-void  vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-
-void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-
-#define vp8_vertical_band_4_5_scale     vertical_band_4_5_scale_armv4
-#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c
-#define vp8_vertical_band_2_3_scale     vertical_band_2_3_scale_armv4
-#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c
-#define vp8_vertical_band_3_5_scale     vertical_band_3_5_scale_armv4
-#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c
-#define vp8_vertical_band_3_4_scale     vertical_band_3_4_scale_armv4
-#define vp8_last_vertical_band_3_4_scale vp8cx_last_vertical_band_3_4_scale_c
-#define vp8_horizontal_line_1_2_scale   horizontal_line_1_2_scale_armv4
-#define vp8_horizontal_line_3_5_scale   horizontal_line_3_5_scale_armv4
-#define vp8_horizontal_line_3_4_scale   horizontal_line_3_4_scale_armv4
-#define vp8_horizontal_line_4_5_scale   horizontal_line_4_5_scale_armv4
-#define vp8_horizontal_line_2_3_scale   horizontal_line_2_3_scale_armv4
-#define vp8_vertical_band_1_2_scale     vertical_band_1_2_scale_armv4
-#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c
-#define vp8_vertical_band_5_4_scale     vp8cx_vertical_band_5_4_scale_c
-#define vp8_vertical_band_5_3_scale     vp8cx_vertical_band_5_3_scale_c
-#define vp8_vertical_band_2_1_scale     vp8cx_vertical_band_2_1_scale_c
-#define vp8_vertical_band_2_1_scale_i   vp8cx_vertical_band_2_1_scale_i_c
-#define vp8_horizontal_line_2_1_scale   vp8cx_horizontal_line_2_1_scale_c
-#define vp8_horizontal_line_5_3_scale   vp8cx_horizontal_line_5_3_scale_c
-#define vp8_horizontal_line_5_4_scale   vp8cx_horizontal_line_5_4_scale_c
diff --git a/vpx_scale/include/generic/vpxscale_nofp.h b/vpx_scale/include/generic/vpxscale_nofp.h
deleted file mode 100644
index 7b8205a1b..000000000
--- a/vpx_scale/include/generic/vpxscale_nofp.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-void  vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-
-void  vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-
-#define vp8_vertical_band_4_5_scale     vp8cx_vertical_band_4_5_scale_c
-#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c
-#define vp8_vertical_band_2_3_scale     vp8cx_vertical_band_2_3_scale_c
-#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c
-#define vp8_vertical_band_3_5_scale     vp8cx_vertical_band_3_5_scale_c
-#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c
-#define vp8_horizontal_line_1_2_scale   vp8cx_horizontal_line_1_2_scale_c
-#define vp8_horizontal_line_3_5_scale   vp8cx_horizontal_line_3_5_scale_c
-#define vp8_horizontal_line_4_5_scale   vp8cx_horizontal_line_4_5_scale_c
-#define vp8_horizontal_line_2_3_scale   vp8cx_horizontal_line_2_3_scale_c
-#define vp8_vertical_band_1_2_scale     vp8cx_vertical_band_1_2_scale_c
-#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c
-#define vp8_vertical_band_5_4_scale     vp8cx_vertical_band_5_4_scale_c
-#define vp8_vertical_band_5_3_scale     vp8cx_vertical_band_5_3_scale_c
-#define vp8_vertical_band_2_1_scale     vp8cx_vertical_band_2_1_scale_c
-#define vp8_vertical_band_2_1_scale_i   vp8cx_vertical_band_2_1_scale_i_c
-#define vp8_horizontal_line_2_1_scale   vp8cx_horizontal_line_2_1_scale_c
-#define vp8_horizontal_line_5_3_scale   vp8cx_horizontal_line_5_3_scale_c
-#define vp8_horizontal_line_5_4_scale   vp8cx_horizontal_line_5_4_scale_c
diff --git a/vpx_scale/include/symbian/vpxscale_nofp.h b/vpx_scale/include/symbian/vpxscale_nofp.h
deleted file mode 100644
index 3e1a9fa83..000000000
--- a/vpx_scale/include/symbian/vpxscale_nofp.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-void  vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-
-void  vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void  vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void  vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-
-void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-
-#define vp8_vertical_band_4_5_scale     vertical_band_4_5_scale_armv4
-#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c
-#define vp8_vertical_band_2_3_scale     vertical_band_2_3_scale_armv4
-#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c
-#define vp8_vertical_band_3_5_scale     vertical_band_3_5_scale_armv4
-#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c
-#define vp8_vertical_band_3_4_scale     vertical_band_3_4_scale_armv4
-#define vp8_last_vertical_band_3_4_scale vp8cx_last_vertical_band_3_4_scale_c
-#define vp8_horizontal_line_1_2_scale   horizontal_line_1_2_scale_armv4
-#define vp8_horizontal_line_3_5_scale   horizontal_line_3_5_scale_armv4
-#define vp8_horizontal_line_3_4_scale   horizontal_line_3_4_scale_armv4
-#define vp8_horizontal_line_4_5_scale   horizontal_line_4_5_scale_armv4
-#define vp8_horizontal_line_2_3_scale   horizontal_line_2_3_scale_armv4
-#define vp8_vertical_band_1_2_scale     vertical_band_1_2_scale_armv4
-#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c
-#define vp8_vertical_band_5_4_scale     vp8cx_vertical_band_5_4_scale_c
-#define vp8_vertical_band_5_3_scale     vp8cx_vertical_band_5_3_scale_c
-#define vp8_vertical_band_2_1_scale     vp8cx_vertical_band_2_1_scale_c
-#define vp8_vertical_band_2_1_scale_i   vp8cx_vertical_band_2_1_scale_i_c
-#define vp8_horizontal_line_2_1_scale   vp8cx_horizontal_line_2_1_scale_c
-#define vp8_horizontal_line_5_3_scale   vp8cx_horizontal_line_5_3_scale_c
-#define vp8_horizontal_line_5_4_scale   vp8cx_horizontal_line_5_4_scale_c
diff --git a/vpx_scale/include/vpxscale_nofp.h b/vpx_scale/include/vpxscale_nofp.h
deleted file mode 100644
index a704bd92c..000000000
--- a/vpx_scale/include/vpxscale_nofp.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#if defined(__S60_V20__) || defined(__SYMBIAN32__) && !defined(__WINS__)
-#include "symbian\vpxscale_nofp.h"
-#else
-#include "generic\vpxscale_nofp.h"
-#endif
diff --git a/vpx_scale/vpxscale.h b/vpx_scale/vpxscale.h
index a13a65f57..1a4997c55 100644
--- a/vpx_scale/vpxscale.h
+++ b/vpx_scale/vpxscale.h
@@ -61,19 +61,6 @@ extern void (*vp8_horizontal_line_2_1_scale)(const unsigned char *source, unsign
 extern void (*vp8_horizontal_line_5_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 extern void (*vp8_horizontal_line_5_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 
-void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-
-
-extern void  dmachine_specific_config(int mmx_enabled, int xmm_enabled, int wmt_enabled);
 extern void vp8_yv12_scale_or_center
 (
     YV12_BUFFER_CONFIG *src_yuv_config,
diff --git a/vpxdec.c b/vpxdec.c
index 7401101f8..4482f3dc7 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -124,11 +124,13 @@ static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1,
                                        "Display only selected block modes");
 static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1,
                                        "Draw only selected motion vectors");
+static const arg_def_t mfqe = ARG_DEF(NULL, "mfqe", 0,
+                                       "Enable multiframe quality enhancement");
 
 static const arg_def_t *vp8_pp_args[] =
 {
     &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
-    &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs,
+    &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs, &mfqe,
     NULL
 };
 #endif
@@ -803,6 +805,11 @@ int main(int argc, const char **argv_)
             postproc = 1;
             vp8_pp_cfg.post_proc_flag |= VP8_DEBLOCK;
         }
+        else if (arg_match(&arg, &mfqe, argi))
+        {
+            postproc = 1;
+            vp8_pp_cfg.post_proc_flag |= VP8_MFQE;
+        }
         else if (arg_match(&arg, &pp_debug_info, argi))
         {
             unsigned int level = arg_parse_uint(&arg);