22 files changed, 754 insertions, 656 deletions
diff --git a/configure b/configure
index 11e086e9c..f32fffe38 100755
--- a/configure
+++ b/configure
@@ -41,6 +41,7 @@ Advanced options:
   ${toggle_shared}                shared library support
   ${toggle_small}                 favor smaller size over speed
   ${toggle_arm_asm_detok}         assembly version of the detokenizer (ARM platforms only)
+  ${toggle_postproc_visualizer}   macro block / block level visualizers
 
 Codecs:
   Codecs can be selectively enabled or disabled individually, or by family:
@@ -250,6 +251,7 @@ CONFIG_LIST="
     shared
     small
     arm_asm_detok
+    postproc_visualizer
 "
 CMDLINE_SELECT="
     extra_warnings
@@ -289,6 +291,7 @@ CMDLINE_SELECT="
     shared
     small
     arm_asm_detok
+    postproc_visualizer
 "
 
 process_cmdline() {
@@ -325,8 +328,6 @@ post_process_cmdline() {
     for c in ${CODECS}; do
         enabled ${c} && enable ${c##*_}s
     done
-
-
 }
 
 
@@ -536,6 +537,10 @@ process_toolchain() {
 
     # Other toolchain specific defaults
     case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+
+    if enabled postproc_visualizer; then
+        enabled postproc || die "postproc_visualizer requires postproc to be enabled"
+    fi
 }
 
 
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index a006306db..3c199d1c2 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -204,7 +204,7 @@ extern "C"
 // and not just a copy of the pointer..
     int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp);
     int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush);
-    int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags);
+    int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
 
     int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
     int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);
diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h
index 00a97d97d..e53bc3138 100644
--- a/vp8/common/onyxd.h
+++ b/vp8/common/onyxd.h
@@ -51,7 +51,7 @@ extern "C"
     int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);
 
     int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp);
-    int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags);
+    int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags);
 
     int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
     int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index e797e1036..15b1c2c89 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -26,7 +26,7 @@
     ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
 
 /* global constants */
-
+#if CONFIG_POSTPROC_VISUALIZER
 static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
 {
     { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
@@ -59,13 +59,14 @@ static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] =
     { RGB_TO_YUV(0xccff33) },   /* Yellow */
 };
 
-static const unsigned char MV_REFERENCE_FRAME_colors[MB_MODE_COUNT][3] =
+static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
 {
     { RGB_TO_YUV(0x00ff00) },   /* Blue */
     { RGB_TO_YUV(0x0000ff) },   /* Green */
     { RGB_TO_YUV(0xffff00) },   /* Yellow */
     { RGB_TO_YUV(0xff0000) },   /* Red */
 };
+#endif
 
 static const short kernel5[] =
 {
@@ -677,10 +678,13 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
 #define RTCD_VTABLE(oci) NULL
 #endif
 
-int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
+int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
 {
     char message[512];
     int q = oci->filter_level * 10 / 6;
+    int flags = ppflags->post_proc_flag;
+    int deblock_level = ppflags->deblocking_level;
+    int noise_level = ppflags->noise_level;
 
     if (!oci->frame_to_show)
         return -1;
@@ -737,7 +741,8 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
          oci->post_proc_buffer.y_stride);
     }
 
-    if (flags & VP8D_DEBUG_LEVEL1)
+#if CONFIG_POSTPROC_VISUALIZER
+    if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
     {
         sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
                 (oci->frame_type == KEY_FRAME),
@@ -749,7 +754,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
         vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
     }
 
-    if (flags & VP8D_DEBUG_LEVEL2)
+    if (flags & VP8D_DEBUG_TXT_MBLK_MODES)
     {
         int i, j;
         unsigned char *y_ptr;
@@ -781,7 +786,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
         }
     }
 
-    if (flags & VP8D_DEBUG_LEVEL3)
+    if (flags & VP8D_DEBUG_TXT_DC_DIFF)
     {
         int i, j;
         unsigned char *y_ptr;
@@ -816,45 +821,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
         }
     }
 
-    if (flags & VP8D_DEBUG_LEVEL4)
+    if (flags & VP8D_DEBUG_TXT_RATE_INFO)
     {
         sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
         vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
-#if 0
-        int i, j;
-        unsigned char *y_ptr;
-        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-        int mb_rows = post->y_height >> 4;
-        int mb_cols = post->y_width  >> 4;
-        int mb_index = 0;
-        MODE_INFO *mi = oci->mi;
-
-        y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-        /* vp8_filter each macro block */
-        for (i = 0; i < mb_rows; i++)
-        {
-            for (j = 0; j < mb_cols; j++)
-            {
-                char zz[4];
-
-                sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
-                vp8_blit_text(zz, y_ptr, post->y_stride);
-                mb_index ++;
-                y_ptr += 16;
-            }
-
-            mb_index ++; /* border */
-            y_ptr += post->y_stride  * 16 - post->y_width;
-
-        }
-
-#endif
-
     }
 
     /* Draw motion vectors */
-    if (flags & VP8D_DEBUG_DRAW_MV)
+    if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag)
     {
         YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
         int width  = post->y_width;
@@ -871,6 +845,12 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
             {
                 int x1, y1;
 
+                if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode)))
+                {
+                    mi++;
+                    continue;
+                }
+
                 if (mi->mbmi.mode == SPLITMV)
                 {
                     switch (mi->mbmi.partitioning)
@@ -996,6 +976,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
                     else
                         vp8_blit_line  (lx0,  x1, ly0,  y1, y_buffer, y_stride);
                 }
+
                 mi++;
             }
             mi++;
@@ -1003,7 +984,8 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
     }
 
     /* Color in block modes */
-    if (flags & VP8D_DEBUG_CLR_BLK_MODES)
+    if ((flags & VP8D_DEBUG_CLR_BLK_MODES)
+        && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag))
     {
         int y, x;
         YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
@@ -1021,7 +1003,8 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
             {
                 int Y = 0, U = 0, V = 0;
 
-                if (mi->mbmi.mode == B_PRED)
+                if (mi->mbmi.mode == B_PRED &&
+                    ((ppflags->display_mb_modes_flag & (1<<B_PRED)) || ppflags->display_b_modes_flag))
                 {
                     int by, bx;
                     unsigned char *yl, *ul, *vl;
@@ -1035,13 +1018,16 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
                     {
                         for (bx = 0; bx < 16; bx += 4)
                         {
-                            Y = B_PREDICTION_MODE_colors[bmi->mode][0];
-                            U = B_PREDICTION_MODE_colors[bmi->mode][1];
-                            V = B_PREDICTION_MODE_colors[bmi->mode][2];
-
-                            POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
-                                (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
+                            if ((ppflags->display_b_modes_flag & (1<<bmi->mode)) /* this sub-block's own mode bit */
+                                || (ppflags->display_mb_modes_flag & (1<<B_PRED))) /* or B_PRED selected as an MB mode */
+                            {
+                                Y = B_PREDICTION_MODE_colors[bmi->mode][0];
+                                U = B_PREDICTION_MODE_colors[bmi->mode][1];
+                                V = B_PREDICTION_MODE_colors[bmi->mode][2];
 
+                                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
+                                    (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
+                            }
                             bmi++;
                         }
 
@@ -1050,7 +1036,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
                         vl += y_stride*1;
                     }
                 }
-                else
+                else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode))
                 {
                     Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
                     U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
@@ -1059,6 +1045,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
                     POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)
                         (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
                 }
+
                 mi++;
             }
             y_ptr += y_stride*16;
@@ -1070,7 +1057,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
     }
 
     /* Color in frame reference blocks */
-    if (flags & VP8D_DEBUG_CLR_FRM_REF_BLKS)
+    if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag)
     {
         int y, x;
         YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
@@ -1088,12 +1075,15 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
             {
                 int Y = 0, U = 0, V = 0;
 
-                Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
-                U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
-                V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+                if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame))
+                {
+                    Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+                    U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+                    V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
 
-                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
-                    (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
+                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+                }
 
                 mi++;
             }
@@ -1104,6 +1094,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
             mi++;
         }
     }
+#endif
 
     *dest = oci->post_proc_buffer;
 
diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h
index 7485135bf..c641b9ca5 100644
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -111,7 +111,7 @@ struct postproc_state
 #include "onyxc_int.h"
 #include "ppflags.h"
 int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
-                        int deblock_level, int noise_level, int flags);
+                        vp8_ppflags_t *flags);
 
 
 void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
diff --git a/vp8/common/ppflags.h b/vp8/common/ppflags.h
index b8d713cf0..65b0cab6a 100644
--- a/vp8/common/ppflags.h
+++ b/vp8/common/ppflags.h
@@ -17,13 +17,24 @@ enum
     VP8D_DEBLOCK                = 1<<0,
     VP8D_DEMACROBLOCK           = 1<<1,
     VP8D_ADDNOISE               = 1<<2,
-    VP8D_DEBUG_LEVEL1           = 1<<3,
-    VP8D_DEBUG_LEVEL2           = 1<<4,
-    VP8D_DEBUG_LEVEL3           = 1<<5,
-    VP8D_DEBUG_LEVEL4           = 1<<6,
+    VP8D_DEBUG_TXT_FRAME_INFO   = 1<<3,
+    VP8D_DEBUG_TXT_MBLK_MODES   = 1<<4,
+    VP8D_DEBUG_TXT_DC_DIFF      = 1<<5,
+    VP8D_DEBUG_TXT_RATE_INFO    = 1<<6,
     VP8D_DEBUG_DRAW_MV          = 1<<7,
     VP8D_DEBUG_CLR_BLK_MODES    = 1<<8,
     VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
 };
 
+typedef struct
+{
+    int post_proc_flag;         /* bitmask of the VP8D_* values above; read as `flags` by vp8_post_proc_frame() */
+    int deblocking_level;       /* copied into deblock_level by vp8_post_proc_frame() */
+    int noise_level;            /* copied into noise_level by vp8_post_proc_frame() */
+    int display_ref_frame_flag; /* tested as (1 << mbmi.ref_frame) when VP8D_DEBUG_CLR_FRM_REF_BLKS is set */
+    int display_mb_modes_flag;  /* tested as (1 << mbmi.mode) when VP8D_DEBUG_CLR_BLK_MODES is set */
+    int display_b_modes_flag;   /* non-zero enables sub-block colouring of B_PRED MBs; presumably one bit per B mode - confirm */
+    int display_mv_flag;        /* tested as (1 << mbmi.mode) when VP8D_DEBUG_DRAW_MV is set */
+} vp8_ppflags_t;
+
 #endif
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 6eda45e4a..aa2709f5b 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -506,7 +506,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
     pbi->common.error.setjmp = 0;
     return retcode;
 }
-int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags)
+int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags)
 {
     int ret = -1;
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -524,7 +524,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,
 
     sd->clrtype = pbi->common.clr_type;
 #if CONFIG_POSTPROC
-    ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
+    ret = vp8_post_proc_frame(&pbi->common, sd, flags);
 #else
 
     if (pbi->common.frame_to_show)
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index b67edd39f..2aac20b31 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -146,16 +146,25 @@ static const int qzbin_factors_y2[129] =
 
 #define EXACT_QUANT
 #ifdef EXACT_QUANT
-static void vp8cx_invert_quant(short *quant, short *shift, short d)
+static void vp8cx_invert_quant(int improved_quant, short *quant,
+                               short *shift, short d)
 {
-    unsigned t;
-    int l;
-    t = d;
-    for(l = 0; t > 1; l++)
-        t>>=1;
-    t = 1 + (1<<(16+l))/d;
-    *quant = (short)(t - (1<<16));
-    *shift = l;
+    if(improved_quant) // multiplier-plus-shift form of 1/d
+    {
+        unsigned t;
+        int l;
+        t = d;
+        for(l = 0; t > 1; l++) // counts the halvings needed to bring d down to 1
+            t>>=1;
+        t = 1 + (1<<(16+l))/d;
+        *quant = (short)(t - (1<<16)); // only the amount above 1<<16 is stored
+        *shift = l;
+    }
+    else // single 16-bit reciprocal, no post-shift
+    {
+        *quant = (1 << 16) / d;
+        *shift = 0;
+    }
 }
 
 void vp8cx_init_quantizer(VP8_COMP *cpi)
@@ -170,7 +179,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
     {
         // dc values
         quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
-        vp8cx_invert_quant(cpi->Y1quant[Q] + 0,
+        vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
                            cpi->Y1quant_shift[Q] + 0, quant_val);
         cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
         cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -178,7 +187,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
         cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
 
         quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
-        vp8cx_invert_quant(cpi->Y2quant[Q] + 0,
+        vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,
                            cpi->Y2quant_shift[Q] + 0, quant_val);
         cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
         cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -186,7 +195,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
         cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
 
         quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
-        vp8cx_invert_quant(cpi->UVquant[Q] + 0,
+        vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,
                            cpi->UVquant_shift[Q] + 0, quant_val);
         cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
         cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -199,7 +208,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
             int rc = vp8_default_zig_zag1d[i];
 
             quant_val = vp8_ac_yquant(Q);
-            vp8cx_invert_quant(cpi->Y1quant[Q] + rc,
+            vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,
                                cpi->Y1quant_shift[Q] + rc, quant_val);
             cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
             cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -207,7 +216,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
             cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
 
             quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            vp8cx_invert_quant(cpi->Y2quant[Q] + rc,
+            vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
                                cpi->Y2quant_shift[Q] + rc, quant_val);
             cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
             cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -215,7 +224,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
             cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
 
             quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            vp8cx_invert_quant(cpi->UVquant[Q] + rc,
+            vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
                                cpi->UVquant_shift[Q] + rc, quant_val);
             cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
             cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -405,14 +414,14 @@ void encode_mb_row(VP8_COMP *cpi,
     // Set up limit values for vertical motion vector components
     // to prevent them extending beyond the UMV borders
     x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) 
+    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
                         + (VP8BORDERINPIXELS - 16);
 
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
     {
-        // Distance of Mb to the left & right edges, specified in 
-        // 1/8th pel units as they are always compared to values 
+        // Distance of Mb to the left & right edges, specified in
+        // 1/8th pel units as they are always compared to values
         // that are in 1/8th pel units
         xd->mb_to_left_edge = -((mb_col * 16) << 3);
         xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
@@ -420,7 +429,7 @@ void encode_mb_row(VP8_COMP *cpi,
         // Set up limit values for horizontal motion vector components
         // to prevent them extending beyond the UMV borders
         x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
-        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) 
+        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
                             + (VP8BORDERINPIXELS - 16);
 
         xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 043eac219..e9753ac48 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -309,8 +309,10 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
     eob = d->eob;
 
     /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-    /* TODO: These should vary with the block type, since the quantizer does. */
     rdmult = (mb->rdmult << 2)*err_mult;
+    if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
+        rdmult = (rdmult * 9)>>4;
+
     rddiv = mb->rddiv;
     best_mask[0] = best_mask[1] = 0;
     /* Initialize the sentinel node of the trellis. */
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 8a94fa369..a7f5ce44c 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -1439,7 +1439,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 
         // Boost for arf frame
         Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
-        Boost += (cpi->baseline_gf_interval * 50);
+        Boost += (i * 50);
         allocation_chunks = (i * 100) + Boost;
 
         // Normalize Altboost and allocations chunck down to prevent overflow
@@ -1738,16 +1738,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 
         vp8_avg_stats(&sectionstats);
 
-        if (sectionstats.pcnt_motion < .17)
-            cpi->section_is_low_motion = 1;
-        else
-            cpi->section_is_low_motion = 0;
-
-        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
-            cpi->section_is_fast_motion = 1;
-        else
-            cpi->section_is_fast_motion = 0;
-
         cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
 
         Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
@@ -1980,7 +1970,14 @@ void vp8_second_pass(VP8_COMP *cpi)
             cpi->ni_av_qi                     = cpi->worst_quality;
         }
     }
-    else
+    // The last few frames of a clip almost always have too few or too many
+    // bits and for the sake of over exact rate control we don't want to make
+    // radical adjustments to the allowed quantizer range just to use up a
+    // few surplus bits or get beneath the target rate.
+    else if ( (cpi->common.current_video_frame <
+                  (((unsigned int)cpi->total_stats->count * 255)>>8)) &&
+              ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+                  (unsigned int)cpi->total_stats->count) )
     {
         if (frames_left < 1)
             frames_left = 1;
@@ -2344,17 +2341,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 
         vp8_avg_stats(&sectionstats);
 
-        if (sectionstats.pcnt_motion < .17)
-            cpi->section_is_low_motion = 1;
-        else
-            cpi->section_is_low_motion = 0;
-
-        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
-            cpi->section_is_fast_motion = 1;
-        else
-            cpi->section_is_fast_motion = 0;
-
-        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+         cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
 
         Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
         // if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 93e13d139..00ecf97a6 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -563,6 +563,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
     int Speed = cpi->Speed;
     int i;
     VP8_COMMON *cm = &cpi->common;
+    int last_improved_quant = sf->improved_quant;
 
     // Initialise default mode frequency sampling variables
     for (i = 0; i < MAX_MODES; i ++)
@@ -1262,6 +1263,8 @@ void vp8_set_speed_features(VP8_COMP *cpi)
     {
         cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
     }
+    if (cpi->sf.improved_quant != last_improved_quant)
+        vp8cx_init_quantizer(cpi);
 
 #if CONFIG_RUNTIME_CPU_DETECT
     cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
@@ -5224,7 +5227,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
     return 0;
 }
 
-int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
+int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags)
 {
     VP8_COMP *cpi = (VP8_COMP *) comp;
 
@@ -5234,7 +5237,7 @@ int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int debloc
     {
         int ret;
 #if CONFIG_POSTPROC
-        ret = vp8_post_proc_frame(&cpi->common, dest, deblock_level, noise_level, flags);
+        ret = vp8_post_proc_frame(&cpi->common, dest, flags);
 #else
 
         if (cpi->common.frame_to_show)
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index be5b00de8..a9eedf399 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -614,9 +614,6 @@ typedef struct
     unsigned int tempdata2;
 
     int base_skip_false_prob[128];
-    unsigned int section_is_low_motion;
-    unsigned int section_benefits_from_aggresive_q;
-    unsigned int section_is_fast_motion;
     unsigned int section_intra_rating;
 
     double section_max_qfactor;
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index c2c0351c0..a1be6614b 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -17,7 +17,8 @@
 #include "predictdc.h"
 
 #define EXACT_QUANT
-#ifdef EXACT_QUANT
+
+#ifdef EXACT_FASTQUANT
 void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 {
     int i, rc, eob;
@@ -64,6 +65,45 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
     d->eob = eob + 1;
 }
 
+#else
+// Variant compiled when EXACT_FASTQUANT is not defined: one multiply and a 16-bit shift per coefficient.
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+    int i, rc, eob;
+    int zbin;                                       // NOTE(review): never referenced in this function
+    int x, y, z, sz;
+    short *coeff_ptr   = b->coeff;
+    short *round_ptr   = b->round;
+    short *quant_ptr   = b->quant;
+    short *qcoeff_ptr  = d->qcoeff;
+    short *dqcoeff_ptr = d->dqcoeff;
+    short *dequant_ptr = d->dequant;
+
+    eob = -1;                                       // stays -1 when every coefficient quantizes to zero
+    for (i = 0; i < 16; i++)
+    {
+        rc   = vp8_default_zig_zag1d[i];            // map scan position i to a coefficient index
+        z    = coeff_ptr[rc];
+
+        sz = (z >> 31);                                 // sign of z
+        x  = (z ^ sz) - sz;                             // x = abs(z)
+
+        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc] = x;                          // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+
+        if (y)
+        {
+            eob = i;                                // last nonzero coeffs
+        }
+    }
+    d->eob = eob + 1;                               // last nonzero scan position plus one; 0 when none
+}
+
+#endif
+
+#ifdef EXACT_QUANT
 void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
 {
     int i, rc, eob;
@@ -178,39 +218,6 @@ void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d)
 }
 
 #else
-void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-{
-    int i, rc, eob;
-    int zbin;
-    int x, y, z, sz;
-    short *coeff_ptr   = b->coeff;
-    short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant;
-    short *qcoeff_ptr  = d->qcoeff;
-    short *dqcoeff_ptr = d->dqcoeff;
-    short *dequant_ptr = d->dequant;
-
-    eob = -1;
-    for (i = 0; i < 16; i++)
-    {
-        rc   = vp8_default_zig_zag1d[i];
-        z    = coeff_ptr[rc];
-
-        sz = (z >> 31);                                 // sign of z
-        x  = (z ^ sz) - sz;                             // x = abs(z)
-
-        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc] = x;                          // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
-
-        if (y)
-        {
-            eob = i;                                // last nonzero coeffs
-        }
-    }
-    d->eob = eob + 1;
-}
 
 void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
 {
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
index 5acaca875..f07b030bd 100644
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -11,511 +11,231 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-section .text
-    global sym(vp8_short_fdct4x4_mmx)
-    global sym(vp8_short_fdct8x4_wmt)
-
-
-%define         DCTCONSTANTSBITS         (16)
-%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
-%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<15)
-%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<15)
-%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<15)
-
-
 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_mmx)
 sym(vp8_short_fdct4x4_mmx):
     push        rbp
-    mov         rbp, rsp
+    mov         rbp,        rsp
     SHADOW_ARGS_TO_STACK 3
     GET_GOT     rbx
-    push rsi
-    push rdi
+    push        rsi
+    push        rdi
     ; end prolog
-        mov     rsi,    arg(0) ;input
-        mov     rdi,    arg(1) ;output
-
-        lea     rdx,    [GLOBAL(dct_const_mmx)]
-        movsxd  rax,    dword ptr arg(2) ;pitch
-
-        lea     rcx,    [rsi + rax*2]
-        ; read the input data
-        movq    mm0,    [rsi]
-        movq    mm1,    [rsi + rax    ]
-
-        movq    mm2,    [rcx]
-        movq    mm3,    [rcx + rax]
-        ; get the constants
-        ;shift to left by 1 for prescision
-        psllw   mm0,    3
-        psllw   mm1,    3
-
-        psllw   mm2,    3
-        psllw   mm3,    3
-
-        ; transpose for the second stage
-        movq    mm4,    mm0         ; 00 01 02 03
-        movq    mm5,    mm2         ; 10 11 12 03
-
-        punpcklwd   mm0,    mm1     ; 00 10 01 11
-        punpckhwd   mm4,    mm1     ; 02 12 03 13
-
-        punpcklwd   mm2,    mm3     ; 20 30 21 31
-        punpckhwd   mm5,    mm3     ; 22 32 23 33
-
-
-        movq        mm1,    mm0     ; 00 10 01 11
-        punpckldq   mm0,    mm2     ; 00 10 20 30
-
-        punpckhdq   mm1,    mm2     ; 01 11 21 31
-
-        movq        mm2,    mm4     ; 02 12 03 13
-        punpckldq   mm2,    mm5     ; 02 12 22 32
-
-        punpckhdq   mm4,    mm5     ; 03 13 23 33
-        movq        mm3,    mm4
-
-
-        ; first stage
-        movq    mm5,    mm0
-        movq    mm4,    mm1
-
-        paddw   mm0,    mm3         ; a = 0 + 3
-        paddw   mm1,    mm2         ; b = 1 + 2
-
-        psubw   mm4,    mm2         ; c = 1 - 2
-        psubw   mm5,    mm3         ; d = 0 - 3
-
-
-        ; output 0 and 2
-        movq    mm6,    [rdx +  16] ; c2
-        movq    mm2,    mm0         ; a
-
-        paddw   mm0,    mm1         ; a + b
-        psubw   mm2,    mm1         ; a - b
-
-        movq    mm1,    mm0         ; a + b
-        pmulhw  mm0,    mm6         ; 00 01 02 03
-
-        paddw   mm0,    mm1         ; output 00 01 02 03
-        pmulhw  mm6,    mm2         ; 20 21 22 23
-
-        paddw   mm2,    mm6         ; output 20 21 22 23
-
-        ; output 1 and 3
-        movq    mm6,    [rdx +  8]  ; c1
-        movq    mm7,    [rdx + 24]  ; c3
-
-        movq    mm1,    mm4         ; c
-        movq    mm3,    mm5         ; d
-
-        pmulhw  mm1,    mm7         ; c * c3
-        pmulhw  mm3,    mm6         ; d * c1
-
-        paddw   mm3,    mm5         ; d * c1 rounded
-        paddw   mm1,    mm3         ; output 10 11 12 13
-
-        movq    mm3,    mm4         ; c
-        pmulhw  mm5,    mm7         ; d * c3
-
-        pmulhw  mm4,    mm6         ; c * c1
-        paddw   mm3,    mm4         ; round c* c1
-
-        psubw   mm5,    mm3         ; output 30 31 32 33
-        movq    mm3,    mm5
-
-
-        ; done with vertical
-        ; transpose for the second stage
-        movq    mm4,    mm0         ; 00 01 02 03
-        movq    mm5,    mm2         ; 10 11 12 03
-
-        punpcklwd   mm0,    mm1     ; 00 10 01 11
-        punpckhwd   mm4,    mm1     ; 02 12 03 13
-
-        punpcklwd   mm2,    mm3     ; 20 30 21 31
-        punpckhwd   mm5,    mm3     ; 22 32 23 33
-
-
-        movq        mm1,    mm0     ; 00 10 01 11
-        punpckldq   mm0,    mm2     ; 00 10 20 30
-
-        punpckhdq   mm1,    mm2     ; 01 11 21 31
-
-        movq        mm2,    mm4     ; 02 12 03 13
-        punpckldq   mm2,    mm5     ; 02 12 22 32
-
-        punpckhdq   mm4,    mm5     ; 03 13 23 33
-        movq        mm3,    mm4
-
-
-        ; first stage
-        movq    mm5,    mm0
-        movq    mm4,    mm1
 
-        paddw   mm0,    mm3         ; a = 0 + 3
-        paddw   mm1,    mm2         ; b = 1 + 2
+        mov         rsi,        arg(0)      ; input
+        mov         rdi,        arg(1)      ; output
 
-        psubw   mm4,    mm2         ; c = 1 - 2
-        psubw   mm5,    mm3         ; d = 0 - 3
+        movsxd      rax,        dword ptr arg(2) ;pitch
 
-
-        ; output 0 and 2
-        movq    mm6,    [rdx +  16] ; c2
-        movq    mm2,    mm0         ; a
-        paddw   mm0,    mm1         ; a + b
-
-        psubw   mm2,    mm1         ; a - b
-
-        movq    mm1,    mm0         ; a + b
-        pmulhw  mm0,    mm6         ; 00 01 02 03
-
-        paddw   mm0,    mm1         ; output 00 01 02 03
-        pmulhw  mm6,    mm2         ; 20 21 22 23
-
-        paddw   mm2,    mm6         ; output 20 21 22 23
-
-
-        ; output 1 and 3
-        movq    mm6,    [rdx +  8]  ; c1
-        movq    mm7,    [rdx + 24]  ; c3
-
-        movq    mm1,    mm4         ; c
-        movq    mm3,    mm5         ; d
-
-        pmulhw  mm1,    mm7         ; c * c3
-        pmulhw  mm3,    mm6         ; d * c1
-
-        paddw   mm3,    mm5         ; d * c1 rounded
-        paddw   mm1,    mm3         ; output 10 11 12 13
-
-        movq    mm3,    mm4         ; c
-        pmulhw  mm5,    mm7         ; d * c3
-
-        pmulhw  mm4,    mm6         ; c * c1
-        paddw   mm3,    mm4         ; round c* c1
-
-        psubw   mm5,    mm3         ; output 30 31 32 33
-        movq    mm3,    mm5
-        ; done with vertical
-
-        pcmpeqw mm4,    mm4
-        pcmpeqw mm5,    mm5
-        psrlw   mm4,    15
-        psrlw   mm5,    15
-
-        psllw   mm4,    2
-        psllw   mm5,    2
-
-        paddw   mm0,    mm4
-        paddw   mm1,    mm5
-        paddw   mm2,    mm4
-        paddw   mm3,    mm5
-
-        psraw   mm0, 3
-        psraw   mm1, 3
-        psraw   mm2, 3
-        psraw   mm3, 3
-
-        movq        [rdi   ],   mm0
-        movq        [rdi+ 8],   mm1
-        movq        [rdi+16],   mm2
-        movq        [rdi+24],   mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_short_fdct8x4_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-        mov         rsi,    arg(0) ;input
-        mov         rdi,    arg(1) ;output
-
-        lea         rdx,    [GLOBAL(dct_const_xmm)]
-        movsxd      rax,    dword ptr arg(2) ;pitch
-
-        lea         rcx,    [rsi + rax*2]
+        lea         rcx,        [rsi + rax*2]
         ; read the input data
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm2,       [rsi + rax]
-
-        movdqa      xmm4,       [rcx]
-        movdqa      xmm3,       [rcx + rax]
-        ; get the constants
-        ;shift to left by 1 for prescision
-        psllw       xmm0,        3
-        psllw       xmm2,        3
-
-        psllw       xmm4,        3
-        psllw       xmm3,        3
-
-        ; transpose for the second stage
-        movdqa      xmm1,       xmm0         ; 00 01 02 03 04 05 06 07
-        movdqa      xmm5,       xmm4         ; 20 21 22 23 24 25 26 27
-
-        punpcklwd   xmm0,       xmm2         ; 00 10 01 11 02 12 03 13
-        punpckhwd   xmm1,       xmm2         ; 04 14 05 15 06 16 07 17
+        movq        mm0,        [rsi]
+        movq        mm1,        [rsi + rax]
 
-        punpcklwd   xmm4,       xmm3         ; 20 30 21 31 22 32 23 33
-        punpckhwd   xmm5,       xmm3         ; 24 34 25 35 26 36 27 37
+        movq        mm2,        [rcx]
+        movq        mm4,        [rcx + rax]
 
-        movdqa      xmm2,       xmm0         ; 00 10 01 11 02 12 03 13
-        punpckldq   xmm0,       xmm4         ; 00 10 20 30 01 11 21 31
+        ; transpose for the first stage
+        movq        mm3,        mm0         ; 00 01 02 03
+        movq        mm5,        mm2         ; 20 21 22 23
 
-        punpckhdq   xmm2,       xmm4         ; 02 12 22 32 03 13 23 33
+        punpcklwd   mm0,        mm1         ; 00 10 01 11
+        punpckhwd   mm3,        mm1         ; 02 12 03 13
 
+        punpcklwd   mm2,        mm4         ; 20 30 21 31
+        punpckhwd   mm5,        mm4         ; 22 32 23 33
 
-        movdqa      xmm4,       xmm1         ; 04 14 05 15 06 16 07 17
-        punpckldq   xmm4,       xmm5         ; 04 14 24 34 05 15 25 35
+        movq        mm1,        mm0         ; 00 10 01 11
+        punpckldq   mm0,        mm2         ; 00 10 20 30
 
-        punpckhdq   xmm1,       xmm5         ; 06 16 26 36 07 17 27 37
-        movdqa      xmm3,       xmm2         ; 02 12 22 32 03 13 23 33
+        punpckhdq   mm1,        mm2         ; 01 11 21 31
 
-        punpckhqdq  xmm3,       xmm1         ; 03 13 23 33 07 17 27 37
-        punpcklqdq  xmm2,       xmm1         ; 02 12 22 32 06 16 26 36
+        movq        mm2,        mm3         ; 02 12 03 13
+        punpckldq   mm2,        mm5         ; 02 12 22 32
 
-        movdqa      xmm1,       xmm0         ; 00 10 20 30 01 11 21 31
-        punpcklqdq  xmm0,       xmm4         ; 00 10 20 30 04 14 24 34
+        punpckhdq   mm3,        mm5         ; 03 13 23 33
 
-        punpckhqdq  xmm1,       xmm4         ; 01 11 21 32 05 15 25 35
-
-        ; xmm0 0
-        ; xmm1 1
-        ; xmm2 2
-        ; xmm3 3
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm3 3
 
         ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3         ; a = 0 + 3
-        paddw       xmm1,       xmm2         ; b = 1 + 2
-
-        psubw       xmm4,       xmm2         ; c = 1 - 2
-        psubw       xmm5,       xmm3         ; d = 0 - 3
+        movq        mm5,        mm0
+        movq        mm4,        mm1
 
+        paddw       mm0,        mm3         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
 
-        ; output 0 and 2
-        movdqa      xmm6,       [rdx +  32] ; c2
-        movdqa      xmm2,       xmm0         ; a
+        psubw       mm4,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm3         ; d1 = 0 - 3
 
-        paddw       xmm0,       xmm1         ; a + b
-        psubw       xmm2,       xmm1         ; a - b
+        psllw       mm5,        3
+        psllw       mm4,        3
 
-        movdqa      xmm1,       xmm0         ; a + b
-        pmulhw      xmm0,       xmm6         ; 00 01 02 03
+        psllw       mm0,        3
+        psllw       mm1,        3
 
-        paddw       xmm0,       xmm1         ; output 00 01 02 03
-        pmulhw      xmm6,       xmm2         ; 20 21 22 23
+        ; output 0 and 2
+        movq        mm2,        mm0         ; a1
 
-        paddw       xmm2,       xmm6         ; output 20 21 22 23
+        paddw       mm0,        mm1         ; op[0] = a1 + b1
+        psubw       mm2,        mm1         ; op[2] = a1 - b1
 
         ; output 1 and 3
-        movdqa      xmm6,       [rdx + 16]  ; c1
-        movdqa      xmm7,       [rdx + 48]  ; c3
-
-        movdqa      xmm1,       xmm4         ; c
-        movdqa      xmm3,       xmm5         ; d
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm4         ; c1 d1
+        punpckhwd   mm5,        mm4         ; c1 d1
 
-        pmulhw      xmm1,       xmm7         ; c * c3
-        pmulhw      xmm3,       xmm6         ; d * c1
+        movq        mm3,        mm1
+        movq        mm4,        mm5
 
-        paddw       xmm3,       xmm5         ; d * c1 rounded
-        paddw       xmm1,       xmm3         ; output 10 11 12 13
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
 
-        movdqa      xmm3,       xmm4         ; c
-        pmulhw      xmm5,       xmm7         ; d * c3
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
 
-        pmulhw      xmm4,       xmm6         ; c * c1
-        paddw       xmm3,       xmm4         ; round c* c1
+        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
 
-        psubw       xmm5,       xmm3         ; output 30 31 32 33
-        movdqa      xmm3,       xmm5
+        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
 
+        packssdw    mm1,        mm4         ; op[1]
+        packssdw    mm3,        mm5         ; op[3]
 
         ; done with vertical
         ; transpose for the second stage
-        movdqa      xmm4,       xmm2         ; 02 12 22 32 06 16 26 36
-        movdqa      xmm2,       xmm1         ; 01 11 21 31 05 15 25 35
+        movq        mm4,        mm0         ; 00 10 20 30
+        movq        mm5,        mm2         ; 02 12 22 32
 
-        movdqa      xmm1,       xmm0         ; 00 10 20 30 04 14 24 34
-        movdqa      xmm5,       xmm4         ; 02 12 22 32 06 16 26 36
+        punpcklwd   mm0,        mm1         ; 00 01 10 11
+        punpckhwd   mm4,        mm1         ; 20 21 30 31
 
-        punpcklwd   xmm0,       xmm2         ; 00 01 10 11 20 21 30 31
-        punpckhwd   xmm1,       xmm2         ; 04 05 14 15 24 25 34 35
+        punpcklwd   mm2,        mm3         ; 02 03 12 13
+        punpckhwd   mm5,        mm3         ; 22 23 32 33
 
-        punpcklwd   xmm4,       xmm3         ; 02 03 12 13 22 23 32 33
-        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+        movq        mm1,        mm0         ; 00 01 10 11
+        punpckldq   mm0,        mm2         ; 00 01 02 03
 
-        movdqa      xmm2,       xmm0         ; 00 01 10 11 20 21 30 31
-        punpckldq   xmm0,       xmm4         ; 00 01 02 03 10 11 12 13
+        punpckhdq   mm1,        mm2         ; 10 11 12 13
 
-        punpckhdq   xmm2,       xmm4         ; 20 21 22 23 30 31 32 33
+        movq        mm2,        mm4         ; 20 21 30 31
+        punpckldq   mm2,        mm5         ; 20 21 22 23
 
+        punpckhdq   mm4,        mm5         ; 30 31 32 33
 
-        movdqa      xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
-        punpckldq   xmm4,       xmm5         ; 04 05 06 07 14 15 16 17
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm4 3
 
-        punpckhdq   xmm1,       xmm5         ; 24 25 26 27 34 35 36 37
-        movdqa      xmm3,       xmm2         ; 20 21 22 23 30 31 32 33
+        movq        mm5,        mm0
+        movq        mm3,        mm1
 
-        punpckhqdq  xmm3,       xmm1         ; 30 31 32 33 34 35 36 37
-        punpcklqdq  xmm2,       xmm1         ; 20 21 22 23 24 25 26 27
+        paddw       mm0,        mm4         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
 
-        movdqa      xmm1,       xmm0         ; 00 01 02 03 10 11 12 13
-        punpcklqdq  xmm0,       xmm4         ; 00 01 02 03 04 05 06 07
+        psubw       mm3,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm4         ; d1 = 0 - 3
 
-        punpckhqdq  xmm1,       xmm4         ; 10 11 12 13 14 15 16 17
+        pxor        mm6,        mm6         ; zero out for compare
 
-        ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3         ; a = 0 + 3
-        paddw       xmm1,       xmm2         ; b = 1 + 2
-
-        psubw       xmm4,       xmm2         ; c = 1 - 2
-        psubw       xmm5,       xmm3         ; d = 0 - 3
+        pcmpeqw     mm6,        mm5         ; mm6 = (d1 == 0) ? 0xffff : 0
 
+        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; invert and mask with 1:
+                                                                ; mm6 = (d1 != 0) per word
 
         ; output 0 and 2
-        movdqa      xmm6,       [rdx +  32] ; c2
-        movdqa      xmm2,       xmm0         ; a
+        movq        mm2,        mm0         ; a1
 
-        paddw       xmm0,       xmm1         ; a + b
-        psubw       xmm2,       xmm1         ; a - b
+        paddw       mm0,        mm1         ; a1 + b1
+        psubw       mm2,        mm1         ; a1 - b1
 
-        movdqa      xmm1,       xmm0         ; a + b
-        pmulhw      xmm0,       xmm6         ; 00 01 02 03
+        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
+        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
 
-        paddw       xmm0,       xmm1         ; output 00 01 02 03
-        pmulhw      xmm6,       xmm2         ; 20 21 22 23
+        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
 
-        paddw       xmm2,       xmm6         ; output 20 21 22 23
+        movq        MMWORD PTR[rdi + 0 ],  mm0
+        movq        MMWORD PTR[rdi + 16],  mm2
 
         ; output 1 and 3
-        movdqa      xmm6,       [rdx + 16]  ; c1
-        movdqa      xmm7,       [rdx + 48]  ; c3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm3         ; c1 d1
+        punpckhwd   mm5,        mm3         ; c1 d1
 
-        movdqa      xmm1,       xmm4         ; c
-        movdqa      xmm3,       xmm5         ; d
+        movq        mm3,        mm1
+        movq        mm4,        mm5
 
-        pmulhw      xmm1,       xmm7         ; c * c3
-        pmulhw      xmm3,       xmm6         ; d * c1
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
 
-        paddw       xmm3,       xmm5         ; d * c1 rounded
-        paddw       xmm1,       xmm3         ; output 10 11 12 13
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
 
-        movdqa      xmm3,       xmm4         ; c
-        pmulhw      xmm5,       xmm7         ; d * c3
+        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
 
-        pmulhw      xmm4,       xmm6         ; c * c1
-        paddw       xmm3,       xmm4         ; round c* c1
+        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
 
-        psubw       xmm5,       xmm3         ; output 30 31 32 33
-        movdqa      xmm3,       xmm5
-        ; done with vertical
+        packssdw    mm1,        mm4         ; op[4]
+        packssdw    mm3,        mm5         ; op[12]
+
+        paddw       mm1,        mm6         ; op[4] += (d1!=0)
 
+        movq        MMWORD PTR[rdi + 8 ],  mm1
+        movq        MMWORD PTR[rdi + 24],  mm3
 
-        pcmpeqw     xmm4,       xmm4
-        pcmpeqw     xmm5,       xmm5;
-        psrlw       xmm4,       15
-        psrlw       xmm5,       15
-
-        psllw       xmm4,       2
-        psllw       xmm5,       2
-
-        paddw       xmm0,       xmm4
-        paddw       xmm1,       xmm5
-        paddw       xmm2,       xmm4
-        paddw       xmm3,       xmm5
-
-        psraw       xmm0,       3
-        psraw       xmm1,       3
-        psraw       xmm2,       3
-        psraw       xmm3,       3
-
-        movq        QWORD PTR[rdi   ],   xmm0
-        movq        QWORD PTR[rdi+ 8],   xmm1
-        movq        QWORD PTR[rdi+16],   xmm2
-        movq        QWORD PTR[rdi+24],   xmm3
-
-        psrldq      xmm0,       8
-        psrldq      xmm1,       8
-        psrldq      xmm2,       8
-        psrldq      xmm3,       8
-
-        movq        QWORD PTR[rdi+32],   xmm0
-        movq        QWORD PTR[rdi+40],   xmm1
-        movq        QWORD PTR[rdi+48],   xmm2
-        movq        QWORD PTR[rdi+56],   xmm3
-    ; begin epilog
-    pop rdi
-    pop rsi
+     ; begin epilog
+    pop         rdi
+    pop         rsi
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret
 
-
 SECTION_RODATA
-;static const unsigned int dct1st_stage_rounding_mmx[2] =
-align 16
-dct1st_stage_rounding_mmx:
-    times 2 dd 8192
-
-
-;static const unsigned int dct2nd_stage_rounding_mmx[2] =
-align 16
-dct2nd_stage_rounding_mmx:
-    times 2 dd 32768
-
-
-;static const short dct_matrix[4][4]=
-align 16
-dct_matrix:
-    times 4 dw 23170
-
-    dw  30274
-    dw  12540
-    dw -12540
-    dw -30274
-
-    dw 23170
-    times 2 dw -23170
-    dw 23170
-
-    dw  12540
-    dw -30274
-    dw  30274
-    dw -12540
-
-
-;static const unsigned short dct_const_mmx[4 * 4]=
-align 16
-dct_const_mmx:
-    times 4 dw 0
-    times 4 dw 60547
-    times 4 dw 46341
-    times 4 dw 25080
-
-
-;static const unsigned short dct_const_xmm[8 * 4]=
-align 16
-dct_const_xmm:
-    times 8 dw 0
-    times 8 dw 60547
-    times 8 dw 46341
-    times 8 dw 25080
+align 8
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+align 8
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+align 8
+_cmp_mask:
+    times 4 dw 1
+align 8
+_7w:
+    times 4 dw 7
+align 8
+_14500:
+    times 2 dd 14500
+align 8
+_7500:
+    times 2 dd 7500
+align 8
+_12000:
+    times 2 dd 12000
+align 8
+_51000:
+    times 2 dd 51000
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 723a78d76..652dd9804 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -11,32 +11,68 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_sse2)
-sym(vp8_short_fdct4x4_sse2):
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+  %define       input       rsi
+  %define       output      rdi
+  %define       pitch       rax
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-;;    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog
 
     mov         rsi, arg(0)
-    movsxd      rax, DWORD PTR arg(2)
-    lea         rdi, [rsi + rax*2]
+    mov         rdi, arg(1)
+
+    movsxd      rax, dword ptr arg(2)
+    lea         rcx, [rsi + rax*2]
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    %define     input       rcx
+    %define     output      rdx
+    %define     pitch       r8
+  %else
+    %define     input       rdi
+    %define     output      rsi
+    %define     pitch       rdx
+  %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY 0
+  %define     input
+  %define     output
+  %define     pitch
+
+%if ABI_IS_32BIT
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+  %endif
+%endif
+    ret
+%endmacro
+
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
 
-    movq        xmm0, MMWORD PTR[rsi   ]        ;03 02 01 00
-    movq        xmm2, MMWORD PTR[rsi + rax]     ;13 12 11 10
-    movq        xmm1, MMWORD PTR[rsi + rax*2]   ;23 22 21 20
-    movq        xmm3, MMWORD PTR[rdi + rax]     ;33 32 31 30
+    STACK_FRAME_CREATE
+
+    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
+    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
+    lea         input,          [input+2*pitch]
+    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
+    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
 
     punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
     punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
 
-    mov         rdi, arg(1)
-
     movdqa      xmm2, xmm0
     punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
     punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
@@ -51,6 +87,7 @@ sym(vp8_short_fdct4x4_sse2):
     psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
     psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
     psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
+
     movdqa      xmm1, xmm0
     pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
     pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
@@ -121,17 +158,216 @@ sym(vp8_short_fdct4x4_sse2):
     punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
     punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
 
-    movdqa      XMMWORD PTR[rdi + 0], xmm0
-    movdqa      XMMWORD PTR[rdi + 16], xmm1
+    movdqa      XMMWORD PTR[output +  0], xmm0
+    movdqa      XMMWORD PTR[output + 16], xmm1
 
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-;;    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
+    STACK_FRAME_DESTROY
+
+;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct8x4_sse2)
+sym(vp8_short_fdct8x4_sse2):
+
+    STACK_FRAME_CREATE
+
+        ; read the input data
+        movdqa      xmm0,       [input        ]
+        movdqa      xmm2,       [input+  pitch]
+        lea         input,      [input+2*pitch]
+        movdqa      xmm4,       [input        ]
+        movdqa      xmm3,       [input+  pitch]
+
+        ; transpose for the first stage
+        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
+        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
+
+        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
+        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
+
+        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
+        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
+
+        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
+        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
+
+        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
+
+        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
+        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
+
+        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
+        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
+
+        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
+        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
+
+        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
+        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
+
+        ; xmm0 0
+        ; xmm1 1
+        ; xmm2 2
+        ; xmm3 3
+
+        ; first stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm2        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        psllw       xmm5,        3
+        psllw       xmm4,        3
+
+        psllw       xmm0,        3
+        psllw       xmm1,        3
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
+        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
+
+        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+
+        packssdw    xmm1,       xmm4        ; op[1]
+        packssdw    xmm3,       xmm5        ; op[3]
+
+        ; done with vertical
+        ; transpose for the second stage
+        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
+        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
+
+        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
+        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
+
+        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
+        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+
+        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
+        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
+
+        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
+
+        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
+        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
+
+        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
+        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
+
+        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
+        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
+
+        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
+        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
+
+        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
+
+        ; xmm0 0
+        ; xmm4 1
+        ; xmm1 2
+        ; xmm3 3
+
+        movdqa      xmm5,       xmm0
+        movdqa      xmm2,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm4        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        pxor        xmm6,       xmm6        ; zero out for compare
+
+        pcmpeqw     xmm6,       xmm5        ; xmm6 = (d1 == 0) ? 0xffff : 0
+
+        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; invert and mask with 1:
+                                                                    ; xmm6 = (d1 != 0) per word
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; a1 + b1
+        psubw       xmm2,       xmm1        ; a1 - b1
+
+        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
+        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
+
+        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
+
+        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+
+        packssdw    xmm1,       xmm4        ; op[4]
+        packssdw    xmm3,       xmm5        ; op[12]
+
+        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
+
+        movdqa      xmm4,       xmm0
+        movdqa      xmm5,       xmm2
+
+        punpcklqdq  xmm0,       xmm1
+        punpckhqdq  xmm4,       xmm1
+
+        punpcklqdq  xmm2,       xmm3
+        punpckhqdq  xmm5,       xmm3
+
+        movdqa      XMMWORD PTR[output + 0 ],  xmm0
+        movdqa      XMMWORD PTR[output + 16],  xmm2
+        movdqa      XMMWORD PTR[output + 32],  xmm4
+        movdqa      XMMWORD PTR[output + 48],  xmm5
+
+    STACK_FRAME_DESTROY
 
 SECTION_RODATA
 align 16
@@ -161,7 +397,9 @@ align 16
 _cmp_mask:
     times 4 dw 1
     times 4 dw 0
-
+align 16
+_cmp_mask8x4:
+    times 8 dw 1
 align 16
 _mult_sub:
     dw 1
@@ -176,6 +414,9 @@ align 16
 _7:
     times 4 dd 7
 align 16
+_7w:
+    times 8 dw 7
+align 16
 _14500:
     times 4 dd 14500
 align 16
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
index 05824c684..59a5cb1d7 100644
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -24,33 +24,31 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx);
 extern prototype_fdct(vp8_short_fdct8x4_mmx);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
-#if 0
+
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
 
 #undef  vp8_fdct_short8x4
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
-#endif
 
 #endif
+
 #endif
 
 
 #if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct8x4_wmt);
+extern prototype_fdct(vp8_short_fdct8x4_sse2);
 extern prototype_fdct(vp8_short_walsh4x4_sse2);
 
 extern prototype_fdct(vp8_short_fdct4x4_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
-#if 1
-/* short SSE2 DCT currently disabled, does not match the MMX version */
+
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
 
 #undef  vp8_fdct_short8x4
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
-#endif
 
 #undef  vp8_fdct_fast4x4
 #define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
@@ -58,7 +56,7 @@ extern prototype_fdct(vp8_short_fdct4x4_sse2);
 #undef  vp8_fdct_fast8x4
 #define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
 
-#undef vp8_fdct_walsh_short4x4
+#undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4  vp8_short_walsh4x4_sse2
 
 #endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index fb1b37ccb..781079849 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -18,11 +18,10 @@
 #if HAVE_MMX
 void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
 {
-    vp8_short_fdct4x4_c(input,   output,    pitch);
-    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
+    vp8_short_fdct4x4_mmx(input,   output,    pitch);
+    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
 }
 
-
 int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
@@ -82,12 +81,6 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
 #endif
 
 #if HAVE_SSE2
-void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
-{
-    vp8_short_fdct4x4_sse2(input,   output,    pitch);
-    vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
 int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
@@ -249,18 +242,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.get8x8var             = vp8_get8x8var_mmx;
         cpi->rtcd.variance.get16x16var           = vp8_get16x16var_mmx;
         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;
-#if 0 // new fdct
+
         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;
         cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_mmx;
         cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_mmx;
         cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_mmx;
-#else
-        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
-        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_c;
-
-#endif
 
         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;
 
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 8e50b7f1b..6a2872031 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -860,8 +860,16 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
 {
 
     YV12_BUFFER_CONFIG sd;
+    vp8_ppflags_t flags = {0};
 
-    if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, ctx->preview_ppcfg.deblocking_level, ctx->preview_ppcfg.noise_level, ctx->preview_ppcfg.post_proc_flag))
+    if (ctx->preview_ppcfg.post_proc_flag)
+    {
+        flags.post_proc_flag        = ctx->preview_ppcfg.post_proc_flag;
+        flags.deblocking_level      = ctx->preview_ppcfg.deblocking_level;
+        flags.noise_level           = ctx->preview_ppcfg.noise_level;
+    }
+
+    if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags))
     {
 
         /*
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 9964124d1..9dd492217 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -65,12 +65,19 @@ struct vpx_codec_alg_priv
     vpx_codec_priv_t        base;
     vpx_codec_mmap_t        mmaps[NELEMENTS(vp8_mem_req_segs)-1];
     vpx_codec_dec_cfg_t     cfg;
-    vp8_stream_info_t   si;
+    vp8_stream_info_t       si;
     int                     defer_alloc;
     int                     decoder_init;
     VP8D_PTR                pbi;
     int                     postproc_cfg_set;
     vp8_postproc_cfg_t      postproc_cfg;
+#if CONFIG_POSTPROC_VISUALIZER
+    unsigned int            dbg_postproc_flag;
+    int                     dbg_color_ref_frame_flag;
+    int                     dbg_color_mb_modes_flag;
+    int                     dbg_color_b_modes_flag;
+    int                     dbg_display_mv_flag;
+#endif
     vpx_image_t             img;
     int                     img_setup;
     int                     img_avail;
@@ -416,15 +423,27 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
     {
         YV12_BUFFER_CONFIG sd;
         INT64 time_stamp = 0, time_end_stamp = 0;
-        int ppflag       = 0;
-        int ppdeblocking = 0;
-        int ppnoise      = 0;
+        vp8_ppflags_t flags = {0};
 
         if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
         {
-            ppflag      = ctx->postproc_cfg.post_proc_flag;
-            ppdeblocking = ctx->postproc_cfg.deblocking_level;
-            ppnoise     = ctx->postproc_cfg.noise_level;
+            flags.post_proc_flag= ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+
+                                | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0)
+                                | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+                                | ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+                                | ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0)
+#endif
+                                ;
+            flags.deblocking_level      = ctx->postproc_cfg.deblocking_level;
+            flags.noise_level           = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+            flags.display_ref_frame_flag= ctx->dbg_color_ref_frame_flag;
+            flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+            flags.display_b_modes_flag  = ctx->dbg_color_b_modes_flag;
+            flags.display_mv_flag       = ctx->dbg_display_mv_flag;
+#endif
         }
 
         if (vp8dx_receive_compressed_data(ctx->pbi, data_sz, data, deadline))
@@ -433,7 +452,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
             res = update_error_state(ctx, &pbi->common.error);
         }
 
-        if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, ppdeblocking, ppnoise, ppflag))
+        if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags))
         {
             /* Align width/height */
             unsigned int a_w = (sd.y_width + 15) & ~15;
@@ -646,12 +665,38 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
 #endif
 }
 
+/* Control handler for the VP8_SET_DBG_* ids: stores the int argument in the matching ctx->dbg_* field. */
+static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+                                        int ctrl_id,
+                                        va_list args)
+{
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+    int data = va_arg(args, int);           /* every debug control carries a single int payload */
+#define MAP(id, var) case id: var = data; break
+    switch (ctrl_id)
+    {
+        MAP (VP8_SET_DBG_COLOR_REF_FRAME,   ctx->dbg_color_ref_frame_flag);
+        MAP (VP8_SET_DBG_COLOR_MB_MODES,    ctx->dbg_color_mb_modes_flag);
+        MAP (VP8_SET_DBG_COLOR_B_MODES,     ctx->dbg_color_b_modes_flag);
+        MAP (VP8_SET_DBG_DISPLAY_MV,        ctx->dbg_display_mv_flag);
+        default: break;                     /* unrecognized ids store nothing; VPX_CODEC_OK is still returned */
+    }
+#undef MAP                                  /* keep the helper macro local to this function */
+    return VPX_CODEC_OK;
+#else
+    return VPX_CODEC_INCAPABLE;             /* built without CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC */
+#endif
+}
 
 vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
 {
-    {VP8_SET_REFERENCE,  vp8_set_reference},
-    {VP8_COPY_REFERENCE, vp8_get_reference},
-    {VP8_SET_POSTPROC,   vp8_set_postproc},
+    {VP8_SET_REFERENCE,             vp8_set_reference},
+    {VP8_COPY_REFERENCE,            vp8_get_reference},
+    {VP8_SET_POSTPROC,              vp8_set_postproc},
+    {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_options},
+    {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_options},
+    {VP8_SET_DBG_COLOR_B_MODES,     vp8_set_dbg_options},
+    {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},
     { -1, NULL},
 };
 
diff --git a/vpx/vp8.h b/vpx/vp8.h
index d7ed8d8c1..32c01325f 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -38,9 +38,13 @@
  */
 enum vp8_dec_control_id
 {
-    VP8_SET_REFERENCE       = 1,    /**< pass in an external frame into decoder to be used as reference frame */
-    VP8_COPY_REFERENCE      = 2,    /**< get a copy of reference frame from the decoder */
-    VP8_SET_POSTPROC        = 3,    /**< set decoder's the post processing settings  */
+    VP8_SET_REFERENCE           = 1,    /**< pass in an external frame into decoder to be used as reference frame */
+    VP8_COPY_REFERENCE          = 2,    /**< get a copy of reference frame from the decoder */
+    VP8_SET_POSTPROC            = 3,    /**< set the decoder's post processing settings  */
+    VP8_SET_DBG_COLOR_REF_FRAME = 4,    /**< set the reference frames to color for each macroblock */
+    VP8_SET_DBG_COLOR_MB_MODES  = 5,    /**< set which macro block modes to color */
+    VP8_SET_DBG_COLOR_B_MODES   = 6,    /**< set which blocks modes to color */
+    VP8_SET_DBG_DISPLAY_MV      = 7,    /**< set which motion vector modes to draw */
     VP8_COMMON_CTRL_ID_MAX
 };
 
@@ -50,10 +54,14 @@ enum vp8_dec_control_id
  */
 enum vp8_postproc_level
 {
-    VP8_NOFILTERING    = 0,
-    VP8_DEBLOCK        = 1,
-    VP8_DEMACROBLOCK   = 2,
-    VP8_ADDNOISE       = 4
+    VP8_NOFILTERING             = 0,
+    VP8_DEBLOCK                 = 1<<0,
+    VP8_DEMACROBLOCK            = 1<<1,
+    VP8_ADDNOISE                = 1<<2,
+    VP8_DEBUG_TXT_FRAME_INFO    = 1<<3, /**< print frame information */
+    VP8_DEBUG_TXT_MBLK_MODES    = 1<<4, /**< print macro block modes over each macro block */
+    VP8_DEBUG_TXT_DC_DIFF       = 1<<5, /**< print dc diff for each macro block */
+    VP8_DEBUG_TXT_RATE_INFO     = 1<<6  /**< print video rate info (encoder only) */
 };
 
 /*!\brief post process flags
@@ -65,9 +73,9 @@ enum vp8_postproc_level
 
 typedef struct vp8_postproc_cfg
 {
-    int post_proc_flag;           /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
-    int deblocking_level;        /**< the strength of deblocking, valid range [0, 16] */
-    int noise_level;             /**< the strength of additive noise, valid range [0, 16] */
+    int post_proc_flag;         /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
+    int deblocking_level;       /**< the strength of deblocking, valid range [0, 16] */
+    int noise_level;            /**< the strength of additive noise, valid range [0, 16] */
 } vp8_postproc_cfg_t;
 
 /*!\brief reference frame type
@@ -95,12 +103,16 @@ typedef struct vpx_ref_frame
 
 /*!\brief vp8 decoder control funciton parameter type
  *
- * defines the data type for each of VP8 decoder control funciton requires
+ * defines the data type for each of VP8 decoder control function requires
  */
 
 VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE,           vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE,          vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC,            vp8_postproc_cfg_t *)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES,  int)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES,   int)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV,      int)
 
 
 /*! @} - end defgroup vp8 */
diff --git a/vpxdec.c b/vpxdec.c
index 9b565b022..5eb0bebde 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -108,11 +108,19 @@ static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level",
         "Enable VP8 demacroblocking, w/ level");
 static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1,
                                        "Enable VP8 visible debug info");
-
+static const arg_def_t pp_disp_ref_frame = ARG_DEF(NULL, "pp-dbg-ref-frame", 1,
+                                       "Display only selected reference frame per macro block");
+static const arg_def_t pp_disp_mb_modes = ARG_DEF(NULL, "pp-dbg-mb-modes", 1,
+                                       "Display only selected macro block modes");
+static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1,
+                                       "Display only selected block modes");
+static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1,
+                                       "Draw only selected motion vectors");
 
 static const arg_def_t *vp8_pp_args[] =
 {
     &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
+    &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs,
     NULL
 };
 #endif
@@ -705,6 +713,10 @@ int main(int argc, const char **argv_)
     vpx_codec_dec_cfg_t     cfg = {0};
 #if CONFIG_VP8_DECODER
     vp8_postproc_cfg_t      vp8_pp_cfg = {0};
+    int                     vp8_dbg_color_ref_frame = 0;
+    int                     vp8_dbg_color_mb_modes = 0;
+    int                     vp8_dbg_color_b_modes = 0;
+    int                     vp8_dbg_display_mv = 0;
 #endif
     struct input_ctx        input = {0};
 
@@ -790,6 +802,42 @@ int main(int argc, const char **argv_)
             if (level)
                 vp8_pp_cfg.post_proc_flag |= level;
         }
+        else if (arg_match(&arg, &pp_disp_ref_frame, argi))
+        {
+            unsigned int flags = arg_parse_int(&arg);
+            if (flags)
+            {
+                postproc = 1;
+                vp8_dbg_color_ref_frame = flags;
+            }
+        }
+        else if (arg_match(&arg, &pp_disp_mb_modes, argi))
+        {
+            unsigned int flags = arg_parse_int(&arg);
+            if (flags)
+            {
+                postproc = 1;
+                vp8_dbg_color_mb_modes = flags;
+            }
+        }
+        else if (arg_match(&arg, &pp_disp_b_modes, argi))
+        {
+            unsigned int flags = arg_parse_int(&arg);
+            if (flags)
+            {
+                postproc = 1;
+                vp8_dbg_color_b_modes = flags;
+            }
+        }
+        else if (arg_match(&arg, &pp_disp_mvs, argi))
+        {
+            unsigned int flags = arg_parse_int(&arg);
+            if (flags)
+            {
+                postproc = 1;
+                vp8_dbg_display_mv = flags;
+            }
+        }
 
 #endif
         else
@@ -929,6 +977,33 @@ int main(int argc, const char **argv_)
         return EXIT_FAILURE;
     }
 
+    if (vp8_dbg_color_ref_frame
+        && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame))
+    {
+        fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder));
+        return EXIT_FAILURE;
+    }
+
+    if (vp8_dbg_color_mb_modes
+        && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes))
+    {
+        fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder));
+        return EXIT_FAILURE;
+    }
+
+    if (vp8_dbg_color_b_modes
+        && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes))
+    {
+        fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder));
+        return EXIT_FAILURE;
+    }
+
+    if (vp8_dbg_display_mv
+        && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv))
+    {
+        fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder));
+        return EXIT_FAILURE;
+    }
 #endif
 
     /* Decode file */
diff --git a/vpxenc.c b/vpxenc.c
index b139c6829..af9839ce5 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -435,7 +435,7 @@ struct EbmlGlobal
     int debug;
 
     FILE    *stream;
-    uint64_t last_pts_ms;
+    int64_t last_pts_ms;
     vpx_rational_t  framerate;
 
     /* These pointers are to the start of an element */
@@ -648,7 +648,7 @@ write_webm_block(EbmlGlobal                *glob,
     unsigned char  track_number;
     unsigned short block_timecode = 0;
     unsigned char  flags;
-    uint64_t       pts_ms;
+    int64_t        pts_ms;
     int            start_cluster = 0, is_keyframe;
 
     /* Calculate the PTS of this frame in milliseconds */
@@ -1074,6 +1074,7 @@ int main(int argc, const char **argv_)
     int                      psnr_count = 0;
 
     exec_name = argv_[0];
+    ebml.last_pts_ms = -1;
 
     if (argc < 3)
         usage_exit();