-rw-r--r--  build/make/Android.mk | 5
-rwxr-xr-x  build/make/configure.sh | 15
-rwxr-xr-x  build/make/gen_asm_deps.sh | 2
-rwxr-xr-x  build/make/rtcd.sh | 8
-rwxr-xr-x  configure | 8
-rw-r--r--  examples.mk | 8
-rw-r--r--  libs.mk | 11
-rw-r--r--  mainpage.dox | 16
-rw-r--r--  tools_common.c | 10
-rw-r--r--  usage.dox | 10
-rw-r--r--  vp8/common/alloccommon.c | 1
-rw-r--r--  vp8/common/arm/armv6/vp8_sad16x16_armv6.asm (renamed from vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm) | 0
-rw-r--r--  vp8/common/arm/armv6/vp8_variance16x16_armv6.asm (renamed from vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm) | 2
-rw-r--r--  vp8/common/arm/armv6/vp8_variance8x8_armv6.asm (renamed from vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm) | 0
-rw-r--r--  vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm (renamed from vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm) | 2
-rw-r--r--  vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm (renamed from vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm) | 2
-rw-r--r--  vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm (renamed from vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm) | 2
-rw-r--r--  vp8/common/arm/neon/sad16_neon.asm (renamed from vp8/encoder/arm/neon/sad16_neon.asm) | 0
-rw-r--r--  vp8/common/arm/neon/sad8_neon.asm (renamed from vp8/encoder/arm/neon/sad8_neon.asm) | 0
-rw-r--r--  vp8/common/arm/neon/variance_neon.asm (renamed from vp8/encoder/arm/neon/variance_neon.asm) | 22
-rw-r--r--  vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm (renamed from vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm) | 4
-rw-r--r--  vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm (renamed from vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm) | 16
-rw-r--r--  vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm (renamed from vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm) | 4
-rw-r--r--  vp8/common/arm/variance_arm.c (renamed from vp8/encoder/arm/variance_arm.c) | 2
-rw-r--r--  vp8/common/blockd.h | 10
-rw-r--r--  vp8/common/generic/systemdependent.c | 23
-rwxr-xr-x  vp8/common/idctllm_test.cc | 31
-rwxr-xr-x  vp8/common/idctllm_test.h | 113
-rw-r--r--  vp8/common/postproc.c | 11
-rw-r--r--  vp8/common/ppc/sad_altivec.asm (renamed from vp8/encoder/ppc/sad_altivec.asm) | 0
-rw-r--r--  vp8/common/ppc/variance_altivec.asm (renamed from vp8/encoder/ppc/variance_altivec.asm) | 6
-rw-r--r--  vp8/common/ppc/variance_subpixel_altivec.asm (renamed from vp8/encoder/ppc/variance_subpixel_altivec.asm) | 2
-rw-r--r--  vp8/common/reconintra.c | 33
-rw-r--r--  vp8/common/reconintra4x4.c | 46
-rw-r--r--  vp8/common/reconintra4x4.h | 17
-rw-r--r--  vp8/common/rtcd_defs.sh | 8
-rw-r--r--  vp8/common/sad_c.c (renamed from vp8/encoder/sad_c.c) | 55
-rw-r--r--  vp8/common/threading.h | 93
-rw-r--r--  vp8/common/variance.h (renamed from vp8/encoder/variance.h) | 0
-rw-r--r--  vp8/common/variance_c.c (renamed from vp8/encoder/variance_c.c) | 12
-rwxr-xr-x  vp8/common/x86/idctllm_mmx_test.cc | 31
-rw-r--r--  vp8/common/x86/postproc_x86.c | 21
-rw-r--r--  vp8/common/x86/sad_mmx.asm (renamed from vp8/encoder/x86/sad_mmx.asm) | 0
-rw-r--r--  vp8/common/x86/sad_sse2.asm (renamed from vp8/encoder/x86/sad_sse2.asm) | 2
-rw-r--r--  vp8/common/x86/sad_sse3.asm (renamed from vp8/encoder/x86/sad_sse3.asm) | 10
-rw-r--r--  vp8/common/x86/sad_sse4.asm (renamed from vp8/encoder/x86/sad_sse4.asm) | 0
-rw-r--r--  vp8/common/x86/sad_ssse3.asm (renamed from vp8/encoder/x86/sad_ssse3.asm) | 0
-rw-r--r--  vp8/common/x86/variance_impl_mmx.asm (renamed from vp8/encoder/x86/variance_impl_mmx.asm) | 0
-rw-r--r--  vp8/common/x86/variance_impl_sse2.asm (renamed from vp8/encoder/x86/variance_impl_sse2.asm) | 0
-rw-r--r--  vp8/common/x86/variance_impl_ssse3.asm (renamed from vp8/encoder/x86/variance_impl_ssse3.asm) | 0
-rw-r--r--  vp8/common/x86/variance_mmx.c (renamed from vp8/encoder/x86/variance_mmx.c) | 22
-rw-r--r--  vp8/common/x86/variance_sse2.c (renamed from vp8/encoder/x86/variance_sse2.c) | 28
-rw-r--r--  vp8/common/x86/variance_ssse3.c (renamed from vp8/encoder/x86/variance_ssse3.c) | 6
-rw-r--r--  vp8/decoder/decodemv.c | 203
-rw-r--r--  vp8/decoder/decodframe.c | 282
-rw-r--r--  vp8/decoder/onyxd_int.h | 2
-rw-r--r--  vp8/decoder/reconintra_mt.c | 943
-rw-r--r--  vp8/decoder/reconintra_mt.h | 26
-rw-r--r--  vp8/decoder/threading.c | 1065
-rw-r--r--  vp8/encoder/bitstream.c | 553
-rw-r--r--  vp8/encoder/bitstream.h | 5
-rw-r--r--  vp8/encoder/denoising.c | 212
-rw-r--r--  vp8/encoder/denoising.h | 33
-rw-r--r--  vp8/encoder/encodeframe.c | 159
-rw-r--r--  vp8/encoder/encodeintra.c | 17
-rw-r--r--  vp8/encoder/ethreading.c | 70
-rw-r--r--  vp8/encoder/firstpass.c | 26
-rw-r--r--  vp8/encoder/mcomp.h | 2
-rw-r--r--  vp8/encoder/onyx_if.c | 165
-rw-r--r--  vp8/encoder/onyx_int.h | 16
-rw-r--r--  vp8/encoder/pickinter.c | 155
-rw-r--r--  vp8/encoder/ratectrl.c | 23
-rw-r--r--  vp8/encoder/rdopt.c | 547
-rw-r--r--  vp8/encoder/tokenize.c | 11
-rw-r--r--  vp8/encoder/treewriter.c | 4
-rw-r--r--  vp8/encoder/treewriter.h | 4
-rw-r--r--  vp8/vp8_common.mk | 34
-rw-r--r--  vp8/vp8_cx_iface.c | 36
-rw-r--r--  vp8/vp8cx.mk | 16
-rw-r--r--  vp8/vp8cx_arm.mk | 13
-rw-r--r--  vp8/vp8dx.mk | 2
-rw-r--r--  vpx/vpx_decoder.h | 8
-rw-r--r--  vpx_ports/vpx_timer.h | 11
-rw-r--r--  vpx_ports/x86_abi_support.asm | 4
-rw-r--r--  vpxenc.c | 1407
85 files changed, 3512 insertions, 3272 deletions
diff --git a/build/make/Android.mk b/build/make/Android.mk
index 9dbbac9b7..8c21da20c 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -118,6 +118,10 @@ $(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm $(ASM_CNV_OFFSETS_DEPEND)
@mkdir -p $(dir $@)
@$(CONFIG_DIR)/$(ASM_CONVERSION) <$< > $@
+# For building vpx_rtcd.h, which has a rule in libs.mk
+TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
+target := libs
+$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_rtcd.h
LOCAL_SRC_FILES += vpx_config.c
@@ -171,6 +175,7 @@ clean:
@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
@$(RM) $(patsubst %.asm, %.*, $(ASM_CNV_OFFSETS_DEPEND))
@$(RM) -r $(ASM_CNV_PATH)
+ @$(RM) $(CLEAN-OBJS)
include $(BUILD_SHARED_LIBRARY)
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 15134ab6a..571fd84e9 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -391,6 +391,7 @@ LDFLAGS = ${LDFLAGS}
ASFLAGS = ${ASFLAGS}
extralibs = ${extralibs}
AS_SFX = ${AS_SFX:-.asm}
+EXE_SFX = ${EXE_SFX}
RTCD_OPTIONS = ${RTCD_OPTIONS}
EOF
@@ -540,6 +541,7 @@ setup_gnu_toolchain() {
STRIP=${STRIP:-${CROSS}strip}
NM=${NM:-${CROSS}nm}
AS_SFX=.s
+ EXE_SFX=
}
process_common_toolchain() {
@@ -593,6 +595,9 @@ process_common_toolchain() {
*solaris2.10)
tgt_os=solaris
;;
+ *os2*)
+ tgt_os=os2
+ ;;
esac
if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
@@ -791,6 +796,8 @@ process_common_toolchain() {
add_cflags "--sysroot=${alt_libc}"
add_ldflags "--sysroot=${alt_libc}"
+ add_cflags "-I${SDK_PATH}/sources/android/cpufeatures/"
+
enable pic
soft_enable realtime_only
if [ ${tgt_isa} == "armv7" ]; then
@@ -919,6 +926,9 @@ process_common_toolchain() {
LD=${LD:-${CROSS}gcc}
CROSS=${CROSS:-g}
;;
+ os2)
+ AS=${AS:-nasm}
+ ;;
esac
AS="${alt_as:-${AS:-auto}}"
@@ -989,6 +999,11 @@ process_common_toolchain() {
# enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
enabled icc && ! enabled pic && add_cflags -fno-pic
;;
+ os2)
+ add_asflags -f aout
+ enabled debug && add_asflags -g
+ EXE_SFX=.exe
+ ;;
*) log "Warning: Unknown os $tgt_os while setting up $AS flags"
;;
esac
diff --git a/build/make/gen_asm_deps.sh b/build/make/gen_asm_deps.sh
index 7c6c5d565..717f87020 100755
--- a/build/make/gen_asm_deps.sh
+++ b/build/make/gen_asm_deps.sh
@@ -42,7 +42,7 @@ done
[ -n "$srcfile" ] || show_help
sfx=${sfx:-asm}
-includes=$(egrep -i "include +\"?+[a-z0-9_/]+\.${sfx}" $srcfile |
+includes=$(LC_ALL=C egrep -i "include +\"?+[a-z0-9_/]+\.${sfx}" $srcfile |
perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;")
#" restore editor state
for inc in ${includes}; do
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index a5f1e6d73..1dffde58a 100755
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -196,8 +196,8 @@ filter() {
# Helper functions for generating the arch specific RTCD files
#
common_top() {
- local outfile_basename=$(basename ${outfile:-rtcd.h})
- local include_guard=$(echo -n $outfile_basename | tr '[a-z]' '[A-Z]' | tr -c '[A-Z]' _)
+ local outfile_basename=$(basename ${symbol:-rtcd.h})
+ local include_guard=$(echo $outfile_basename | tr '[a-z]' '[A-Z]' | tr -c '[A-Z]' _)
cat <<EOF
#ifndef ${include_guard}
#define ${include_guard}
@@ -225,7 +225,7 @@ x86() {
# Assign the helper variable for each enabled extension
for opt in $ALL_ARCHS; do
- local uc=$(echo -n $opt | tr '[a-z]' '[A-Z]')
+ local uc=$(echo $opt | tr '[a-z]' '[A-Z]')
eval "have_${opt}=\"flags & HAS_${uc}\""
done
@@ -253,7 +253,7 @@ arm() {
# Assign the helper variable for each enabled extension
for opt in $ALL_ARCHS; do
- local uc=$(echo -n $opt | tr '[a-z]' '[A-Z]')
+ local uc=$(echo $opt | tr '[a-z]' '[A-Z]')
eval "have_${opt}=\"flags & HAS_${uc}\""
done
diff --git a/configure b/configure
index 7ecd72efa..2724c3008 100755
--- a/configure
+++ b/configure
@@ -39,6 +39,7 @@ Advanced options:
${toggle_multithread} multithreaded encoding and decoding
${toggle_spatial_resampling} spatial sampling (scaling) support
${toggle_realtime_only} enable this option while building for real-time encoding
+ ${toggle_onthefly_bitpacking} enable on-the-fly bitpacking in real-time encoding
${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses
${toggle_runtime_cpu_detect} runtime cpu detection
${toggle_shared} shared library support
@@ -46,6 +47,7 @@ Advanced options:
${toggle_small} favor smaller size over speed
${toggle_postproc_visualizer} macro block / block level visualizers
${toggle_multi_res_encoding} enable multiple-resolution encoding
+ ${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser
Codecs:
Codecs can be selectively enabled or disabled individually, or by family:
@@ -109,6 +111,7 @@ all_platforms="${all_platforms} x86-darwin9-icc"
all_platforms="${all_platforms} x86-darwin10-gcc"
all_platforms="${all_platforms} x86-linux-gcc"
all_platforms="${all_platforms} x86-linux-icc"
+all_platforms="${all_platforms} x86-os2-gcc"
all_platforms="${all_platforms} x86-solaris-gcc"
all_platforms="${all_platforms} x86-win32-gcc"
all_platforms="${all_platforms} x86-win32-vs7"
@@ -163,6 +166,7 @@ enable md5
enable spatial_resampling
enable multithread
enable os_support
+enable temporal_denoising
[ -d ${source_path}/../include ] && enable alt_tree_layout
for d in vp8; do
@@ -252,6 +256,7 @@ CONFIG_LIST="
static_msvcrt
spatial_resampling
realtime_only
+ onthefly_bitpacking
error_concealment
shared
static
@@ -260,6 +265,7 @@ CONFIG_LIST="
os_support
unit_tests
multi_res_encoding
+ temporal_denoising
"
CMDLINE_SELECT="
extra_warnings
@@ -296,6 +302,7 @@ CMDLINE_SELECT="
mem_tracker
spatial_resampling
realtime_only
+ onthefly_bitpacking
error_concealment
shared
static
@@ -303,6 +310,7 @@ CMDLINE_SELECT="
postproc_visualizer
unit_tests
multi_res_encoding
+ temporal_denoising
"
process_cmdline() {
diff --git a/examples.mk b/examples.mk
index f6c904588..518608d72 100644
--- a/examples.mk
+++ b/examples.mk
@@ -168,12 +168,12 @@ $(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_OBJS,BUILD_OBJS):=yes)
# Create build/install dependencies for all examples. The common case
# is handled here. The MSVS case is handled below.
NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
-DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(ALL_EXAMPLES:.c=))
-INSTALL-BINS-$(NOT_MSVS) += $(addprefix bin/,$(UTILS:.c=))
+DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(ALL_EXAMPLES:.c=$(EXE_SFX)))
+INSTALL-BINS-$(NOT_MSVS) += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX)))
DIST-SRCS-yes += $(ALL_SRCS)
INSTALL-SRCS-yes += $(UTIL_SRCS)
OBJS-$(NOT_MSVS) += $(if $(BUILD_OBJS),$(call objs,$(ALL_SRCS)))
-BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=))
+BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=$(EXE_SFX)))
# Instantiate linker template for all examples.
@@ -183,7 +183,7 @@ $(foreach bin,$(BINS-yes),\
$(if $(BUILD_OBJS),$(eval $(bin):\
$(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF)))\
$(if $(BUILD_OBJS),$(eval $(call linker_template,$(bin),\
- $(call objs,$($(notdir $(bin)).SRCS)) \
+ $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
-l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
)))\
$(if $(LIPO_OBJS),$(eval $(call lipo_bin_template,$(bin))))\
diff --git a/libs.mk b/libs.mk
index e083a9983..b77276e1b 100644
--- a/libs.mk
+++ b/libs.mk
@@ -17,6 +17,7 @@ else
ASM:=.asm
endif
+CODEC_SRCS-yes += CHANGELOG
CODEC_SRCS-yes += libs.mk
include $(SRC_PATH_BARE)/vpx/vpx_codec.mk
@@ -280,19 +281,21 @@ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
# Calculate platform- and compiler-specific offsets for hand coded assembly
#
+OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU'
+
ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
$(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
- grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+ LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
$(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
- grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+ LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
$(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
- grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+ LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
else
@@ -327,7 +330,7 @@ CLEAN-OBJS += $(BUILD_PFX)vpx_version.h
# Rule to generate runtime cpu detection files
#
$(OBJS-yes:.o=.d): vpx_rtcd.h
-vpx_rtcd.h: $(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS)))
+vpx_rtcd.h: $(SRC_PATH_BARE)/$(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS)))
@echo " [CREATE] $@"
$(qexec)$(SRC_PATH_BARE)/build/make/rtcd.sh --arch=$(TGT_ISA) \
--sym=vpx_rtcd \
diff --git a/mainpage.dox b/mainpage.dox
index 5613ae771..e2ec28002 100644
--- a/mainpage.dox
+++ b/mainpage.dox
@@ -12,8 +12,12 @@
This distribution of the WebM VP8 Codec SDK includes the following support:
- \if vp8_encoder - \ref vp8_encoder \endif
- \if vp8_decoder - \ref vp8_decoder \endif
+ \if vp8_encoder
+ - \ref vp8_encoder
+ \endif
+ \if vp8_decoder
+ - \ref vp8_decoder
+ \endif
\section main_startpoints Starting Points
@@ -24,8 +28,12 @@
- Read the \ref samples "sample code" for examples of how to interact with the
codec.
- \ref codec reference
- \if encoder - \ref encoder reference \endif
- \if decoder - \ref decoder reference \endif
+ \if encoder
+ - \ref encoder reference
+ \endif
+ \if decoder
+ - \ref decoder reference
+ \endif
\section main_support Support Options & FAQ
The WebM project is an open source project supported by its community. For
diff --git a/tools_common.c b/tools_common.c
index d188bbe20..6f9502869 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -9,15 +9,21 @@
*/
#include <stdio.h>
#include "tools_common.h"
-#ifdef _WIN32
+#if defined(_WIN32) || defined(__OS2__)
#include <io.h>
#include <fcntl.h>
+
+#ifdef __OS2__
+#define _setmode setmode
+#define _fileno fileno
+#define _O_BINARY O_BINARY
+#endif
#endif
FILE* set_binary_mode(FILE *stream)
{
(void)stream;
-#ifdef _WIN32
+#if defined(_WIN32) || defined(__OS2__)
_setmode(_fileno(stream), _O_BINARY);
#endif
return stream;
diff --git a/usage.dox b/usage.dox
index 9370e428f..92fd6b26e 100644
--- a/usage.dox
+++ b/usage.dox
@@ -1,6 +1,6 @@
/*!\page usage Usage
- The vpx Multi-Format codec SDK provides a unified interface amongst its
+ The vpx multi-format codec SDK provides a unified interface amongst its
supported codecs. This abstraction allows applications using this SDK to
easily support multiple video formats with minimal code duplication or
"special casing." This section describes the interface common to all codecs.
@@ -14,8 +14,12 @@
Fore more information on decoder and encoder specific usage, see the
following pages:
- \if decoder - \subpage usage_decode \endif
- \if decoder - \subpage usage_encode \endif
+ \if decoder
+ - \subpage usage_decode
+ \endif
+ \if decoder
+ - \subpage usage_encode
+ \endif
\section usage_types Important Data Types
There are two important data structures to consider in this interface.
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
index b606aaca0..b32d8a939 100644
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -37,7 +37,6 @@ static void update_mode_info_border(MODE_INFO *mi, int rows, int cols)
void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
{
int i;
-
for (i = 0; i < NUM_YV12_BUFFERS; i++)
vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
diff --git a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm b/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
index 1b4f5cf3b..1b4f5cf3b 100644
--- a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
index 5feaa8bc2..dc84c30da 100644
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
@@ -144,7 +144,7 @@ loop
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
diff --git a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
index adc353d20..adc353d20 100644
--- a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
index 1b5489795..dd2ce685c 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -169,7 +169,7 @@ loop
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
index 38c55edf8..f972d9b5b 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -210,7 +210,7 @@ loop
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
index 22a50eb00..f5da9c09e 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -171,7 +171,7 @@ loop
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
diff --git a/vp8/encoder/arm/neon/sad16_neon.asm b/vp8/common/arm/neon/sad16_neon.asm
index d7c590e15..d7c590e15 100644
--- a/vp8/encoder/arm/neon/sad16_neon.asm
+++ b/vp8/common/arm/neon/sad16_neon.asm
diff --git a/vp8/encoder/arm/neon/sad8_neon.asm b/vp8/common/arm/neon/sad8_neon.asm
index 23ba6df93..23ba6df93 100644
--- a/vp8/encoder/arm/neon/sad8_neon.asm
+++ b/vp8/common/arm/neon/sad8_neon.asm
diff --git a/vp8/encoder/arm/neon/variance_neon.asm b/vp8/common/arm/neon/variance_neon.asm
index e1a46869a..e3b48327d 100644
--- a/vp8/encoder/arm/neon/variance_neon.asm
+++ b/vp8/common/arm/neon/variance_neon.asm
@@ -77,14 +77,14 @@ variance16x16_neon_loop
;vmov.32 r1, d1[0]
;mul r0, r0, r0
;str r1, [r12]
- ;sub r0, r1, r0, asr #8
+ ;sub r0, r1, r0, lsr #8
- ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should
- ;have sign-bit exension, which is vshr.s. Have to use s32 to make it right.
+ ; while sum is signed, sum * sum is always positive and must be treated as
+ ; unsigned to avoid propagating the sign bit.
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
@@ -145,8 +145,8 @@ variance16x8_neon_loop
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #7
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #7
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
@@ -200,8 +200,8 @@ variance8x16_neon_loop
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #7
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #7
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
@@ -265,8 +265,8 @@ variance8x8_neon_loop
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #6
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #6
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
index 5107d8b99..d753ad129 100644
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -405,8 +405,8 @@ sub_pixel_variance16x16_neon_loop
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r6] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
add sp, sp, #528
vmov.32 r0, d0[0] ;return
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
index 0a2b71c49..155be4fc5 100644
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -112,8 +112,8 @@ vp8_filt_fpo16x16s_4_0_loop_neon
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
@@ -208,8 +208,8 @@ vp8_filt_spo16x16s_0_4_loop_neon
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
@@ -327,8 +327,8 @@ vp8_filt16x16s_4_4_loop_neon
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
@@ -560,8 +560,8 @@ sub_pixel_variance16x16s_neon_loop
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
add sp, sp, #256
vmov.32 r0, d0[0] ;return
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
index 38b58780a..cc7ae52c9 100644
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -206,8 +206,8 @@ sub_pixel_variance8x8_neon_loop
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #6
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #6
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {r4-r5, pc}
diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/common/arm/variance_arm.c
index 052a2578a..41d5eb352 100644
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/common/arm/variance_arm.c
@@ -10,7 +10,7 @@
#include "vpx_config.h"
#include "vpx_rtcd.h"
-#include "vp8/encoder/variance.h"
+#include "vp8/common/variance.h"
#include "vp8/common/filter.h"
#if HAVE_MEDIA
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index ae32538fc..692f0ebd2 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -215,11 +215,21 @@ typedef struct macroblockd
MODE_INFO *mode_info_context;
int mode_info_stride;
+#if CONFIG_TEMPORAL_DENOISING
+ MB_PREDICTION_MODE best_sse_inter_mode;
+ int_mv best_sse_mv;
+ unsigned char need_to_clamp_best_mvs;
+#endif
+
FRAME_TYPE frame_type;
int up_available;
int left_available;
+ unsigned char *recon_above[3];
+ unsigned char *recon_left[3];
+ int recon_left_stride[2];
+
/* Y,U,V,Y2 */
ENTROPY_CONTEXT_PLANES *above_context;
ENTROPY_CONTEXT_PLANES *left_context;
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index c009cbdff..39660abaa 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -19,11 +19,15 @@
#include "vp8/common/onyxc_int.h"
#if CONFIG_MULTITHREAD
-#if HAVE_UNISTD_H
+#if HAVE_UNISTD_H && !defined(__OS2__)
#include <unistd.h>
#elif defined(_WIN32)
#include <windows.h>
typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#elif defined(__OS2__)
+#define INCL_DOS
+#define INCL_DOSSPINLOCK
+#include <os2.h>
#endif
#endif
@@ -32,7 +36,7 @@ static int get_cpu_count()
{
int core_count = 16;
-#if HAVE_UNISTD_H
+#if HAVE_UNISTD_H && !defined(__OS2__)
#if defined(_SC_NPROCESSORS_ONLN)
core_count = sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(_SC_NPROC_ONLN)
@@ -55,6 +59,21 @@ static int get_cpu_count()
core_count = sysinfo.dwNumberOfProcessors;
}
+#elif defined(__OS2__)
+ {
+ ULONG proc_id;
+ ULONG status;
+
+ core_count = 0;
+ for (proc_id = 1; ; proc_id++)
+ {
+ if (DosGetProcessorStatus(proc_id, &status))
+ break;
+
+ if (status == PROC_ONLINE)
+ core_count++;
+ }
+ }
#else
/* other platforms */
#endif
diff --git a/vp8/common/idctllm_test.cc b/vp8/common/idctllm_test.cc
new file mode 100755
index 000000000..0f6ebe7fe
--- /dev/null
+++ b/vp8/common/idctllm_test.cc
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+ extern "C" {
+ void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride);
+}
+
+#include "vpx_config.h"
+#include "idctllm_test.h"
+namespace
+{
+
+INSTANTIATE_TEST_CASE_P(C, IDCTTest,
+ ::testing::Values(vp8_short_idct4x4llm_c));
+
+} // namespace
+
+int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/vp8/common/idctllm_test.h b/vp8/common/idctllm_test.h
new file mode 100755
index 000000000..a6a694b18
--- /dev/null
+++ b/vp8/common/idctllm_test.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+ #include "third_party/googletest/src/include/gtest/gtest.h"
+typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride);
+namespace {
+class IDCTTest : public ::testing::TestWithParam<idct_fn_t>
+{
+ protected:
+ virtual void SetUp()
+ {
+ int i;
+
+ UUT = GetParam();
+ memset(input, 0, sizeof(input));
+ /* Set up guard blocks */
+ for(i=0; i<256; i++)
+ output[i] = ((i&0xF)<4&&(i<64))?0:-1;
+ }
+
+ idct_fn_t UUT;
+ short input[16];
+ unsigned char output[256];
+ unsigned char predict[256];
+};
+
+TEST_P(IDCTTest, TestGuardBlocks)
+{
+ int i;
+
+ for(i=0; i<256; i++)
+ if((i&0xF) < 4 && i<64)
+ EXPECT_EQ(0, output[i]) << i;
+ else
+ EXPECT_EQ(255, output[i]);
+}
+
+TEST_P(IDCTTest, TestAllZeros)
+{
+ int i;
+
+ UUT(input, output, 16, output, 16);
+
+ for(i=0; i<256; i++)
+ if((i&0xF) < 4 && i<64)
+ EXPECT_EQ(0, output[i]) << "i==" << i;
+ else
+ EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestAllOnes)
+{
+ int i;
+
+ input[0] = 4;
+ UUT(input, output, 16, output, 16);
+
+ for(i=0; i<256; i++)
+ if((i&0xF) < 4 && i<64)
+ EXPECT_EQ(1, output[i]) << "i==" << i;
+ else
+ EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestAddOne)
+{
+ int i;
+
+ for(i=0; i<256; i++)
+ predict[i] = i;
+
+ input[0] = 4;
+ UUT(input, predict, 16, output, 16);
+
+ for(i=0; i<256; i++)
+ if((i&0xF) < 4 && i<64)
+ EXPECT_EQ(i+1, output[i]) << "i==" << i;
+ else
+ EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestWithData)
+{
+ int i;
+
+ for(i=0; i<16; i++)
+ input[i] = i;
+
+ UUT(input, output, 16, output, 16);
+
+ for(i=0; i<256; i++)
+ if((i&0xF) > 3 || i>63)
+ EXPECT_EQ(255, output[i]) << "i==" << i;
+ else if(i == 0)
+ EXPECT_EQ(11, output[i]) << "i==" << i;
+ else if(i == 34)
+ EXPECT_EQ(1, output[i]) << "i==" << i;
+ else if(i == 2 || i == 17 || i == 32)
+ EXPECT_EQ(3, output[i]) << "i==" << i;
+ else
+ EXPECT_EQ(0, output[i]) << "i==" << i;
+}
+}
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 87e999772..d41445d03 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -17,8 +17,9 @@
#include "vpx_scale/yv12extend.h"
#include "vpx_scale/vpxscale.h"
#include "systemdependent.h"
-#include "../encoder/variance.h"
+#include "variance.h"
+#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
@@ -361,6 +362,7 @@ void vp8_deblock(YV12_BUFFER_CONFIG *source,
vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
}
+#if !(CONFIG_TEMPORAL_DENOISING)
void vp8_de_noise(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
@@ -397,6 +399,7 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source,
source->uv_width - 4, ppl);
}
+#endif
double vp8_gaussian(double sigma, double mu, double x)
{
@@ -729,17 +732,17 @@ static void multiframe_quality_enhance_block
if (blksize == 16)
{
act = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
- sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, 0)+128)>>8;
+ sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, INT_MAX)+128)>>8;
}
else if (blksize == 8)
{
act = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
- sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, 0)+32)>>6;
+ sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, INT_MAX)+32)>>6;
}
else
{
act = (vp8_variance4x4(yd, yd_stride, VP8_ZEROS, 0, &sse)+8)>>4;
- sad = (vp8_sad4x4(y, y_stride, yd, yd_stride, 0)+8)>>4;
+ sad = (vp8_sad4x4(y, y_stride, yd, yd_stride, INT_MAX)+8)>>4;
}
/* thr = qdiff/8 + log2(act) + log4(qprev) */
thr = (qdiff>>3);
diff --git a/vp8/encoder/ppc/sad_altivec.asm b/vp8/common/ppc/sad_altivec.asm
index e5f26380f..e5f26380f 100644
--- a/vp8/encoder/ppc/sad_altivec.asm
+++ b/vp8/common/ppc/sad_altivec.asm
diff --git a/vp8/encoder/ppc/variance_altivec.asm b/vp8/common/ppc/variance_altivec.asm
index a1ebf663a..fb8d5bb1d 100644
--- a/vp8/encoder/ppc/variance_altivec.asm
+++ b/vp8/common/ppc/variance_altivec.asm
@@ -98,7 +98,7 @@
stw r4, 0(r7) ;# sse
mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, \DS ;# (sum*sum) >> DS
+ srlwi r3, r3, \DS ;# (sum*sum) >> DS
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
.endm
@@ -142,7 +142,7 @@
stw r4, 0(r7) ;# sse
mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, \DS ;# (sum*sum) >> 8
+ srlwi r3, r3, \DS ;# (sum*sum) >> 8
subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
.endm
@@ -367,7 +367,7 @@ vp8_variance4x4_ppc:
stw r4, 0(r7) ;# sse
mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, 4 ;# (sum*sum) >> 4
+ srlwi r3, r3, 4 ;# (sum*sum) >> 4
subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
epilogue
diff --git a/vp8/encoder/ppc/variance_subpixel_altivec.asm b/vp8/common/ppc/variance_subpixel_altivec.asm
index 301360b1d..2308373a1 100644
--- a/vp8/encoder/ppc/variance_subpixel_altivec.asm
+++ b/vp8/common/ppc/variance_subpixel_altivec.asm
@@ -157,7 +157,7 @@
stw r4, 0(r9) ;# sse
mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, \DS ;# (sum*sum) >> 8
+ srlwi r3, r3, \DS ;# (sum*sum) >> 8
subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
.endm
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c
index 62af368d1..4b13777c8 100644
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -136,21 +136,21 @@ void vp8_build_intra_predictors_mby_c(MACROBLOCKD *x)
}
}
-void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr)
{
-
- unsigned char *yabove_row = x->dst.y_buffer - x->dst.y_stride;
unsigned char yleft_col[16];
unsigned char ytop_left = yabove_row[-1];
- unsigned char *ypred_ptr = x->predictor;
int r, c, i;
int y_stride = x->dst.y_stride;
- ypred_ptr = x->dst.y_buffer; /*x->predictor;*/
for (i = 0; i < 16; i++)
{
- yleft_col[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
+ yleft_col[i] = yleft[i* left_stride];
}
/* for Y */
@@ -400,24 +400,27 @@ void vp8_build_intra_predictors_mbuv_c(MACROBLOCKD *x)
}
}
-void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr)
{
- unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
- unsigned char uleft_col[16];
+ unsigned char uleft_col[8];
unsigned char utop_left = uabove_row[-1];
- unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
- unsigned char vleft_col[20];
+ unsigned char vleft_col[8];
unsigned char vtop_left = vabove_row[-1];
- unsigned char *upred_ptr = x->dst.u_buffer; /*&x->predictor[256];*/
- unsigned char *vpred_ptr = x->dst.v_buffer; /*&x->predictor[320];*/
int uv_stride = x->dst.uv_stride;
int i, j;
for (i = 0; i < 8; i++)
{
- uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
- vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+ uleft_col[i] = uleft [i* left_stride];
+ vleft_col[i] = vleft [i* left_stride];
}
switch (x->mode_info_context->mbmi.uv_mode)
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index f4424ffdd..dcc35ec15 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -13,20 +13,19 @@
#include "vpx_rtcd.h"
#include "blockd.h"
-void vp8_intra4x4_predict_c(unsigned char *src, int src_stride,
- int b_mode,
- unsigned char *dst, int dst_stride)
+void vp8_intra4x4_predict_d_c(unsigned char *Above,
+ unsigned char *yleft, int left_stride,
+ int b_mode,
+ unsigned char *dst, int dst_stride,
+ unsigned char top_left)
{
int i, r, c;
- unsigned char *Above = src - src_stride;
unsigned char Left[4];
- unsigned char top_left = Above[-1];
-
- Left[0] = src[-1];
- Left[1] = src[-1 + src_stride];
- Left[2] = src[-1 + 2 * src_stride];
- Left[3] = src[-1 + 3 * src_stride];
+ Left[0] = yleft[0];
+ Left[1] = yleft[left_stride];
+ Left[2] = yleft[2 * left_stride];
+ Left[3] = yleft[3 * left_stride];
switch (b_mode)
{
@@ -295,24 +294,15 @@ void vp8_intra4x4_predict_c(unsigned char *src, int src_stride,
}
}
-
-
-
-
-/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
- * to the right prediction have filled in pixels to use.
- */
-void vp8_intra_prediction_down_copy(MACROBLOCKD *x)
+void vp8_intra4x4_predict_c(unsigned char *src, int src_stride,
+ int b_mode,
+ unsigned char *dst, int dst_stride)
{
- int dst_stride = x->dst.y_stride;
- unsigned char *above_right = x->dst.y_buffer - dst_stride + 16;
-
- unsigned int *src_ptr = (unsigned int *)above_right;
- unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * dst_stride);
- unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * dst_stride);
- unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * dst_stride);
+ unsigned char *Above = src - src_stride;
- *dst_ptr0 = *src_ptr;
- *dst_ptr1 = *src_ptr;
- *dst_ptr2 = *src_ptr;
+ vp8_intra4x4_predict_d_c(Above,
+ src - 1, src_stride,
+ b_mode,
+ dst, dst_stride,
+ Above[-1]);
}
diff --git a/vp8/common/reconintra4x4.h b/vp8/common/reconintra4x4.h
index b528df6d5..d2b0d4346 100644
--- a/vp8/common/reconintra4x4.h
+++ b/vp8/common/reconintra4x4.h
@@ -11,9 +11,22 @@
#ifndef __INC_RECONINTRA4x4_H
#define __INC_RECONINTRA4x4_H
+#include "vp8/common/blockd.h"
-struct macroblockd;
+static void intra_prediction_down_copy(MACROBLOCKD *xd,
+ unsigned char *above_right_src)
+{
+ int dst_stride = xd->dst.y_stride;
+ unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
-extern void vp8_intra_prediction_down_copy(struct macroblockd *x);
+ unsigned int *src_ptr = (unsigned int *)above_right_src;
+ unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride);
+ unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride);
+ unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride);
+
+ *dst_ptr0 = *src_ptr;
+ *dst_ptr1 = *src_ptr;
+ *dst_ptr2 = *src_ptr;
+}
#endif
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index 32d83d7ae..ff8e30c3f 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -125,14 +125,14 @@ vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6
prototype void vp8_build_intra_predictors_mby "struct macroblockd *x"
specialize vp8_build_intra_predictors_mby sse2 ssse3 neon
-prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x"
-specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon
+prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr"
+#TODO: fix assembly --- specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon
prototype void vp8_build_intra_predictors_mbuv "struct macroblockd *x"
specialize vp8_build_intra_predictors_mbuv sse2 ssse3
-prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x"
-specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
+prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr"
+#TODO: fix assembly --- specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
prototype void vp8_intra4x4_predict "unsigned char *src, int src_stride, int b_mode, unsigned char *dst, int dst_stride"
specialize vp8_intra4x4_predict media
diff --git a/vp8/encoder/sad_c.c b/vp8/common/sad_c.c
index 3b6e26c4e..f745bbd3d 100644
--- a/vp8/encoder/sad_c.c
+++ b/vp8/common/sad_c.c
@@ -13,24 +13,30 @@
#include "vpx_config.h"
#include "vpx/vpx_integer.h"
-unsigned int vp8_sad16x16_c(
+static __inline
+unsigned int sad_mx_n_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
- int max_sad)
+ int max_sad,
+ int m,
+ int n)
{
int r, c;
unsigned int sad = 0;
- for (r = 0; r < 16; r++)
+ for (r = 0; r < n; r++)
{
- for (c = 0; c < 16; c++)
+ for (c = 0; c < m; c++)
{
sad += abs(src_ptr[c] - ref_ptr[c]);
}
+ if (sad > max_sad)
+ break;
+
src_ptr += src_stride;
ref_ptr += ref_stride;
}
@@ -38,32 +44,19 @@ unsigned int vp8_sad16x16_c(
return sad;
}
+/* max_sad is provided as an optional optimization point. Alternative
+ * implementations of these functions are not required to check it.
+ */
-static __inline
-unsigned int sad_mx_n_c(
+unsigned int vp8_sad16x16_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
- int m,
- int n)
+ int max_sad)
{
- int r, c;
- unsigned int sad = 0;
-
- for (r = 0; r < n; r++)
- {
- for (c = 0; c < m; c++)
- {
- sad += abs(src_ptr[c] - ref_ptr[c]);
- }
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- return sad;
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 16);
}
@@ -72,10 +65,10 @@ unsigned int vp8_sad8x8_c(
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
- int max_sad)
+ int max_sad)
{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 8);
}
@@ -84,10 +77,10 @@ unsigned int vp8_sad16x8_c(
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
- int max_sad)
+ int max_sad)
{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 8);
}
@@ -97,10 +90,10 @@ unsigned int vp8_sad8x16_c(
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
- int max_sad)
+ int max_sad)
{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 16);
}
@@ -109,10 +102,10 @@ unsigned int vp8_sad4x4_c(
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
- int max_sad)
+ int max_sad)
{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 4, 4);
}
void vp8_sad16x16x3_c(
diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index da6347d3c..ed9e3e60d 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -33,6 +33,29 @@
#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
#define pthread_self() GetCurrentThreadId()
+
+#elif defined(__OS2__)
+/* OS/2 */
+#define INCL_DOS
+#include <os2.h>
+
+#include <stdlib.h>
+#define THREAD_FUNCTION void
+#define THREAD_FUNCTION_RETURN void
+#define THREAD_SPECIFIC_INDEX PULONG
+#define pthread_t TID
+#define pthread_attr_t ULONG
+#define pthread_create(thhandle,attr,thfunc,tharg) \
+ ((int)((*(thhandle)=_beginthread(thfunc,NULL,1024*1024,tharg))==-1))
+#define pthread_join(thread, result) ((int)DosWaitThread(&(thread),0))
+#define pthread_detach(thread) 0
+#define thread_sleep(nms) DosSleep(nms)
+#define pthread_cancel(thread) DosKillThread(thread)
+#define ts_key_create(ts_key, destructor) \
+ DosAllocThreadLocalMemory(1, &(ts_key));
+#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
+#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value))
+#define pthread_self() _gettid()
#else
#ifdef __APPLE__
#include <mach/mach_init.h>
@@ -64,6 +87,76 @@
#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
#define thread_sleep(nms) Sleep(nms)
+#elif defined(__OS2__)
+typedef struct
+{
+ HEV event;
+ HMTX wait_mutex;
+ HMTX count_mutex;
+ int count;
+} sem_t;
+
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value)
+{
+ DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
+ value > 0 ? TRUE : FALSE);
+ DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
+ DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
+
+ sem->count = value;
+
+ return 0;
+}
+
+static inline int sem_wait(sem_t * sem)
+{
+ DosRequestMutexSem(sem->wait_mutex, -1);
+
+ DosWaitEventSem(sem->event, -1);
+
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ sem->count--;
+ if (sem->count == 0)
+ {
+ ULONG post_count;
+
+ DosResetEventSem(sem->event, &post_count);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ DosReleaseMutexSem(sem->wait_mutex);
+
+ return 0;
+}
+
+static inline int sem_post(sem_t * sem)
+{
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ if (sem->count < 32768)
+ {
+ sem->count++;
+ DosPostEventSem(sem->event);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+static inline int sem_destroy(sem_t * sem)
+{
+ DosCloseEventSem(sem->event);
+ DosCloseMutexSem(sem->wait_mutex);
+ DosCloseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+#define thread_sleep(nms) DosSleep(nms)
+
#else
#ifdef __APPLE__
diff --git a/vp8/encoder/variance.h b/vp8/common/variance.h
index b77aa28f4..b77aa28f4 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/common/variance.h
diff --git a/vp8/encoder/variance_c.c b/vp8/common/variance_c.c
index c7b9c2209..996404dd6 100644
--- a/vp8/encoder/variance_c.c
+++ b/vp8/common/variance_c.c
@@ -10,7 +10,7 @@
#include "variance.h"
-#include "vp8/common/filter.h"
+#include "filter.h"
unsigned int vp8_get_mb_ss_c
@@ -75,7 +75,7 @@ unsigned int vp8_variance16x16_c(
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 8));
+ return (var - ((unsigned int)(avg * avg) >> 8));
}
unsigned int vp8_variance8x16_c(
@@ -91,7 +91,7 @@ unsigned int vp8_variance8x16_c(
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
unsigned int vp8_variance16x8_c(
@@ -107,7 +107,7 @@ unsigned int vp8_variance16x8_c(
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
@@ -124,7 +124,7 @@ unsigned int vp8_variance8x8_c(
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 6));
+ return (var - ((unsigned int)(avg * avg) >> 6));
}
unsigned int vp8_variance4x4_c(
@@ -140,7 +140,7 @@ unsigned int vp8_variance4x4_c(
variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 4));
+ return (var - ((unsigned int)(avg * avg) >> 4));
}
diff --git a/vp8/common/x86/idctllm_mmx_test.cc b/vp8/common/x86/idctllm_mmx_test.cc
new file mode 100755
index 000000000..8c115335e
--- /dev/null
+++ b/vp8/common/x86/idctllm_mmx_test.cc
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+ extern "C" {
+ void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride);
+}
+
+#include "vp8/common/idctllm_test.h"
+
+namespace
+{
+
+INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
+ ::testing::Values(vp8_short_idct4x4llm_mmx));
+
+} // namespace
+
+int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/vp8/common/x86/postproc_x86.c b/vp8/common/x86/postproc_x86.c
new file mode 100644
index 000000000..a25921bee
--- /dev/null
+++ b/vp8/common/x86/postproc_x86.c
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* On Android NDK, rand is inlined function, but postproc needs rand symbol */
+#if defined(__ANDROID__)
+#define rand __rand
+#include <stdlib.h>
+#undef rand
+
+extern int rand(void)
+{
+ return __rand();
+}
+#endif
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/common/x86/sad_mmx.asm
index 407b39979..407b39979 100644
--- a/vp8/encoder/x86/sad_mmx.asm
+++ b/vp8/common/x86/sad_mmx.asm
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/common/x86/sad_sse2.asm
index fa8e3e3f8..0b01d7b61 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/common/x86/sad_sse2.asm
@@ -89,7 +89,7 @@ sym(vp8_sad16x16_wmt):
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
-; int max_err)
+; int max_sad)
global sym(vp8_sad8x16_wmt)
sym(vp8_sad8x16_wmt):
push rbp
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/common/x86/sad_sse3.asm
index a2550974c..c2af3c86f 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/common/x86/sad_sse3.asm
@@ -19,7 +19,7 @@
%define end_ptr rcx
%define ret_var rbx
%define result_ptr arg(4)
- %define max_err arg(4)
+ %define max_sad arg(4)
%define height dword ptr arg(4)
push rbp
mov rbp, rsp
@@ -42,7 +42,7 @@
%define end_ptr r10
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define max_err [rsp+xmm_stack_space+8+4*8]
+ %define max_sad [rsp+xmm_stack_space+8+4*8]
%define height dword ptr [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
@@ -52,7 +52,7 @@
%define end_ptr r9
%define ret_var r10
%define result_ptr r8
- %define max_err r8
+ %define max_sad r8
%define height r8
%endif
%endif
@@ -67,7 +67,7 @@
%define end_ptr
%define ret_var
%define result_ptr
- %define max_err
+ %define max_sad
%define height
%if ABI_IS_32BIT
@@ -587,7 +587,7 @@ sym(vp8_sad4x4x3_sse3):
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
-; int max_err)
+; int max_sad)
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3)
sym(vp8_sad16x16_sse3):
diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/common/x86/sad_sse4.asm
index 03ecec4b3..03ecec4b3 100644
--- a/vp8/encoder/x86/sad_sse4.asm
+++ b/vp8/common/x86/sad_sse4.asm
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/common/x86/sad_ssse3.asm
index 95b6c89e6..95b6c89e6 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/common/x86/sad_ssse3.asm
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/common/x86/variance_impl_mmx.asm
index 2be8bbeb3..2be8bbeb3 100644
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/common/x86/variance_impl_mmx.asm
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/common/x86/variance_impl_sse2.asm
index 762922091..762922091 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/common/x86/variance_impl_sse2.asm
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/common/x86/variance_impl_ssse3.asm
index 97e8b0e2e..97e8b0e2e 100644
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/common/x86/variance_impl_ssse3.asm
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/common/x86/variance_mmx.c
index e2524b46a..0c4dd4a98 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/common/x86/variance_mmx.c
@@ -9,7 +9,7 @@
*/
#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
+#include "vp8/common/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
#include "vp8/common/x86/filter_x86.h"
@@ -91,7 +91,7 @@ unsigned int vp8_variance4x4_mmx(
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
- return (var - ((avg * avg) >> 4));
+ return (var - ((unsigned int)(avg * avg) >> 4));
}
@@ -108,7 +108,7 @@ unsigned int vp8_variance8x8_mmx(
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
- return (var - ((avg * avg) >> 6));
+ return (var - ((unsigned int)(avg * avg) >> 6));
}
@@ -153,7 +153,7 @@ unsigned int vp8_variance16x16_mmx(
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
*sse = var;
- return (var - ((avg * avg) >> 8));
+ return (var - ((unsigned int)(avg * avg) >> 8));
}
unsigned int vp8_variance16x8_mmx(
@@ -172,7 +172,7 @@ unsigned int vp8_variance16x8_mmx(
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
@@ -194,7 +194,7 @@ unsigned int vp8_variance8x16_mmx(
avg = sum0 + sum1;
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
@@ -219,7 +219,7 @@ unsigned int vp8_sub_pixel_variance4x4_mmx
&xsum, &xxsum
);
*sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 4));
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 4));
}
@@ -244,7 +244,7 @@ unsigned int vp8_sub_pixel_variance8x8_mmx
&xsum, &xxsum
);
*sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 6));
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 6));
}
unsigned int vp8_sub_pixel_variance16x16_mmx
@@ -282,7 +282,7 @@ unsigned int vp8_sub_pixel_variance16x16_mmx
xxsum0 += xxsum1;
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
}
@@ -335,7 +335,7 @@ unsigned int vp8_sub_pixel_variance16x8_mmx
xxsum0 += xxsum1;
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 7));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
}
unsigned int vp8_sub_pixel_variance8x16_mmx
@@ -358,7 +358,7 @@ unsigned int vp8_sub_pixel_variance8x16_mmx
&xsum, &xxsum
);
*sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 7));
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 7));
}
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/common/x86/variance_sse2.c
index 39213b03d..2769a302b 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/common/x86/variance_sse2.c
@@ -9,7 +9,7 @@
*/
#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
+#include "vp8/common/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
#include "vp8/common/x86/filter_x86.h"
@@ -148,7 +148,7 @@ unsigned int vp8_variance4x4_wmt(
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
- return (var - ((avg * avg) >> 4));
+ return (var - ((unsigned int)(avg * avg) >> 4));
}
@@ -165,7 +165,7 @@ unsigned int vp8_variance8x8_wmt
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
- return (var - ((avg * avg) >> 6));
+ return (var - ((unsigned int)(avg * avg) >> 6));
}
@@ -184,7 +184,7 @@ unsigned int vp8_variance16x16_wmt
vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
*sse = sse0;
- return (sse0 - ((sum0 * sum0) >> 8));
+ return (sse0 - ((unsigned int)(sum0 * sum0) >> 8));
}
unsigned int vp8_mse16x16_wmt(
const unsigned char *src_ptr,
@@ -220,7 +220,7 @@ unsigned int vp8_variance16x8_wmt
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
@@ -241,7 +241,7 @@ unsigned int vp8_variance8x16_wmt
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
@@ -265,7 +265,7 @@ unsigned int vp8_sub_pixel_variance4x4_wmt
&xsum, &xxsum
);
*sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 4));
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 4));
}
@@ -314,7 +314,7 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
}
*sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 6));
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 6));
}
unsigned int vp8_sub_pixel_variance16x16_wmt
@@ -375,7 +375,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
}
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_mse16x16_wmt(
@@ -446,7 +446,7 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
}
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 7));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
}
unsigned int vp8_sub_pixel_variance8x16_wmt
@@ -494,7 +494,7 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
}
*sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 7));
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 7));
}
@@ -514,7 +514,7 @@ unsigned int vp8_variance_halfpixvar16x16_h_wmt(
&xsum0, &xxsum0);
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
}
@@ -533,7 +533,7 @@ unsigned int vp8_variance_halfpixvar16x16_v_wmt(
&xsum0, &xxsum0);
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
}
@@ -553,5 +553,5 @@ unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
&xsum0, &xxsum0);
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
}
diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c
index 73f2e01a2..1be0d929d 100644
--- a/vp8/encoder/x86/variance_ssse3.c
+++ b/vp8/common/x86/variance_ssse3.c
@@ -9,7 +9,7 @@
*/
#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
+#include "vp8/common/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
@@ -112,7 +112,7 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
}
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_variance16x8_ssse3
@@ -161,5 +161,5 @@ unsigned int vp8_sub_pixel_variance16x8_ssse3
}
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 7));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
}
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 5b913ae6b..4ade1c09e 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -138,30 +138,6 @@ static void read_mvcontexts(vp8_reader *bc, MV_CONTEXT *mvc)
while (++i < 2);
}
-static int_mv sub_mv_ref(vp8_reader *bc, const vp8_prob *p, int_mv abovemv,
- int_mv leftmv, int_mv best_mv, const MV_CONTEXT * mvc)
-{
- int_mv blockmv;
- blockmv.as_int = 0;
- if( vp8_read(bc, p[0]) )
- {
- if( vp8_read(bc, p[1]) )
- {
- if( vp8_read(bc, p[2]) )
- {
- read_mv(bc, &blockmv.as_mv, (const MV_CONTEXT *) mvc);
- blockmv.as_mv.row += best_mv.as_mv.row;
- blockmv.as_mv.col += best_mv.as_mv.col;
- }
- return blockmv;
- }
- else
- return abovemv;
- }
- else
- return leftmv;
-}
-
static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
static const unsigned char mbsplit_fill_offset[4][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
@@ -171,8 +147,6 @@ static const unsigned char mbsplit_fill_offset[4][16] = {
};
-
-
static void mb_mode_mv_init(VP8D_COMP *pbi)
{
vp8_reader *const bc = & pbi->bc;
@@ -235,11 +209,11 @@ const vp8_prob vp8_sub_mv_ref_prob3 [8][VP8_SUBMVREFS-1] =
};
static
-const vp8_prob * get_sub_mv_ref_prob(const int_mv *l, const int_mv *a)
+const vp8_prob * get_sub_mv_ref_prob(const int left, const int above)
{
- int lez = (l->as_int == 0);
- int aez = (a->as_int == 0);
- int lea = (l->as_int == a->as_int);
+ int lez = (left == 0);
+ int aez = (above == 0);
+ int lea = (left == above);
const vp8_prob * prob;
prob = vp8_sub_mv_ref_prob3[(aez << 2) |
@@ -250,7 +224,8 @@ const vp8_prob * get_sub_mv_ref_prob(const int_mv *l, const int_mv *a)
}
static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi,
- MB_MODE_INFO *mbmi, int mis, int_mv best_mv,
+ const MODE_INFO *left_mb, const MODE_INFO *above_mb,
+ MB_MODE_INFO *mbmi, int_mv best_mv,
MV_CONTEXT *const mvc, int mb_to_left_edge,
int mb_to_right_edge, int mb_to_top_edge,
int mb_to_bottom_edge)
@@ -273,7 +248,6 @@ static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi,
}
}
- mbmi->need_to_clamp_mvs = 0;
do /* for each subset j */
{
int_mv leftmv, abovemv;
@@ -283,18 +257,60 @@ static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi,
const vp8_prob *prob;
k = vp8_mbsplit_offset[s][j];
- leftmv.as_int = left_block_mv(mi, k);
- abovemv.as_int = above_block_mv(mi, k, mis);
+ if (!(k & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ if(left_mb->mbmi.mode != SPLITMV)
+ leftmv.as_int = left_mb->mbmi.mv.as_int;
+ else
+ leftmv.as_int = (left_mb->bmi + k + 4 - 1)->mv.as_int;
+ }
+ else
+ leftmv.as_int = (mi->bmi + k - 1)->mv.as_int;
- prob = get_sub_mv_ref_prob(&leftmv, &abovemv);
+ if (!(k >> 2))
+ {
+ /* On top edge, get from MB above us */
+ if(above_mb->mbmi.mode != SPLITMV)
+ abovemv.as_int = above_mb->mbmi.mv.as_int;
+ else
+ abovemv.as_int = (above_mb->bmi + k + 16 - 4)->mv.as_int;
+ }
+ else
+ abovemv.as_int = (mi->bmi + k - 4)->mv.as_int;
- blockmv = sub_mv_ref(bc, prob, abovemv, leftmv, best_mv, mvc);
+ prob = get_sub_mv_ref_prob(leftmv.as_int, abovemv.as_int);
- mbmi->need_to_clamp_mvs |= vp8_check_mv_bounds(&blockmv,
- mb_to_left_edge,
- mb_to_right_edge,
- mb_to_top_edge,
- mb_to_bottom_edge);
+ if( vp8_read(bc, prob[0]) )
+ {
+ if( vp8_read(bc, prob[1]) )
+ {
+ blockmv.as_int = 0;
+ if( vp8_read(bc, prob[2]) )
+ {
+ blockmv.as_mv.row = read_mvcomponent(bc, &mvc[0]) << 1;
+ blockmv.as_mv.row += best_mv.as_mv.row;
+ blockmv.as_mv.col = read_mvcomponent(bc, &mvc[1]) << 1;
+ blockmv.as_mv.col += best_mv.as_mv.col;
+
+ mbmi->need_to_clamp_mvs |= vp8_check_mv_bounds(&blockmv,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ }
+ }
+ else
+ {
+ blockmv.as_int = abovemv.as_int;
+ mbmi->need_to_clamp_mvs |= above_mb->mbmi.need_to_clamp_mvs;
+ }
+ }
+ else
+ {
+ blockmv.as_int = leftmv.as_int;
+ mbmi->need_to_clamp_mvs |= left_mb->mbmi.need_to_clamp_mvs;
+ }
{
/* Fill (uniform) modes, mvs of jth subset.
@@ -318,15 +334,13 @@ static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi,
mbmi->partitioning = s;
}
-static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
- int mb_row, int mb_col)
+static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi)
{
vp8_reader *const bc = & pbi->bc;
mbmi->ref_frame = (MV_REFERENCE_FRAME) vp8_read(bc, pbi->prob_intra);
if (mbmi->ref_frame) /* inter MB */
{
enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
- vp8_prob mv_ref_p [VP8_MVREFS-1];
int cnt[4];
int *cntx = cnt;
int_mv near_mvs[4];
@@ -335,9 +349,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
const MODE_INFO *above = mi - mis;
const MODE_INFO *left = mi - 1;
const MODE_INFO *aboveleft = above - 1;
- MV_CONTEXT *const mvc = pbi->common.fc.mvc;
int *ref_frame_sign_bias = pbi->common.ref_frame_sign_bias;
- int propogate_mv_for_ec = 0;
mbmi->need_to_clamp_mvs = 0;
@@ -411,36 +423,13 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
cnt[CNT_INTRA] += 1;
}
- mv_ref_p[0] = vp8_mode_contexts [cnt[CNT_INTRA]] [0];
-
- if( vp8_read(bc, mv_ref_p[0]) )
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_INTRA]] [0]) )
{
- int mb_to_left_edge;
- int mb_to_right_edge;
-
- /* Distance of Mb to the various image edges.
- * These specified to 8th pel as they are always compared to MV
- * values that are in 1/8th pel units
- */
- pbi->mb.mb_to_left_edge =
- mb_to_left_edge = -((mb_col * 16) << 3);
- mb_to_left_edge -= LEFT_TOP_MARGIN;
-
- pbi->mb.mb_to_right_edge =
- mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
- mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
/* If we have three distinct MV's ... */
- if (cnt[CNT_SPLITMV])
- {
- /* See if above-left MV can be merged with NEAREST */
- if (nmv->as_int == near_mvs[CNT_NEAREST].as_int)
- cnt[CNT_NEAREST] += 1;
- }
-
- cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
- + (left->mbmi.mode == SPLITMV)) * 2
- + (aboveleft->mbmi.mode == SPLITMV);
+ /* See if above-left MV can be merged with NEAREST */
+ cnt[CNT_NEAREST] += ( (cnt[CNT_SPLITMV] > 0) &
+ (nmv->as_int == near_mvs[CNT_NEAREST].as_int));
/* Swap near and nearest if necessary */
if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
@@ -454,35 +443,42 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
near_mvs[CNT_NEAR].as_int = tmp;
}
- mv_ref_p[1] = vp8_mode_contexts [cnt[CNT_NEAREST]] [1];
-
- if( vp8_read(bc, mv_ref_p[1]) )
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAREST]] [1]) )
{
- mv_ref_p[2] = vp8_mode_contexts [cnt[CNT_NEAR]] [2];
- if( vp8_read(bc, mv_ref_p[2]) )
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAR]] [2]) )
{
int mb_to_top_edge;
int mb_to_bottom_edge;
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+ int near_index;
mb_to_top_edge = pbi->mb.mb_to_top_edge;
mb_to_bottom_edge = pbi->mb.mb_to_bottom_edge;
mb_to_top_edge -= LEFT_TOP_MARGIN;
mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+ mb_to_right_edge = pbi->mb.mb_to_right_edge;
+ mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+ mb_to_left_edge = pbi->mb.mb_to_left_edge;
+ mb_to_left_edge -= LEFT_TOP_MARGIN;
/* Use near_mvs[0] to store the "best" MV */
- if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
- near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+ near_index = CNT_INTRA +
+ (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]);
- mv_ref_p[3] = vp8_mode_contexts [cnt[CNT_SPLITMV]] [3];
+ vp8_clamp_mv2(&near_mvs[near_index], &pbi->mb);
- vp8_clamp_mv2(&near_mvs[CNT_INTRA], &pbi->mb);
+ cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ + (left->mbmi.mode == SPLITMV)) * 2
+ + (aboveleft->mbmi.mode == SPLITMV);
- if( vp8_read(bc, mv_ref_p[3]) )
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_SPLITMV]] [3]) )
{
- decode_split_mv(bc, mi,
- mbmi, mis,
- near_mvs[CNT_INTRA],
+ decode_split_mv(bc, mi, left, above,
+ mbmi,
+ near_mvs[near_index],
mvc, mb_to_left_edge,
mb_to_right_edge,
mb_to_top_edge,
@@ -494,8 +490,8 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
{
int_mv *const mbmi_mv = & mbmi->mv;
read_mv(bc, &mbmi_mv->as_mv, (const MV_CONTEXT *) mvc);
- mbmi_mv->as_mv.row += near_mvs[CNT_INTRA].as_mv.row;
- mbmi_mv->as_mv.col += near_mvs[CNT_INTRA].as_mv.col;
+ mbmi_mv->as_mv.row += near_mvs[near_index].as_mv.row;
+ mbmi_mv->as_mv.col += near_mvs[near_index].as_mv.col;
/* Don't need to check this on NEARMV and NEARESTMV
* modes since those modes clamp the MV. The NEWMV mode
@@ -508,7 +504,6 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mb_to_top_edge,
mb_to_bottom_edge);
mbmi->mode = NEWMV;
- propogate_mv_for_ec = 1;
}
}
else
@@ -516,7 +511,6 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mbmi->mode = NEARMV;
vp8_clamp_mv2(&near_mvs[CNT_NEAR], &pbi->mb);
mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
- propogate_mv_for_ec = 1;
}
}
else
@@ -524,19 +518,16 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mbmi->mode = NEARESTMV;
vp8_clamp_mv2(&near_mvs[CNT_NEAREST], &pbi->mb);
mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int;
- propogate_mv_for_ec = 1;
}
}
- else {
+ else
+ {
mbmi->mode = ZEROMV;
mbmi->mv.as_int = 0;
- propogate_mv_for_ec = 1;
}
- mbmi->uv_mode = DC_PRED;
-
#if CONFIG_ERROR_CONCEALMENT
- if(pbi->ec_enabled && propogate_mv_for_ec)
+ if(pbi->ec_enabled && (mbmi->mode != SPLITMV))
{
mi->bmi[ 0].mv.as_int =
mi->bmi[ 1].mv.as_int =
@@ -594,7 +585,7 @@ static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
}
static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
- MB_MODE_INFO *mbmi, int mb_row, int mb_col)
+ MB_MODE_INFO *mbmi)
{
/* Read the Macroblock segmentation map if it is being updated explicitly
* this frame (reset to 0 above by default)
@@ -615,7 +606,7 @@ static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
if(pbi->common.frame_type == KEY_FRAME)
read_kf_modes(pbi, mi);
else
- read_mb_modes_mv(pbi, mi, &mi->mbmi, mb_row, mb_col);
+ read_mb_modes_mv(pbi, mi, &mi->mbmi);
}
@@ -623,16 +614,20 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
{
MODE_INFO *mi = pbi->common.mi;
int mb_row = -1;
+ int mb_to_right_edge_start;
mb_mode_mv_init(pbi);
+ pbi->mb.mb_to_top_edge = 0;
+ pbi->mb.mb_to_bottom_edge = ((pbi->common.mb_rows - 1) * 16) << 3;
+ mb_to_right_edge_start = ((pbi->common.mb_cols - 1) * 16) << 3;
+
while (++mb_row < pbi->common.mb_rows)
{
int mb_col = -1;
- pbi->mb.mb_to_top_edge = -((mb_row * 16)) << 3;
- pbi->mb.mb_to_bottom_edge =
- ((pbi->common.mb_rows - 1 - mb_row) * 16) << 3;
+ pbi->mb.mb_to_left_edge = 0;
+ pbi->mb.mb_to_right_edge = mb_to_right_edge_start;
while (++mb_col < pbi->common.mb_cols)
{
@@ -640,7 +635,7 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
int mb_num = mb_row * pbi->common.mb_cols + mb_col;
#endif
- decode_mb_mode_mvs(pbi, mi, &mi->mbmi, mb_row, mb_col);
+ decode_mb_mode_mvs(pbi, mi, &mi->mbmi);
#if CONFIG_ERROR_CONCEALMENT
/* look for corruption. set mvs_corrupt_from_mb to the current
@@ -655,8 +650,12 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
}
#endif
+ pbi->mb.mb_to_left_edge -= (16 << 3);
+ pbi->mb.mb_to_right_edge -= (16 << 3);
mi++; /* next macroblock */
}
+ pbi->mb.mb_to_top_edge -= (16 << 3);
+ pbi->mb.mb_to_bottom_edge -= (16 << 3);
mi++; /* skip left predictor each row */
}
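With sub_mv_ref() folded into decode_split_mv() and the per-macroblock edge recomputation dropped from read_mb_modes_mv(), vp8_decode_mode_mvs() now carries the four macroblock-to-edge distances as running values in 1/8-pel units, stepping each one by a 16-pixel macroblock per iteration instead of rederiving it from mb_row/mb_col. A standalone sketch of that bookkeeping (names, main() and the asserts are illustrative, not from the tree; the LEFT_TOP/RIGHT_BOTTOM margins are still applied locally at the point of use and are omitted here):

#include <assert.h>

int main(void)
{
    const int mb_rows = 4, mb_cols = 6;
    int to_top = 0;
    int to_bottom = ((mb_rows - 1) * 16) << 3;
    int r, c;

    for (r = 0; r < mb_rows; r++)
    {
        int to_left = 0;
        int to_right = ((mb_cols - 1) * 16) << 3;

        for (c = 0; c < mb_cols; c++)
        {
            /* running values equal the per-MB formulas the old code used */
            assert(to_left   == -((c * 16) << 3));
            assert(to_right  == ((mb_cols - 1 - c) * 16) << 3);
            assert(to_top    == -((r * 16) << 3));
            assert(to_bottom == ((mb_rows - 1 - r) * 16) << 3);

            to_left  -= (16 << 3);
            to_right -= (16 << 3);
        }
        to_top    -= (16 << 3);
        to_bottom -= (16 << 3);
    }
    return 0;
}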
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index ad86ea5d3..08a0c4b98 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -93,13 +93,14 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
}
}
-
static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
unsigned int mb_idx)
{
MB_PREDICTION_MODE mode;
int i;
+#if CONFIG_ERROR_CONCEALMENT
int corruption_detected = 0;
+#endif
if (xd->mode_info_context->mbmi.mb_skip_coeff)
{
@@ -152,15 +153,24 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
}
#endif
-
/* do prediction */
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
- vp8_build_intra_predictors_mbuv_s(xd);
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->recon_above[1],
+ xd->recon_above[2],
+ xd->recon_left[1],
+ xd->recon_left[2],
+ xd->recon_left_stride[1],
+ xd->dst.u_buffer, xd->dst.v_buffer);
if (mode != B_PRED)
{
- vp8_build_intra_predictors_mby_s(xd);
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->recon_above[0],
+ xd->recon_left[0],
+ xd->recon_left_stride[0],
+ xd->dst.y_buffer);
}
else
{
@@ -172,16 +182,28 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
if(xd->mode_info_context->mbmi.mb_skip_coeff)
vpx_memset(xd->eobs, 0, 25);
- vp8_intra_prediction_down_copy(xd);
+ intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
for (i = 0; i < 16; i++)
{
BLOCKD *b = &xd->block[i];
int b_mode = xd->mode_info_context->bmi[i].as_mode;
-
-
- vp8_intra4x4_predict (base_dst + b->offset, dst_stride, b_mode,
- base_dst + b->offset, dst_stride );
+ unsigned char *yabove;
+ unsigned char *yleft;
+ int left_stride;
+ unsigned char top_left;
+
+ yabove = base_dst + b->offset - dst_stride;
+ yleft = base_dst + b->offset - 1;
+ left_stride = dst_stride;
+ top_left = yabove[-1];
+
+ /* was: vp8_intra4x4_predict(base_dst + b->offset, dst_stride, b_mode,
+ * base_dst + b->offset, dst_stride); */
+ vp8_intra4x4_predict_d_c(yabove, yleft, left_stride,
+ b_mode,
+ base_dst + b->offset, dst_stride,
+ top_left);
if (xd->eobs[i])
{
@@ -294,111 +316,170 @@ static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
FILE *vpxlog = 0;
#endif
+static void decode_mb_rows(VP8D_COMP *pbi)
+{
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+ int ibc = 0;
+ int num_part = 1 << pc->multi_token_partition;
-static void
-decode_mb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mb_row, MACROBLOCKD *xd)
-{
int recon_yoffset, recon_uvoffset;
- int mb_col;
- int ref_fb_idx = pc->lst_fb_idx;
+ int mb_row, mb_col;
+ int mb_idx = 0;
int dst_fb_idx = pc->new_fb_idx;
- int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
- int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
+ int recon_y_stride = pc->yv12_fb[dst_fb_idx].y_stride;
+ int recon_uv_stride = pc->yv12_fb[dst_fb_idx].uv_stride;
- vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
- recon_yoffset = mb_row * recon_y_stride * 16;
- recon_uvoffset = mb_row * recon_uv_stride * 8;
- /* reset above block coeffs */
+ unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+ unsigned char *dst_buffer[3];
+ int i;
+ int ref_fb_index[MAX_REF_FRAMES];
+ int ref_fb_corrupted[MAX_REF_FRAMES];
- xd->above_context = pc->above_context;
- xd->up_available = (mb_row != 0);
+ ref_fb_corrupted[INTRA_FRAME] = 0;
- xd->mb_to_top_edge = -((mb_row * 16)) << 3;
- xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+ ref_fb_index[LAST_FRAME] = pc->lst_fb_idx;
+ ref_fb_index[GOLDEN_FRAME] = pc->gld_fb_idx;
+ ref_fb_index[ALTREF_FRAME] = pc->alt_fb_idx;
- for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+ for(i = 1; i < MAX_REF_FRAMES; i++)
{
- /* Distance of Mb to the various image edges.
- * These are specified to 8th pel as they are always compared to values
- * that are in 1/8th pel units
- */
- xd->mb_to_left_edge = -((mb_col * 16) << 3);
- xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+ ref_buffer[i][0] = pc->yv12_fb[ref_fb_index[i]].y_buffer;
+ ref_buffer[i][1] = pc->yv12_fb[ref_fb_index[i]].u_buffer;
+ ref_buffer[i][2] = pc->yv12_fb[ref_fb_index[i]].v_buffer;
-#if CONFIG_ERROR_CONCEALMENT
+ ref_fb_corrupted[i] = pc->yv12_fb[ref_fb_index[i]].corrupted;
+ }
+
+ dst_buffer[0] = pc->yv12_fb[dst_fb_idx].y_buffer;
+ dst_buffer[1] = pc->yv12_fb[dst_fb_idx].u_buffer;
+ dst_buffer[2] = pc->yv12_fb[dst_fb_idx].v_buffer;
+
+ xd->up_available = 0;
+
+ /* Decode the individual macro block */
+ for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
+ {
+ if (num_part > 1)
{
- int corrupt_residual = (!pbi->independent_partitions &&
- pbi->frame_corrupt_residual) ||
- vp8dx_bool_error(xd->current_bc);
- if (pbi->ec_active &&
- xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
- corrupt_residual)
- {
- /* We have an intra block with corrupt coefficients, better to
- * conceal with an inter block. Interpolate MVs from neighboring
- * MBs.
- *
- * Note that for the first mb with corrupt residual in a frame,
- * we might not discover that before decoding the residual. That
- * happens after this check, and therefore no inter concealment
- * will be done.
- */
- vp8_interpolate_motion(xd,
- mb_row, mb_col,
- pc->mb_rows, pc->mb_cols,
- pc->mode_info_stride);
- }
+ xd->current_bc = & pbi->mbc[ibc];
+ ibc++;
+
+ if (ibc == num_part)
+ ibc = 0;
}
-#endif
- xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
- xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
- xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+ recon_yoffset = mb_row * recon_y_stride * 16;
+ recon_uvoffset = mb_row * recon_uv_stride * 8;
- xd->left_available = (mb_col != 0);
+ /* reset contexts */
+ xd->above_context = pc->above_context;
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
- /* Select the appropriate reference frame for this MB */
- if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
- ref_fb_idx = pc->lst_fb_idx;
- else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
- ref_fb_idx = pc->gld_fb_idx;
- else
- ref_fb_idx = pc->alt_fb_idx;
+ xd->left_available = 0;
+
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+ xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+ xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+ xd->recon_left[0] = xd->recon_above[0] - 1;
+ xd->recon_left[1] = xd->recon_above[1] - 1;
+ xd->recon_left[2] = xd->recon_above[2] - 1;
- xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
- xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
- xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+ xd->recon_above[0] -= xd->dst.y_stride;
+ xd->recon_above[1] -= xd->dst.uv_stride;
+ xd->recon_above[2] -= xd->dst.uv_stride;
- if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = xd->dst.y_stride;
+ xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+ for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
{
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to values
+ * that are in 1/8th pel units
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+#if CONFIG_ERROR_CONCEALMENT
+ {
+ int corrupt_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual) ||
+ vp8dx_bool_error(xd->current_bc);
+ if (pbi->ec_active &&
+ xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+ corrupt_residual)
+ {
+ /* We have an intra block with corrupt coefficients, better to
+ * conceal with an inter block. Interpolate MVs from neighboring
+ * MBs.
+ *
+ * Note that for the first mb with corrupt residual in a frame,
+ * we might not discover that before decoding the residual. That
+ * happens after this check, and therefore no inter concealment
+ * will be done.
+ */
+ vp8_interpolate_motion(xd,
+ mb_row, mb_col,
+ pc->mb_rows, pc->mb_cols,
+ pc->mode_info_stride);
+ }
+ }
+#endif
+
+ xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+ xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+ xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+ xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
+ xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
+ xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
+
/* propagate errors from reference frames */
- xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
- }
+ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
- decode_macroblock(pbi, xd, mb_row * pc->mb_cols + mb_col);
+ decode_macroblock(pbi, xd, mb_idx);
- /* check if the boolean decoder has suffered an error */
- xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+ mb_idx++;
+ xd->left_available = 1;
- recon_yoffset += 16;
- recon_uvoffset += 8;
+ /* check if the boolean decoder has suffered an error */
+ xd->corrupted |= vp8dx_bool_error(xd->current_bc);
- ++xd->mode_info_context; /* next mb */
+ xd->recon_above[0] += 16;
+ xd->recon_above[1] += 8;
+ xd->recon_above[2] += 8;
+ xd->recon_left[0] += 16;
+ xd->recon_left[1] += 8;
+ xd->recon_left[2] += 8;
- xd->above_context++;
- }
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
- /* adjust to the next row of mbs */
- vp8_extend_mb_row(
- &pc->yv12_fb[dst_fb_idx],
- xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
- );
+ ++xd->mode_info_context; /* next mb */
- ++xd->mode_info_context; /* skip prediction column */
-}
+ xd->above_context++;
+
+ }
+
+ /* adjust to the next row of mbs */
+ vp8_extend_mb_row(
+ &pc->yv12_fb[dst_fb_idx],
+ xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
+ );
+ ++xd->mode_info_context; /* skip prediction column */
+ xd->up_available = 1;
+
+ }
+}
static unsigned int read_partition_size(const unsigned char *cx_size)
{
@@ -650,7 +731,6 @@ int vp8_decode_frame(VP8D_COMP *pbi)
const unsigned char *data_end = data + pbi->fragment_sizes[0];
ptrdiff_t first_partition_length_in_bytes;
- int mb_row;
int i, j, k, l;
const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
int corrupt_tokens = 0;
@@ -827,6 +907,12 @@ int vp8_decode_frame(VP8D_COMP *pbi)
}
}
}
+ else
+ {
+ /* No segmentation updates on this frame */
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+ }
/* Read the loop filter level and type */
pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
@@ -1040,12 +1126,12 @@ int vp8_decode_frame(VP8D_COMP *pbi)
#endif
vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+ pbi->frame_corrupt_residual = 0;
#if CONFIG_MULTITHREAD
if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
{
int i;
- pbi->frame_corrupt_residual = 0;
vp8mt_decode_mb_rows(pbi, xd);
vp8_yv12_extend_frame_borders_ptr(&pc->yv12_fb[pc->new_fb_idx]); /*cm->frame_to_show);*/
for (i = 0; i < pbi->decoding_thread_count; ++i)
@@ -1054,25 +1140,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
else
#endif
{
- int ibc = 0;
- int num_part = 1 << pc->multi_token_partition;
- pbi->frame_corrupt_residual = 0;
-
- /* Decode the individual macro block */
- for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
- {
-
- if (num_part > 1)
- {
- xd->current_bc = & pbi->mbc[ibc];
- ibc++;
-
- if (ibc == num_part)
- ibc = 0;
- }
-
- decode_mb_row(pbi, pc, mb_row, xd);
- }
+ decode_mb_rows(pbi);
corrupt_tokens |= xd->corrupted;
}
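The merged decode_mb_rows() above also replaces the per-macroblock if/else on mbmi.ref_frame with reference-buffer pointers set up once per frame and indexed directly, and it advances the recon_above[]/recon_left[] intra-prediction pointers incrementally as it walks the row. A simplified sketch of the table setup; the enum and struct here are stand-ins for the real reference-frame and frame-buffer types, not the library's definitions:

enum ref_frame { INTRA_FRAME = 0, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME, MAX_REF_FRAMES };

struct frame_planes { unsigned char *y, *u, *v; int corrupted; };

static void setup_ref_tables(const struct frame_planes *fb,      /* pool of frame buffers */
                             const int fb_index[MAX_REF_FRAMES], /* lst/gld/alt slots */
                             unsigned char *ref_buffer[MAX_REF_FRAMES][3],
                             int ref_corrupted[MAX_REF_FRAMES])
{
    int i;

    ref_corrupted[INTRA_FRAME] = 0;   /* intra macroblocks read no reference */

    for (i = LAST_FRAME; i < MAX_REF_FRAMES; i++)
    {
        ref_buffer[i][0] = fb[fb_index[i]].y;
        ref_buffer[i][1] = fb[fb_index[i]].u;
        ref_buffer[i][2] = fb[fb_index[i]].v;
        ref_corrupted[i] = fb[fb_index[i]].corrupted;
    }

    /* per macroblock the decoder can then do, e.g.:
     *   pre.y_buffer = ref_buffer[mbmi.ref_frame][0] + recon_yoffset;
     */
}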
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index 989f68bf8..97cf0dcea 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -32,8 +32,6 @@ typedef struct
{
MACROBLOCKD mbd;
int mb_row;
- int current_mb_col;
- short *coef_ptr;
} MB_ROW_DEC;
typedef struct
diff --git a/vp8/decoder/reconintra_mt.c b/vp8/decoder/reconintra_mt.c
deleted file mode 100644
index a8fedf48b..000000000
--- a/vp8/decoder/reconintra_mt.c
+++ /dev/null
@@ -1,943 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx_rtcd.h"
-#include "vpx_mem/vpx_mem.h"
-#include "onyxd_int.h"
-
-/* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
- * vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
- */
-
-void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
-{
- unsigned char *yabove_row; /* = x->dst.y_buffer - x->dst.y_stride; */
- unsigned char *yleft_col;
- unsigned char yleft_buf[16];
- unsigned char ytop_left; /* = yabove_row[-1]; */
- unsigned char *ypred_ptr = x->predictor;
- int r, c, i;
-
- if (pbi->common.filter_level)
- {
- yabove_row = pbi->mt_yabove_row[mb_row] + mb_col*16 +32;
- yleft_col = pbi->mt_yleft_col[mb_row];
- } else
- {
- yabove_row = x->dst.y_buffer - x->dst.y_stride;
-
- for (i = 0; i < 16; i++)
- yleft_buf[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
- yleft_col = yleft_buf;
- }
-
- ytop_left = yabove_row[-1];
-
- /* for Y */
- switch (x->mode_info_context->mbmi.mode)
- {
- case DC_PRED:
- {
- int expected_dc;
- int i;
- int shift;
- int average = 0;
-
-
- if (x->up_available || x->left_available)
- {
- if (x->up_available)
- {
- for (i = 0; i < 16; i++)
- {
- average += yabove_row[i];
- }
- }
-
- if (x->left_available)
- {
-
- for (i = 0; i < 16; i++)
- {
- average += yleft_col[i];
- }
-
- }
-
-
-
- shift = 3 + x->up_available + x->left_available;
- expected_dc = (average + (1 << (shift - 1))) >> shift;
- }
- else
- {
- expected_dc = 128;
- }
-
- vpx_memset(ypred_ptr, expected_dc, 256);
- }
- break;
- case V_PRED:
- {
-
- for (r = 0; r < 16; r++)
- {
-
- ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
- ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
- ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
- ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
- ypred_ptr += 16;
- }
- }
- break;
- case H_PRED:
- {
-
- for (r = 0; r < 16; r++)
- {
-
- vpx_memset(ypred_ptr, yleft_col[r], 16);
- ypred_ptr += 16;
- }
-
- }
- break;
- case TM_PRED:
- {
-
- for (r = 0; r < 16; r++)
- {
- for (c = 0; c < 16; c++)
- {
- int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
-
- if (pred < 0)
- pred = 0;
-
- if (pred > 255)
- pred = 255;
-
- ypred_ptr[c] = pred;
- }
-
- ypred_ptr += 16;
- }
-
- }
- break;
- case B_PRED:
- case NEARESTMV:
- case NEARMV:
- case ZEROMV:
- case NEWMV:
- case SPLITMV:
- case MB_MODE_COUNT:
- break;
- }
-}
-
-void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
-{
- unsigned char *yabove_row; /* = x->dst.y_buffer - x->dst.y_stride; */
- unsigned char *yleft_col;
- unsigned char yleft_buf[16];
- unsigned char ytop_left; /* = yabove_row[-1]; */
- unsigned char *ypred_ptr = x->predictor;
- int r, c, i;
-
- int y_stride = x->dst.y_stride;
- ypred_ptr = x->dst.y_buffer; /*x->predictor;*/
-
- if (pbi->common.filter_level)
- {
- yabove_row = pbi->mt_yabove_row[mb_row] + mb_col*16 +32;
- yleft_col = pbi->mt_yleft_col[mb_row];
- } else
- {
- yabove_row = x->dst.y_buffer - x->dst.y_stride;
-
- for (i = 0; i < 16; i++)
- yleft_buf[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
- yleft_col = yleft_buf;
- }
-
- ytop_left = yabove_row[-1];
-
- /* for Y */
- switch (x->mode_info_context->mbmi.mode)
- {
- case DC_PRED:
- {
- int expected_dc;
- int i;
- int shift;
- int average = 0;
-
-
- if (x->up_available || x->left_available)
- {
- if (x->up_available)
- {
- for (i = 0; i < 16; i++)
- {
- average += yabove_row[i];
- }
- }
-
- if (x->left_available)
- {
-
- for (i = 0; i < 16; i++)
- {
- average += yleft_col[i];
- }
-
- }
-
-
-
- shift = 3 + x->up_available + x->left_available;
- expected_dc = (average + (1 << (shift - 1))) >> shift;
- }
- else
- {
- expected_dc = 128;
- }
-
- /*vpx_memset(ypred_ptr, expected_dc, 256);*/
- for (r = 0; r < 16; r++)
- {
- vpx_memset(ypred_ptr, expected_dc, 16);
- ypred_ptr += y_stride; /*16;*/
- }
- }
- break;
- case V_PRED:
- {
-
- for (r = 0; r < 16; r++)
- {
-
- ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
- ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
- ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
- ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
- ypred_ptr += y_stride; /*16;*/
- }
- }
- break;
- case H_PRED:
- {
-
- for (r = 0; r < 16; r++)
- {
-
- vpx_memset(ypred_ptr, yleft_col[r], 16);
- ypred_ptr += y_stride; /*16;*/
- }
-
- }
- break;
- case TM_PRED:
- {
-
- for (r = 0; r < 16; r++)
- {
- for (c = 0; c < 16; c++)
- {
- int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
-
- if (pred < 0)
- pred = 0;
-
- if (pred > 255)
- pred = 255;
-
- ypred_ptr[c] = pred;
- }
-
- ypred_ptr += y_stride; /*16;*/
- }
-
- }
- break;
- case B_PRED:
- case NEARESTMV:
- case NEARMV:
- case ZEROMV:
- case NEWMV:
- case SPLITMV:
- case MB_MODE_COUNT:
- break;
- }
-}
-
-void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
-{
- unsigned char *uabove_row; /* = x->dst.u_buffer - x->dst.uv_stride; */
- unsigned char *uleft_col; /*[16];*/
- unsigned char uleft_buf[8];
- unsigned char utop_left; /* = uabove_row[-1]; */
- unsigned char *vabove_row; /* = x->dst.v_buffer - x->dst.uv_stride; */
- unsigned char *vleft_col; /*[20];*/
- unsigned char vleft_buf[8];
- unsigned char vtop_left; /* = vabove_row[-1]; */
- unsigned char *upred_ptr = &x->predictor[256];
- unsigned char *vpred_ptr = &x->predictor[320];
- int i, j;
-
- if (pbi->common.filter_level)
- {
- uabove_row = pbi->mt_uabove_row[mb_row] + mb_col*8 +16;
- vabove_row = pbi->mt_vabove_row[mb_row] + mb_col*8 +16;
- uleft_col = pbi->mt_uleft_col[mb_row];
- vleft_col = pbi->mt_vleft_col[mb_row];
- } else
- {
- uabove_row = x->dst.u_buffer - x->dst.uv_stride;
- vabove_row = x->dst.v_buffer - x->dst.uv_stride;
-
- for (i = 0; i < 8; i++)
- {
- uleft_buf[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
- vleft_buf[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
- }
- uleft_col = uleft_buf;
- vleft_col = vleft_buf;
- }
- utop_left = uabove_row[-1];
- vtop_left = vabove_row[-1];
-
- switch (x->mode_info_context->mbmi.uv_mode)
- {
- case DC_PRED:
- {
- int expected_udc;
- int expected_vdc;
- int i;
- int shift;
- int Uaverage = 0;
- int Vaverage = 0;
-
- if (x->up_available)
- {
- for (i = 0; i < 8; i++)
- {
- Uaverage += uabove_row[i];
- Vaverage += vabove_row[i];
- }
- }
-
- if (x->left_available)
- {
- for (i = 0; i < 8; i++)
- {
- Uaverage += uleft_col[i];
- Vaverage += vleft_col[i];
- }
- }
-
- if (!x->up_available && !x->left_available)
- {
- expected_udc = 128;
- expected_vdc = 128;
- }
- else
- {
- shift = 2 + x->up_available + x->left_available;
- expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
- expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
- }
-
-
- vpx_memset(upred_ptr, expected_udc, 64);
- vpx_memset(vpred_ptr, expected_vdc, 64);
-
-
- }
- break;
- case V_PRED:
- {
- int i;
-
- for (i = 0; i < 8; i++)
- {
- vpx_memcpy(upred_ptr, uabove_row, 8);
- vpx_memcpy(vpred_ptr, vabove_row, 8);
- upred_ptr += 8;
- vpred_ptr += 8;
- }
-
- }
- break;
- case H_PRED:
- {
- int i;
-
- for (i = 0; i < 8; i++)
- {
- vpx_memset(upred_ptr, uleft_col[i], 8);
- vpx_memset(vpred_ptr, vleft_col[i], 8);
- upred_ptr += 8;
- vpred_ptr += 8;
- }
- }
-
- break;
- case TM_PRED:
- {
- int i;
-
- for (i = 0; i < 8; i++)
- {
- for (j = 0; j < 8; j++)
- {
- int predu = uleft_col[i] + uabove_row[j] - utop_left;
- int predv = vleft_col[i] + vabove_row[j] - vtop_left;
-
- if (predu < 0)
- predu = 0;
-
- if (predu > 255)
- predu = 255;
-
- if (predv < 0)
- predv = 0;
-
- if (predv > 255)
- predv = 255;
-
- upred_ptr[j] = predu;
- vpred_ptr[j] = predv;
- }
-
- upred_ptr += 8;
- vpred_ptr += 8;
- }
-
- }
- break;
- case B_PRED:
- case NEARESTMV:
- case NEARMV:
- case ZEROMV:
- case NEWMV:
- case SPLITMV:
- case MB_MODE_COUNT:
- break;
- }
-}
-
-void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
-{
- unsigned char *uabove_row; /* = x->dst.u_buffer - x->dst.uv_stride; */
- unsigned char *uleft_col; /*[16];*/
- unsigned char uleft_buf[8];
- unsigned char utop_left; /* = uabove_row[-1]; */
- unsigned char *vabove_row; /* = x->dst.v_buffer - x->dst.uv_stride; */
- unsigned char *vleft_col; /*[20];*/
- unsigned char vleft_buf[8];
- unsigned char vtop_left; /* = vabove_row[-1]; */
- unsigned char *upred_ptr = x->dst.u_buffer; /*&x->predictor[256];*/
- unsigned char *vpred_ptr = x->dst.v_buffer; /*&x->predictor[320];*/
- int uv_stride = x->dst.uv_stride;
- int i, j;
-
- if (pbi->common.filter_level)
- {
- uabove_row = pbi->mt_uabove_row[mb_row] + mb_col*8 +16;
- vabove_row = pbi->mt_vabove_row[mb_row] + mb_col*8 +16;
- uleft_col = pbi->mt_uleft_col[mb_row];
- vleft_col = pbi->mt_vleft_col[mb_row];
- } else
- {
- uabove_row = x->dst.u_buffer - x->dst.uv_stride;
- vabove_row = x->dst.v_buffer - x->dst.uv_stride;
-
- for (i = 0; i < 8; i++)
- {
- uleft_buf[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
- vleft_buf[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
- }
- uleft_col = uleft_buf;
- vleft_col = vleft_buf;
- }
- utop_left = uabove_row[-1];
- vtop_left = vabove_row[-1];
-
- switch (x->mode_info_context->mbmi.uv_mode)
- {
- case DC_PRED:
- {
- int expected_udc;
- int expected_vdc;
- int i;
- int shift;
- int Uaverage = 0;
- int Vaverage = 0;
-
- if (x->up_available)
- {
- for (i = 0; i < 8; i++)
- {
- Uaverage += uabove_row[i];
- Vaverage += vabove_row[i];
- }
- }
-
- if (x->left_available)
- {
- for (i = 0; i < 8; i++)
- {
- Uaverage += uleft_col[i];
- Vaverage += vleft_col[i];
- }
- }
-
- if (!x->up_available && !x->left_available)
- {
- expected_udc = 128;
- expected_vdc = 128;
- }
- else
- {
- shift = 2 + x->up_available + x->left_available;
- expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
- expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
- }
-
-
- /*vpx_memset(upred_ptr,expected_udc,64);
- vpx_memset(vpred_ptr,expected_vdc,64);*/
- for (i = 0; i < 8; i++)
- {
- vpx_memset(upred_ptr, expected_udc, 8);
- vpx_memset(vpred_ptr, expected_vdc, 8);
- upred_ptr += uv_stride; /*8;*/
- vpred_ptr += uv_stride; /*8;*/
- }
- }
- break;
- case V_PRED:
- {
- int i;
-
- for (i = 0; i < 8; i++)
- {
- vpx_memcpy(upred_ptr, uabove_row, 8);
- vpx_memcpy(vpred_ptr, vabove_row, 8);
- upred_ptr += uv_stride; /*8;*/
- vpred_ptr += uv_stride; /*8;*/
- }
-
- }
- break;
- case H_PRED:
- {
- int i;
-
- for (i = 0; i < 8; i++)
- {
- vpx_memset(upred_ptr, uleft_col[i], 8);
- vpx_memset(vpred_ptr, vleft_col[i], 8);
- upred_ptr += uv_stride; /*8;*/
- vpred_ptr += uv_stride; /*8;*/
- }
- }
-
- break;
- case TM_PRED:
- {
- int i;
-
- for (i = 0; i < 8; i++)
- {
- for (j = 0; j < 8; j++)
- {
- int predu = uleft_col[i] + uabove_row[j] - utop_left;
- int predv = vleft_col[i] + vabove_row[j] - vtop_left;
-
- if (predu < 0)
- predu = 0;
-
- if (predu > 255)
- predu = 255;
-
- if (predv < 0)
- predv = 0;
-
- if (predv > 255)
- predv = 255;
-
- upred_ptr[j] = predu;
- vpred_ptr[j] = predv;
- }
-
- upred_ptr += uv_stride; /*8;*/
- vpred_ptr += uv_stride; /*8;*/
- }
-
- }
- break;
- case B_PRED:
- case NEARESTMV:
- case NEARMV:
- case ZEROMV:
- case NEWMV:
- case SPLITMV:
- case MB_MODE_COUNT:
- break;
- }
-}
-
-
-void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
- MACROBLOCKD *xd,
- int b_mode,
- unsigned char *predictor,
- int stride,
- int mb_row,
- int mb_col,
- int num)
-{
- int i, r, c;
-
- unsigned char *Above; /* = *(x->base_dst) + x->dst - x->dst_stride; */
- unsigned char Left[4];
- unsigned char top_left; /* = Above[-1]; */
-
- BLOCKD *x = &xd->block[num];
- int dst_stride = xd->dst.y_stride;
- unsigned char *base_dst = xd->dst.y_buffer;
-
-
- /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 above-right).*/
- if (num < 4 && pbi->common.filter_level)
- Above = pbi->mt_yabove_row[mb_row] + mb_col*16 + num*4 + 32;
- else
- Above = base_dst + x->offset - dst_stride;
-
- if (num%4==0 && pbi->common.filter_level)
- {
- for (i=0; i<4; i++)
- Left[i] = pbi->mt_yleft_col[mb_row][num + i];
- }else
- {
- Left[0] = (base_dst)[x->offset - 1];
- Left[1] = (base_dst)[x->offset - 1 + dst_stride];
- Left[2] = (base_dst)[x->offset - 1 + 2 * dst_stride];
- Left[3] = (base_dst)[x->offset - 1 + 3 * dst_stride];
- }
-
- if ((num==4 || num==8 || num==12) && pbi->common.filter_level)
- top_left = pbi->mt_yleft_col[mb_row][num-1];
- else
- top_left = Above[-1];
-
- switch (b_mode)
- {
- case B_DC_PRED:
- {
- int expected_dc = 0;
-
- for (i = 0; i < 4; i++)
- {
- expected_dc += Above[i];
- expected_dc += Left[i];
- }
-
- expected_dc = (expected_dc + 4) >> 3;
-
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- predictor[c] = expected_dc;
- }
-
- predictor += stride;
- }
- }
- break;
- case B_TM_PRED:
- {
- /* prediction similar to true_motion prediction */
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- int pred = Above[c] - top_left + Left[r];
-
- if (pred < 0)
- pred = 0;
-
- if (pred > 255)
- pred = 255;
-
- predictor[c] = pred;
- }
-
- predictor += stride;
- }
- }
- break;
-
- case B_VE_PRED:
- {
-
- unsigned int ap[4];
- ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2;
- ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
- ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
- ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;
-
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
-
- predictor[c] = ap[c];
- }
-
- predictor += stride;
- }
-
- }
- break;
-
-
- case B_HE_PRED:
- {
-
- unsigned int lp[4];
- lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
- lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
- lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
- lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;
-
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- predictor[c] = lp[r];
- }
-
- predictor += stride;
- }
- }
- break;
- case B_LD_PRED:
- {
- unsigned char *ptr = Above;
- predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
- predictor[0 * stride + 1] =
- predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
- predictor[0 * stride + 2] =
- predictor[1 * stride + 1] =
- predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
- predictor[0 * stride + 3] =
- predictor[1 * stride + 2] =
- predictor[2 * stride + 1] =
- predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
- predictor[1 * stride + 3] =
- predictor[2 * stride + 2] =
- predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
- predictor[2 * stride + 3] =
- predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
- predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
- }
- break;
- case B_RD_PRED:
- {
-
- unsigned char pp[9];
-
- pp[0] = Left[3];
- pp[1] = Left[2];
- pp[2] = Left[1];
- pp[3] = Left[0];
- pp[4] = top_left;
- pp[5] = Above[0];
- pp[6] = Above[1];
- pp[7] = Above[2];
- pp[8] = Above[3];
-
- predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[3 * stride + 1] =
- predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[3 * stride + 2] =
- predictor[2 * stride + 1] =
- predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * stride + 3] =
- predictor[2 * stride + 2] =
- predictor[1 * stride + 1] =
- predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * stride + 3] =
- predictor[1 * stride + 2] =
- predictor[0 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[1 * stride + 3] =
- predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-
- }
- break;
- case B_VR_PRED:
- {
-
- unsigned char pp[9];
-
- pp[0] = Left[3];
- pp[1] = Left[2];
- pp[2] = Left[1];
- pp[3] = Left[0];
- pp[4] = top_left;
- pp[5] = Above[0];
- pp[6] = Above[1];
- pp[7] = Above[2];
- pp[8] = Above[3];
-
-
- predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * stride + 1] =
- predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * stride + 1] =
- predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1;
- predictor[3 * stride + 2] =
- predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[2 * stride + 2] =
- predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1;
- predictor[3 * stride + 3] =
- predictor[1 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[2 * stride + 3] =
- predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1;
- predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
- predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1;
-
- }
- break;
- case B_VL_PRED:
- {
-
- unsigned char *pp = Above;
-
- predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * stride + 0] =
- predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1;
- predictor[1 * stride + 1] =
- predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * stride + 1] =
- predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1;
- predictor[3 * stride + 1] =
- predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[0 * stride + 3] =
- predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * stride + 3] =
- predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- }
- break;
-
- case B_HD_PRED:
- {
- unsigned char pp[9];
- pp[0] = Left[3];
- pp[1] = Left[2];
- pp[2] = Left[1];
- pp[3] = Left[0];
- pp[4] = top_left;
- pp[5] = Above[0];
- pp[6] = Above[1];
- pp[7] = Above[2];
- pp[8] = Above[3];
-
-
- predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * stride + 0] =
- predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1;
- predictor[2 * stride + 1] =
- predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * stride + 2] =
- predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[2 * stride + 3] =
- predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[1 * stride + 2] =
- predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * stride + 3] =
- predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- }
- break;
-
-
- case B_HU_PRED:
- {
- unsigned char *pp = Left;
- predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[0 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[0 * stride + 2] =
- predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1;
- predictor[0 * stride + 3] =
- predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[1 * stride + 2] =
- predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[1 * stride + 3] =
- predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
- predictor[2 * stride + 2] =
- predictor[2 * stride + 3] =
- predictor[3 * stride + 0] =
- predictor[3 * stride + 1] =
- predictor[3 * stride + 2] =
- predictor[3 * stride + 3] = pp[3];
- }
- break;
-
-
- }
-}
-
-/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
- * to the right prediction have filled in pixels to use.
- */
-void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
-{
- unsigned char *above_right; /* = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; */
- unsigned int *src_ptr;
- unsigned int *dst_ptr0;
- unsigned int *dst_ptr1;
- unsigned int *dst_ptr2;
- int dst_stride = x->dst.y_stride;
- unsigned char *base_dst = x->dst.y_buffer;
-
-
- if (pbi->common.filter_level)
- above_right = pbi->mt_yabove_row[mb_row] + mb_col*16 + 32 +16;
- else
- above_right = base_dst + x->block[0].offset - dst_stride + 16;
-
- src_ptr = (unsigned int *)above_right;
- /*dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride);
- dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride);
- dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride);*/
- dst_ptr0 = (unsigned int *)(base_dst + x->block[0].offset + 16 + 3 * dst_stride);
- dst_ptr1 = (unsigned int *)(base_dst + x->block[0].offset + 16 + 7 * dst_stride);
- dst_ptr2 = (unsigned int *)(base_dst + x->block[0].offset + 16 + 11 * dst_stride);
- *dst_ptr0 = *src_ptr;
- *dst_ptr1 = *src_ptr;
- *dst_ptr2 = *src_ptr;
-}
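reconintra_mt.c carried threaded copies of the intra predictors only so that neighbours could be read from the pbi->mt_* above/left row buffers; the threading.c changes below pass those buffers straight into the common predictors, so the copies can be deleted. For reference, the TrueMotion rule those copies repeated, as a standalone 4x4 sketch (illustrative; the in-tree versions cover all block sizes and modes):

static void tm_predict_4x4(unsigned char *dst, int stride,
                           const unsigned char *above,
                           const unsigned char *left,
                           unsigned char top_left)
{
    int r, c;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            /* each pixel is left + above - top_left, clamped to [0, 255] */
            int pred = left[r] + above[c] - top_left;

            if (pred < 0)
                pred = 0;
            if (pred > 255)
                pred = 255;

            dst[c] = (unsigned char)pred;
        }
        dst += stride;
    }
}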
diff --git a/vp8/decoder/reconintra_mt.h b/vp8/decoder/reconintra_mt.h
deleted file mode 100644
index 4576a8064..000000000
--- a/vp8/decoder/reconintra_mt.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RECONINTRA_MT_H
-#define __INC_RECONINTRA_MT_H
-
-/* reconintra functions used in multi-threaded decoder */
-#if CONFIG_MULTITHREAD
-extern void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
-extern void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
-extern void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
-extern void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
-
-extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int stride, int mb_row, int mb_col, int num);
-extern void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
-#endif
-
-#endif
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 16c5d394d..bc4450daf 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -22,8 +22,8 @@
#include "vp8/common/extend.h"
#include "vpx_ports/vpx_timer.h"
#include "detokenize.h"
+#include "vp8/common/reconintra4x4.h"
#include "vp8/common/reconinter.h"
-#include "reconintra_mt.h"
#if CONFIG_ERROR_CONCEALMENT
#include "error_concealment.h"
#endif
@@ -82,12 +82,13 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
pbi->mt_current_mb_col[i]=-1;
}
-
-static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, unsigned int mb_idx)
{
- int eobtotal = 0;
- int throw_residual = 0;
+ MB_PREDICTION_MODE mode;
int i;
+#if CONFIG_ERROR_CONCEALMENT
+ int corruption_detected = 0;
+#endif
if (xd->mode_info_context->mbmi.mb_skip_coeff)
{
@@ -95,46 +96,136 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
}
else if (!vp8dx_bool_error(xd->current_bc))
{
+ int eobtotal;
eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+ /* Special case: Force the loopfilter to skip when eobtotal is zero */
+ xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
}
- eobtotal |= (xd->mode_info_context->mbmi.mode == B_PRED ||
- xd->mode_info_context->mbmi.mode == SPLITMV);
- if (!eobtotal && !vp8dx_bool_error(xd->current_bc))
+ mode = xd->mode_info_context->mbmi.mode;
+
+ if (xd->segmentation_enabled)
+ mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+ if(pbi->ec_active)
{
- /* Special case: Force the loopfilter to skip when eobtotal and
- * mb_skip_coeff are zero.
- * */
- xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+ int throw_residual;
+ /* When we have independent partitions we can apply residual even
+ * though other partitions within the frame are corrupt.
+ */
+ throw_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual);
+ throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
- /*mt_skip_recon_mb(pbi, xd, mb_row, mb_col);*/
- if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
- {
- vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col);
- vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col);
- }
- else
+ if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
{
- vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.y_stride, xd->dst.uv_stride);
+ /* MB with corrupt residuals or corrupt mode/motion vectors.
+ * Better to use the predictor as reconstruction.
+ */
+ pbi->frame_corrupt_residual = 1;
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ vp8_conceal_corrupt_mb(xd);
+
+
+ corruption_detected = 1;
+
+ /* force idct to be skipped for B_PRED and use the
+ * prediction only for reconstruction
+ * */
+ vpx_memset(xd->eobs, 0, 25);
}
- return;
}
-
- if (xd->segmentation_enabled)
- mb_init_dequantizer(pbi, xd);
+#endif
/* do prediction */
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
- vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col);
-
- if (xd->mode_info_context->mbmi.mode != B_PRED)
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->recon_above[1],
+ xd->recon_above[2],
+ xd->recon_left[1],
+ xd->recon_left[2],
+ xd->recon_left_stride[1],
+ xd->dst.u_buffer, xd->dst.v_buffer);
+
+ if (mode != B_PRED)
{
- vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col);
- } else {
- vp8mt_intra_prediction_down_copy(pbi, xd, mb_row, mb_col);
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->recon_above[0],
+ xd->recon_left[0],
+ xd->recon_left_stride[0],
+ xd->dst.y_buffer);
+ }
+ else
+ {
+ short *DQC = xd->dequant_y1;
+ int dst_stride = xd->dst.y_stride;
+ unsigned char *base_dst = xd->dst.y_buffer;
+
+ /* clear out residual eob info */
+ if(xd->mode_info_context->mbmi.mb_skip_coeff)
+ vpx_memset(xd->eobs, 0, 25);
+
+ intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ int b_mode = xd->mode_info_context->bmi[i].as_mode;
+ unsigned char *yabove;
+ unsigned char *yleft;
+ int left_stride;
+ unsigned char top_left;
+
+ /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 above-right).*/
+ if (i < 4 && pbi->common.filter_level)
+ yabove = xd->recon_above[0] + b->offset; /* i*4 */
+ else
+ yabove = (base_dst - dst_stride) + b->offset;
+
+ if (i%4==0 && pbi->common.filter_level)
+ {
+ yleft = xd->recon_left[0] + i;
+ left_stride = 1;
+ }
+ else
+ {
+ yleft = (base_dst - 1) + b->offset;
+ left_stride = dst_stride;
+ }
+
+ if ((i==4 || i==8 || i==12) && pbi->common.filter_level)
+ top_left = *(xd->recon_left[0] + i - 1);
+ else
+ top_left = yabove[-1];
+
+ vp8_intra4x4_predict_d_c(yabove, yleft, left_stride,
+ b_mode,
+ base_dst + b->offset, dst_stride,
+ top_left);
+
+ if (xd->eobs[i] )
+ {
+ if (xd->eobs[i] > 1)
+ {
+ vp8_dequant_idct_add
+ (b->qcoeff, DQC,
+ base_dst + b->offset, dst_stride);
+ }
+ else
+ {
+ vp8_dc_only_idct_add
+ (b->qcoeff[0] * DQC[0],
+ base_dst + b->offset, dst_stride,
+ base_dst + b->offset, dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
}
}
else
@@ -142,109 +233,394 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
vp8_build_inter_predictors_mb(xd);
}
- /* When we have independent partitions we can apply residual even
- * though other partitions within the frame are corrupt.
- */
- throw_residual = (!pbi->independent_partitions &&
- pbi->frame_corrupt_residual);
- throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
#if CONFIG_ERROR_CONCEALMENT
- if (pbi->ec_active &&
- (mb_row * pbi->common.mb_cols + mb_col >= pbi->mvs_corrupt_from_mb ||
- throw_residual))
+ if (corruption_detected)
{
- /* MB with corrupt residuals or corrupt mode/motion vectors.
- * Better to use the predictor as reconstruction.
- */
- pbi->frame_corrupt_residual = 1;
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
- vp8_conceal_corrupt_mb(xd);
return;
}
#endif
- /* dequantization and idct */
- if (xd->mode_info_context->mbmi.mode == B_PRED)
+ if(!xd->mode_info_context->mbmi.mb_skip_coeff)
{
- short *DQC = xd->dequant_y1;
- int dst_stride = xd->dst.y_stride;
- unsigned char *base_dst = xd->dst.y_buffer;
-
- for (i = 0; i < 16; i++)
+ /* dequantization and idct */
+ if (mode != B_PRED)
{
- BLOCKD *b = &xd->block[i];
- int b_mode = xd->mode_info_context->bmi[i].as_mode;
-
- vp8mt_predict_intra4x4(pbi, xd, b_mode, base_dst + b->offset,
- dst_stride, mb_row, mb_col, i);
+ short *DQC = xd->dequant_y1;
- if (xd->eobs[i] )
+ if (mode != SPLITMV)
{
- if (xd->eobs[i] > 1)
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
{
- vp8_dequant_idct_add
- (b->qcoeff, DQC,
- base_dst + b->offset, dst_stride);
+ vp8_dequantize_b(b, xd->dequant_y2);
+
+ vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
}
else
{
- vp8_dc_only_idct_add
- (b->qcoeff[0] * DQC[0],
- base_dst + b->offset, dst_stride,
- base_dst + b->offset, dst_stride);
+ b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+ vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+ xd->qcoeff);
((int *)b->qcoeff)[0] = 0;
}
+
+ /* override the dc dequant constant in order to preserve the
+ * dc components
+ */
+ DQC = xd->dequant_y1_dc;
}
+
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
}
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
}
- else
- {
- short *DQC = xd->dequant_y1;
+}
- if (xd->mode_info_context->mbmi.mode != SPLITMV)
- {
- BLOCKD *b = &xd->block[24];
+typedef void (*init_current_bc_fn_t)(VP8D_COMP *pbi, MACROBLOCKD *xd,
+ int start_mb_row, int mb_row, int num_part);
- /* do 2nd order transform on the dc block */
- if (xd->eobs[24] > 1)
- {
- vp8_dequantize_b(b, xd->dequant_y2);
-
- vp8_short_inv_walsh4x4(&b->dqcoeff[0],
- xd->qcoeff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
- }
- else
- {
- b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
- vp8_short_inv_walsh4x4_1(&b->dqcoeff[0], xd->qcoeff);
- ((int *)b->qcoeff)[0] = 0;
- }
+static void init_current_bc(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row,
+ int mb_row, int num_part)
+{
+ (void) start_mb_row;
- /* override the dc dequant constant */
- DQC = xd->dequant_y1_dc;
- }
+ xd->current_bc = &pbi->mbc[mb_row%num_part];
+}
- vp8_dequant_idct_add_y_block
- (xd->qcoeff, DQC,
- xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs);
+static void init_current_bc_threads(VP8D_COMP *pbi, MACROBLOCKD *xd,
+ int start_mb_row, int mb_row, int num_part)
+{
+ (void) xd;
+ pbi->mb_row_di[start_mb_row - 1].mb_row = mb_row;
+ pbi->mb_row_di[start_mb_row - 1].mbd.current_bc = &pbi->mbc[mb_row%num_part];
+}
+
+
+static void decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row,
+ init_current_bc_fn_t init_current_bc_fn)
+{
+ volatile int *last_row_current_mb_col = NULL;
+ int mb_row;
+ VP8_COMMON *pc = &pbi->common;
+ int nsync = pbi->sync_range;
+ int num_part = 1 << pbi->common.multi_token_partition;
+
+ int dst_fb_idx = pc->new_fb_idx;
+ unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+ unsigned char *dst_buffer[3];
+ int i;
+ int ref_fb_index[MAX_REF_FRAMES];
+ int ref_fb_corrupted[MAX_REF_FRAMES];
+
+ ref_fb_corrupted[INTRA_FRAME] = 0;
+
+ ref_fb_index[LAST_FRAME] = pc->lst_fb_idx;
+ ref_fb_index[GOLDEN_FRAME] = pc->gld_fb_idx;
+ ref_fb_index[ALTREF_FRAME] = pc->alt_fb_idx;
+
+ for(i = 1; i < MAX_REF_FRAMES; i++)
+ {
+ ref_buffer[i][0] = pc->yv12_fb[ref_fb_index[i]].y_buffer;
+ ref_buffer[i][1] = pc->yv12_fb[ref_fb_index[i]].u_buffer;
+ ref_buffer[i][2] = pc->yv12_fb[ref_fb_index[i]].v_buffer;
+
+ ref_fb_corrupted[i] = pc->yv12_fb[ref_fb_index[i]].corrupted;
}
- vp8_dequant_idct_add_uv_block
- (xd->qcoeff+16*16, xd->dequant_uv,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.uv_stride, xd->eobs+16);
+ dst_buffer[0] = pc->yv12_fb[dst_fb_idx].y_buffer;
+ dst_buffer[1] = pc->yv12_fb[dst_fb_idx].u_buffer;
+ dst_buffer[2] = pc->yv12_fb[dst_fb_idx].v_buffer;
+
+ xd->up_available = (start_mb_row != 0);
+
+ for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
+ {
+ int i;
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int ref_fb_idx = pc->lst_fb_idx;
+ int dst_fb_idx = pc->new_fb_idx;
+ int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
+
+ int filter_level;
+ loop_filter_info_n *lfi_n = &pc->lf_info;
+
+ init_current_bc_fn(pbi, xd, start_mb_row, mb_row, num_part);
+
+ if (mb_row > 0)
+ last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
+
+ recon_yoffset = mb_row * recon_y_stride * 16;
+ recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+ /* reset contexts */
+ xd->above_context = pc->above_context;
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ xd->left_available = 0;
+
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ if (pbi->common.filter_level)
+ {
+ xd->recon_above[0] = pbi->mt_yabove_row[mb_row] + 0*16 +32;
+ xd->recon_above[1] = pbi->mt_uabove_row[mb_row] + 0*8 +16;
+ xd->recon_above[2] = pbi->mt_vabove_row[mb_row] + 0*8 +16;
+
+ xd->recon_left[0] = pbi->mt_yleft_col[mb_row];
+ xd->recon_left[1] = pbi->mt_uleft_col[mb_row];
+ xd->recon_left[2] = pbi->mt_vleft_col[mb_row];
+
+ //TODO: move to outside row loop
+ xd->recon_left_stride[0] = 1;
+ xd->recon_left_stride[1] = 1;
+ }
+ else
+ {
+ xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+ xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+ xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+ xd->recon_left[0] = xd->recon_above[0] - 1;
+ xd->recon_left[1] = xd->recon_above[1] - 1;
+ xd->recon_left[2] = xd->recon_above[2] - 1;
+
+ xd->recon_above[0] -= xd->dst.y_stride;
+ xd->recon_above[1] -= xd->dst.uv_stride;
+ xd->recon_above[2] -= xd->dst.uv_stride;
+
+ //TODO: move to outside row loop
+ xd->recon_left_stride[0] = xd->dst.y_stride;
+ xd->recon_left_stride[1] = xd->dst.uv_stride;
+ }
+
+ for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+ {
+ if ( mb_row > 0 && (mb_col & (nsync-1)) == 0)
+ {
+ while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != pc->mb_cols - 1)
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+ }
+
+ /* Distance of MB to the various image edges.
+ * These are specified to 8th pel as they are always
+ * compared to values that are in 1/8th pel units.
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+ #if CONFIG_ERROR_CONCEALMENT
+ {
+ int corrupt_residual =
+ (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual) ||
+ vp8dx_bool_error(xd->current_bc);
+ if (pbi->ec_active &&
+ (xd->mode_info_context->mbmi.ref_frame ==
+ INTRA_FRAME) &&
+ corrupt_residual)
+ {
+ /* We have an intra block with corrupt
+ * coefficients, better to conceal with an inter
+ * block.
+ * Interpolate MVs from neighboring MBs
+ *
+ * Note that for the first mb with corrupt
+ * residual in a frame, we might not discover
+ * that before decoding the residual. That
+ * happens after this check, and therefore no
+ * inter concealment will be done.
+ */
+ vp8_interpolate_motion(xd,
+ mb_row, mb_col,
+ pc->mb_rows, pc->mb_cols,
+ pc->mode_info_stride);
+ }
+ }
+ #endif
+
+
+ xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+ xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+ xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+ xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
+ xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
+ xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
+
+ /* propagate errors from reference frames */
+ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+ decode_macroblock(pbi, xd, 0);
+
+ xd->left_available = 1;
+
+ /* check if the boolean decoder has suffered an error */
+ xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+ xd->recon_above[0] += 16;
+ xd->recon_above[1] += 8;
+ xd->recon_above[2] += 8;
+
+ if (!pbi->common.filter_level)
+ {
+ xd->recon_left[0] += 16;
+ xd->recon_left[1] += 8;
+ xd->recon_left[2] += 8;
+ }
+
+ if (pbi->common.filter_level)
+ {
+ int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV &&
+ xd->mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
+ const int seg = xd->mode_info_context->mbmi.segment_id;
+ const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if( mb_row != pc->mb_rows-1 )
+ {
+ /* Save decoded MB last row data for next-row decoding */
+ vpx_memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+ vpx_memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+ vpx_memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+ }
+
+ /* save left_col for next MB decoding */
+ if(mb_col != pc->mb_cols-1)
+ {
+ MODE_INFO *next = xd->mode_info_context +1;
+
+ if (next->mbmi.ref_frame == INTRA_FRAME)
+ {
+ for (i = 0; i < 16; i++)
+ pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
+ for (i = 0; i < 8; i++)
+ {
+ pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
+ pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
+ }
+ }
+ }
+
+ /* loopfilter on this macroblock. */
+ if (filter_level)
+ {
+ if(pc->filter_type == NORMAL_LOOPFILTER)
+ {
+ loop_filter_info lfi;
+ FRAME_TYPE frame_type = pc->frame_type;
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ }
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ ++xd->mode_info_context; /* next mb */
+
+ xd->above_context++;
+
+ /*pbi->mb_row_di[ithread].current_mb_col = mb_col;*/
+ pbi->mt_current_mb_col[mb_row] = mb_col;
+ }
+
+ /* adjust to the next row of mbs */
+ if (pbi->common.filter_level)
+ {
+ if(mb_row != pc->mb_rows-1)
+ {
+ int lasty = pc->yv12_fb[ref_fb_idx].y_width + VP8BORDERINPIXELS;
+ int lastuv = (pc->yv12_fb[ref_fb_idx].y_width>>1) + (VP8BORDERINPIXELS>>1);
+
+ for (i = 0; i < 4; i++)
+ {
+ pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
+ pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
+ pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
+ }
+ }
+ } else
+ vp8_extend_mb_row(&pc->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+ ++xd->mode_info_context; /* skip prediction column */
+ xd->up_available = 1;
+
+ /* skip the rows handled by the other decoding threads */
+ xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+ }
}
+
static THREAD_FUNCTION thread_decoding_proc(void *p_data)
{
int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
@@ -264,251 +640,18 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data)
break;
else
{
- VP8_COMMON *pc = &pbi->common;
MACROBLOCKD *xd = &mbrd->mbd;
- int mb_row;
- int num_part = 1 << pbi->common.multi_token_partition;
- volatile int *last_row_current_mb_col;
- int nsync = pbi->sync_range;
-
- for (mb_row = ithread+1; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
- {
- int i;
- int recon_yoffset, recon_uvoffset;
- int mb_col;
- int ref_fb_idx = pc->lst_fb_idx;
- int dst_fb_idx = pc->new_fb_idx;
- int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
- int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
-
- int filter_level;
- loop_filter_info_n *lfi_n = &pc->lf_info;
-
- pbi->mb_row_di[ithread].mb_row = mb_row;
- pbi->mb_row_di[ithread].mbd.current_bc = &pbi->mbc[mb_row%num_part];
-
- last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
-
- recon_yoffset = mb_row * recon_y_stride * 16;
- recon_uvoffset = mb_row * recon_uv_stride * 8;
- /* reset above block coeffs */
-
- xd->above_context = pc->above_context;
- xd->left_context = &mb_row_left_context;
- vpx_memset(&mb_row_left_context, 0, sizeof(mb_row_left_context));
- xd->up_available = (mb_row != 0);
-
- xd->mb_to_top_edge = -((mb_row * 16)) << 3;
- xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
- for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
- {
- if ((mb_col & (nsync-1)) == 0)
- {
- while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != pc->mb_cols - 1)
- {
- x86_pause_hint();
- thread_sleep(0);
- }
- }
-
- /* Distance of MB to the various image edges.
- * These are specified to 8th pel as they are always
- * compared to values that are in 1/8th pel units.
- */
- xd->mb_to_left_edge = -((mb_col * 16) << 3);
- xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-
-#if CONFIG_ERROR_CONCEALMENT
- {
- int corrupt_residual =
- (!pbi->independent_partitions &&
- pbi->frame_corrupt_residual) ||
- vp8dx_bool_error(xd->current_bc);
- if (pbi->ec_active &&
- (xd->mode_info_context->mbmi.ref_frame ==
- INTRA_FRAME) &&
- corrupt_residual)
- {
- /* We have an intra block with corrupt
- * coefficients, better to conceal with an inter
- * block.
- * Interpolate MVs from neighboring MBs
- *
- * Note that for the first mb with corrupt
- * residual in a frame, we might not discover
- * that before decoding the residual. That
- * happens after this check, and therefore no
- * inter concealment will be done.
- */
- vp8_interpolate_motion(xd,
- mb_row, mb_col,
- pc->mb_rows, pc->mb_cols,
- pc->mode_info_stride);
- }
- }
-#endif
-
-
- xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
- xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
- xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
- xd->left_available = (mb_col != 0);
-
- /* Select the appropriate reference frame for this MB */
- if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
- ref_fb_idx = pc->lst_fb_idx;
- else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
- ref_fb_idx = pc->gld_fb_idx;
- else
- ref_fb_idx = pc->alt_fb_idx;
-
- xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
- xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
- xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
- if (xd->mode_info_context->mbmi.ref_frame !=
- INTRA_FRAME)
- {
- /* propagate errors from reference frames */
- xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
- }
-
- decode_macroblock(pbi, xd, mb_row, mb_col);
-
- /* check if the boolean decoder has suffered an error */
- xd->corrupted |= vp8dx_bool_error(xd->current_bc);
-
- if (pbi->common.filter_level)
- {
- int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
- xd->mode_info_context->mbmi.mode != SPLITMV &&
- xd->mode_info_context->mbmi.mb_skip_coeff);
-
- const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
- const int seg = xd->mode_info_context->mbmi.segment_id;
- const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
-
- filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
- if( mb_row != pc->mb_rows-1 )
- {
- /* Save decoded MB last row data for next-row decoding */
- vpx_memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
- vpx_memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
- vpx_memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
- }
-
- /* save left_col for next MB decoding */
- if(mb_col != pc->mb_cols-1)
- {
- MODE_INFO *next = xd->mode_info_context +1;
-
- if (next->mbmi.ref_frame == INTRA_FRAME)
- {
- for (i = 0; i < 16; i++)
- pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
- for (i = 0; i < 8; i++)
- {
- pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
- pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
- }
- }
- }
-
- /* loopfilter on this macroblock. */
- if (filter_level)
- {
- if(pc->filter_type == NORMAL_LOOPFILTER)
- {
- loop_filter_info lfi;
- FRAME_TYPE frame_type = pc->frame_type;
- const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
- lfi.mblim = lfi_n->mblim[filter_level];
- lfi.blim = lfi_n->blim[filter_level];
- lfi.lim = lfi_n->lim[filter_level];
- lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
- if (mb_col > 0)
- vp8_loop_filter_mbv
- (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
-
- if (!skip_lf)
- vp8_loop_filter_bv
- (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
-
- /* don't apply across umv border */
- if (mb_row > 0)
- vp8_loop_filter_mbh
- (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
-
- if (!skip_lf)
- vp8_loop_filter_bh
- (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
- }
- else
- {
- if (mb_col > 0)
- vp8_loop_filter_simple_mbv
- (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp8_loop_filter_simple_bv
- (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
-
- /* don't apply across umv border */
- if (mb_row > 0)
- vp8_loop_filter_simple_mbh
- (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp8_loop_filter_simple_bh
- (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
- }
- }
-
- }
-
- recon_yoffset += 16;
- recon_uvoffset += 8;
-
- ++xd->mode_info_context; /* next mb */
-
- xd->above_context++;
-
- /*pbi->mb_row_di[ithread].current_mb_col = mb_col;*/
- pbi->mt_current_mb_col[mb_row] = mb_col;
- }
+ xd->left_context = &mb_row_left_context;
- /* adjust to the next row of mbs */
- if (pbi->common.filter_level)
- {
- if(mb_row != pc->mb_rows-1)
- {
- int lasty = pc->yv12_fb[ref_fb_idx].y_width + VP8BORDERINPIXELS;
- int lastuv = (pc->yv12_fb[ref_fb_idx].y_width>>1) + (VP8BORDERINPIXELS>>1);
-
- for (i = 0; i < 4; i++)
- {
- pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
- pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
- pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
- }
- }
- } else
- vp8_extend_mb_row(&pc->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-
- ++xd->mode_info_context; /* skip prediction column */
-
- /* since we have multithread */
- xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
- }
+ decode_mb_rows(pbi, xd, ithread+1, init_current_bc_threads);
}
}
+
/* add this to each frame */
- if ((mbrd->mb_row == pbi->common.mb_rows-1) || ((mbrd->mb_row == pbi->common.mb_rows-2) && (pbi->common.mb_rows % (pbi->decoding_thread_count+1))==1))
+ if ((mbrd->mb_row == pbi->common.mb_rows-1) ||
+ ((mbrd->mb_row == pbi->common.mb_rows-2) &&
+ (pbi->common.mb_rows % (pbi->decoding_thread_count+1))==1))
{
/*SetEvent(pbi->h_event_end_decoding);*/
sem_post(&pbi->h_event_end_decoding);
@@ -735,16 +878,10 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
{
- int mb_row;
VP8_COMMON *pc = &pbi->common;
-
- int num_part = 1 << pbi->common.multi_token_partition;
int i;
- volatile int *last_row_current_mb_col = NULL;
- int nsync = pbi->sync_range;
int filter_level = pc->filter_level;
- loop_filter_info_n *lfi_n = &pc->lf_info;
if (filter_level)
{
@@ -777,229 +914,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
for (i = 0; i < pbi->decoding_thread_count; i++)
sem_post(&pbi->h_event_start_decoding[i]);
- for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
- {
- xd->current_bc = &pbi->mbc[mb_row%num_part];
-
- /* vp8_decode_mb_row(pbi, pc, mb_row, xd); */
- {
- int i;
- int recon_yoffset, recon_uvoffset;
- int mb_col;
- int ref_fb_idx = pc->lst_fb_idx;
- int dst_fb_idx = pc->new_fb_idx;
- int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
- int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
-
- /* volatile int *last_row_current_mb_col = NULL; */
- if (mb_row > 0)
- last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
-
- vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
- recon_yoffset = mb_row * recon_y_stride * 16;
- recon_uvoffset = mb_row * recon_uv_stride * 8;
- /* reset above block coeffs */
-
- xd->above_context = pc->above_context;
- xd->up_available = (mb_row != 0);
-
- xd->mb_to_top_edge = -((mb_row * 16)) << 3;
- xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
- for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
- {
- if ( mb_row > 0 && (mb_col & (nsync-1)) == 0){
- while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != pc->mb_cols - 1)
- {
- x86_pause_hint();
- thread_sleep(0);
- }
- }
-
- /* Distance of MB to the various image edges.
- * These are specified to 8th pel as they are always compared to
- * values that are in 1/8th pel units.
- */
- xd->mb_to_left_edge = -((mb_col * 16) << 3);
- xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-
-#if CONFIG_ERROR_CONCEALMENT
- {
- int corrupt_residual = (!pbi->independent_partitions &&
- pbi->frame_corrupt_residual) ||
- vp8dx_bool_error(xd->current_bc);
- if (pbi->ec_active &&
- (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
- corrupt_residual)
- {
- /* We have an intra block with corrupt coefficients,
- * better to conceal with an inter block. Interpolate
- * MVs from neighboring MBs
- *
- * Note that for the first mb with corrupt residual in a
- * frame, we might not discover that before decoding the
- * residual. That happens after this check, and
- * therefore no inter concealment will be done.
- */
- vp8_interpolate_motion(xd,
- mb_row, mb_col,
- pc->mb_rows, pc->mb_cols,
- pc->mode_info_stride);
- }
- }
-#endif
-
-
- xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
- xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
- xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
- xd->left_available = (mb_col != 0);
-
- /* Select the appropriate reference frame for this MB */
- if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
- ref_fb_idx = pc->lst_fb_idx;
- else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
- ref_fb_idx = pc->gld_fb_idx;
- else
- ref_fb_idx = pc->alt_fb_idx;
-
- xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
- xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
- xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
- if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
- {
- /* propagate errors from reference frames */
- xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
- }
-
- decode_macroblock(pbi, xd, mb_row, mb_col);
-
- /* check if the boolean decoder has suffered an error */
- xd->corrupted |= vp8dx_bool_error(xd->current_bc);
-
- if (pbi->common.filter_level)
- {
- int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
- xd->mode_info_context->mbmi.mode != SPLITMV &&
- xd->mode_info_context->mbmi.mb_skip_coeff);
-
- const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
- const int seg = xd->mode_info_context->mbmi.segment_id;
- const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
-
- filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
- /* Save decoded MB last row data for next-row decoding */
- if(mb_row != pc->mb_rows-1)
- {
- vpx_memcpy((pbi->mt_yabove_row[mb_row +1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
- vpx_memcpy((pbi->mt_uabove_row[mb_row +1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
- vpx_memcpy((pbi->mt_vabove_row[mb_row +1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
- }
-
- /* save left_col for next MB decoding */
- if(mb_col != pc->mb_cols-1)
- {
- MODE_INFO *next = xd->mode_info_context +1;
-
- if (next->mbmi.ref_frame == INTRA_FRAME)
- {
- for (i = 0; i < 16; i++)
- pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
- for (i = 0; i < 8; i++)
- {
- pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
- pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
- }
- }
- }
-
- /* loopfilter on this macroblock. */
- if (filter_level)
- {
- if(pc->filter_type == NORMAL_LOOPFILTER)
- {
- loop_filter_info lfi;
- FRAME_TYPE frame_type = pc->frame_type;
- const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
- lfi.mblim = lfi_n->mblim[filter_level];
- lfi.blim = lfi_n->blim[filter_level];
- lfi.lim = lfi_n->lim[filter_level];
- lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
- if (mb_col > 0)
- vp8_loop_filter_mbv
- (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
-
- if (!skip_lf)
- vp8_loop_filter_bv
- (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
-
- /* don't apply across umv border */
- if (mb_row > 0)
- vp8_loop_filter_mbh
- (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
-
- if (!skip_lf)
- vp8_loop_filter_bh
- (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
- }
- else
- {
- if (mb_col > 0)
- vp8_loop_filter_simple_mbv
- (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp8_loop_filter_simple_bv
- (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
-
- /* don't apply across umv border */
- if (mb_row > 0)
- vp8_loop_filter_simple_mbh
- (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp8_loop_filter_simple_bh
- (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
- }
- }
-
- }
- recon_yoffset += 16;
- recon_uvoffset += 8;
-
- ++xd->mode_info_context; /* next mb */
-
- xd->above_context++;
-
- pbi->mt_current_mb_col[mb_row] = mb_col;
- }
-
- /* adjust to the next row of mbs */
- if (pbi->common.filter_level)
- {
- if(mb_row != pc->mb_rows-1)
- {
- int lasty = pc->yv12_fb[ref_fb_idx].y_width + VP8BORDERINPIXELS;
- int lastuv = (pc->yv12_fb[ref_fb_idx].y_width>>1) + (VP8BORDERINPIXELS>>1);
-
- for (i = 0; i < 4; i++)
- {
- pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
- pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
- pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
- }
- }
- }else
- vp8_extend_mb_row(&pc->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-
- ++xd->mode_info_context; /* skip prediction column */
- }
- xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
- }
+ decode_mb_rows(pbi, xd, 0, init_current_bc);
sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
}
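
The refactor above collapses the macroblock-row decode loop that was previously duplicated between the main thread and the worker threads into a single decode_mb_rows(), parameterized by a callback that selects the boolean decoder for each row. A minimal sketch of that pattern, with illustrative names only (this is not libvpx code):

typedef void (*select_bc_fn)(void *ctx, int start_row, int row, int num_part);

static void decode_rows(void *ctx, int start_row, int step, int num_rows,
                        int num_part, select_bc_fn select_bc)
{
    int row;
    for (row = start_row; row < num_rows; row += step)
    {
        /* pick the token-partition decoder for this row, e.g. &mbc[row % num_part] */
        select_bc(ctx, start_row, row, num_part);
        /* ... decode every macroblock in this row ... */
    }
}

/* In the patch, the main thread passes init_current_bc with start_mb_row 0,
 * and worker thread i passes init_current_bc_threads with start_mb_row i + 1. */
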
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 0bb51730e..af00f7c8c 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -24,6 +24,7 @@
#include "bitstream.h"
#include "defaultcoefcounts.h"
+#include "vp8/common/common.h"
const int vp8cx_base_skip_false_prob[128] =
{
@@ -159,7 +160,7 @@ static void write_split(vp8_writer *bc, int x)
);
}
-static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
{
const TOKENEXTRA *const stop = p + xcount;
unsigned int split;
@@ -382,219 +383,23 @@ static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
int i;
unsigned char *ptr = cx_data;
unsigned char *ptr_end = cx_data_end;
- unsigned int shift;
- vp8_writer *w;
- ptr = cx_data;
+ vp8_writer * w;
for (i = 0; i < num_part; i++)
{
- w = cpi->bc + i + 1;
- vp8_start_encode(w, ptr, ptr_end);
- {
- unsigned int split;
- int count = w->count;
- unsigned int range = w->range;
- unsigned int lowvalue = w->lowvalue;
- int mb_row;
-
- for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part)
- {
- TOKENEXTRA *p = cpi->tplist[mb_row].start;
- TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
-
- while (p < stop)
- {
- const int t = p->Token;
- vp8_token *const a = vp8_coef_encodings + t;
- const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
- int i = 0;
- const unsigned char *pp = p->context_tree;
- int v = a->value;
- int n = a->Len;
-
- if (p->skip_eob_node)
- {
- n--;
- i = 2;
- }
-
- do
- {
- const int bb = (v >> --n) & 1;
- split = 1 + (((range - 1) * pp[i>>1]) >> 8);
- i = vp8_coef_tree[i+bb];
-
- if (bb)
- {
- lowvalue += split;
- range = range - split;
- }
- else
- {
- range = split;
- }
-
- shift = vp8_norm[range];
- range <<= shift;
- count += shift;
-
- if (count >= 0)
- {
- int offset = shift - count;
-
- if ((lowvalue << (offset - 1)) & 0x80000000)
- {
- int x = w->pos - 1;
-
- while (x >= 0 && w->buffer[x] == 0xff)
- {
- w->buffer[x] = (unsigned char)0;
- x--;
- }
-
- w->buffer[x] += 1;
- }
-
- validate_buffer(w->buffer + w->pos,
- 1,
- cx_data_end,
- &cpi->common.error);
-
- w->buffer[w->pos++] = (lowvalue >> (24 - offset));
-
- lowvalue <<= offset;
- shift = count;
- lowvalue &= 0xffffff;
- count -= 8 ;
- }
-
- lowvalue <<= shift;
- }
- while (n);
+ int mb_row;
+ w = cpi->bc + i + 1;
- if (b->base_val)
- {
- const int e = p->Extra, L = b->Len;
-
- if (L)
- {
- const unsigned char *pp = b->prob;
- int v = e >> 1;
- int n = L; /* number of bits in v, assumed nonzero */
- int i = 0;
-
- do
- {
- const int bb = (v >> --n) & 1;
- split = 1 + (((range - 1) * pp[i>>1]) >> 8);
- i = b->tree[i+bb];
-
- if (bb)
- {
- lowvalue += split;
- range = range - split;
- }
- else
- {
- range = split;
- }
-
- shift = vp8_norm[range];
- range <<= shift;
- count += shift;
-
- if (count >= 0)
- {
- int offset = shift - count;
-
- if ((lowvalue << (offset - 1)) & 0x80000000)
- {
- int x = w->pos - 1;
-
- while (x >= 0 && w->buffer[x] == 0xff)
- {
- w->buffer[x] = (unsigned char)0;
- x--;
- }
-
- w->buffer[x] += 1;
- }
-
- validate_buffer(w->buffer + w->pos,
- 1,
- cx_data_end,
- &cpi->common.error);
-
- w->buffer[w->pos++] =
- (lowvalue >> (24 - offset));
-
- lowvalue <<= offset;
- shift = count;
- lowvalue &= 0xffffff;
- count -= 8 ;
- }
-
- lowvalue <<= shift;
- }
- while (n);
- }
-
- {
- split = (range + 1) >> 1;
-
- if (e & 1)
- {
- lowvalue += split;
- range = range - split;
- }
- else
- {
- range = split;
- }
-
- range <<= 1;
-
- if ((lowvalue & 0x80000000))
- {
- int x = w->pos - 1;
-
- while (x >= 0 && w->buffer[x] == 0xff)
- {
- w->buffer[x] = (unsigned char)0;
- x--;
- }
-
- w->buffer[x] += 1;
-
- }
-
- lowvalue <<= 1;
-
- if (!++count)
- {
- count = -8;
- validate_buffer(w->buffer + w->pos,
- 1,
- cx_data_end,
- &cpi->common.error);
-
- w->buffer[w->pos++] = (lowvalue >> 24);
-
- lowvalue &= 0xffffff;
- }
- }
-
- }
-
- ++p;
- }
- }
+ vp8_start_encode(w, ptr, ptr_end);
- w->count = count;
- w->lowvalue = lowvalue;
- w->range = range;
+ for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part)
+ {
+ const TOKENEXTRA *p = cpi->tplist[mb_row].start;
+ const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+ int tokens = stop - p;
+ vp8_pack_tokens_c(w, p, tokens);
}
vp8_stop_encode(w);
@@ -605,209 +410,17 @@ static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
{
-
- unsigned int split;
- int count = w->count;
- unsigned int range = w->range;
- unsigned int lowvalue = w->lowvalue;
- unsigned int shift;
int mb_row;
for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++)
{
- TOKENEXTRA *p = cpi->tplist[mb_row].start;
- TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
-
- while (p < stop)
- {
- const int t = p->Token;
- vp8_token *const a = vp8_coef_encodings + t;
- const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
- int i = 0;
- const unsigned char *pp = p->context_tree;
- int v = a->value;
- int n = a->Len;
-
- if (p->skip_eob_node)
- {
- n--;
- i = 2;
- }
-
- do
- {
- const int bb = (v >> --n) & 1;
- split = 1 + (((range - 1) * pp[i>>1]) >> 8);
- i = vp8_coef_tree[i+bb];
-
- if (bb)
- {
- lowvalue += split;
- range = range - split;
- }
- else
- {
- range = split;
- }
-
- shift = vp8_norm[range];
- range <<= shift;
- count += shift;
-
- if (count >= 0)
- {
- int offset = shift - count;
-
- if ((lowvalue << (offset - 1)) & 0x80000000)
- {
- int x = w->pos - 1;
-
- while (x >= 0 && w->buffer[x] == 0xff)
- {
- w->buffer[x] = (unsigned char)0;
- x--;
- }
-
- w->buffer[x] += 1;
- }
-
- validate_buffer(w->buffer + w->pos,
- 1,
- w->buffer_end,
- w->error);
-
- w->buffer[w->pos++] = (lowvalue >> (24 - offset));
- lowvalue <<= offset;
- shift = count;
- lowvalue &= 0xffffff;
- count -= 8 ;
- }
-
- lowvalue <<= shift;
- }
- while (n);
-
-
- if (b->base_val)
- {
- const int e = p->Extra, L = b->Len;
-
- if (L)
- {
- const unsigned char *pp = b->prob;
- int v = e >> 1;
- int n = L; /* number of bits in v, assumed nonzero */
- int i = 0;
-
- do
- {
- const int bb = (v >> --n) & 1;
- split = 1 + (((range - 1) * pp[i>>1]) >> 8);
- i = b->tree[i+bb];
-
- if (bb)
- {
- lowvalue += split;
- range = range - split;
- }
- else
- {
- range = split;
- }
-
- shift = vp8_norm[range];
- range <<= shift;
- count += shift;
-
- if (count >= 0)
- {
- int offset = shift - count;
-
- if ((lowvalue << (offset - 1)) & 0x80000000)
- {
- int x = w->pos - 1;
-
- while (x >= 0 && w->buffer[x] == 0xff)
- {
- w->buffer[x] = (unsigned char)0;
- x--;
- }
-
- w->buffer[x] += 1;
- }
-
- validate_buffer(w->buffer + w->pos,
- 1,
- w->buffer_end,
- w->error);
-
- w->buffer[w->pos++] = (lowvalue >> (24 - offset));
- lowvalue <<= offset;
- shift = count;
- lowvalue &= 0xffffff;
- count -= 8 ;
- }
-
- lowvalue <<= shift;
- }
- while (n);
- }
-
- {
- split = (range + 1) >> 1;
-
- if (e & 1)
- {
- lowvalue += split;
- range = range - split;
- }
- else
- {
- range = split;
- }
-
- range <<= 1;
-
- if ((lowvalue & 0x80000000))
- {
- int x = w->pos - 1;
-
- while (x >= 0 && w->buffer[x] == 0xff)
- {
- w->buffer[x] = (unsigned char)0;
- x--;
- }
-
- w->buffer[x] += 1;
-
- }
-
- lowvalue <<= 1;
-
- if (!++count)
- {
- count = -8;
-
- validate_buffer(w->buffer + w->pos,
- 1,
- w->buffer_end,
- w->error);
+ const TOKENEXTRA *p = cpi->tplist[mb_row].start;
+ const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+ int tokens = stop - p;
- w->buffer[w->pos++] = (lowvalue >> 24);
- lowvalue &= 0xffffff;
- }
- }
-
- }
-
- ++p;
- }
+ vp8_pack_tokens_c(w, p, tokens);
}
- w->count = count;
- w->lowvalue = lowvalue;
- w->range = range;
-
}
static void write_mv_ref
@@ -925,7 +538,9 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
if (pc->mb_no_coeff_skip)
{
- prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count);
+ int total_mbs = pc->mb_rows * pc->mb_cols;
+
+ prob_skip_false = (total_mbs - cpi->skip_true_count ) * 256 / total_mbs;
if (prob_skip_false <= 1)
prob_skip_false = 1;
@@ -1112,7 +727,9 @@ static void write_kfmodes(VP8_COMP *cpi)
if (c->mb_no_coeff_skip)
{
- prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count);
+ int total_mbs = c->mb_rows * c->mb_cols;
+
+ prob_skip_false = (total_mbs - cpi->skip_true_count ) * 256 / total_mbs;
if (prob_skip_false <= 1)
prob_skip_false = 1;
@@ -1167,6 +784,7 @@ static void write_kfmodes(VP8_COMP *cpi)
}
}
+#if 0
/* This function is used for debugging probability trees. */
static void print_prob_tree(vp8_prob
coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES])
@@ -1198,6 +816,7 @@ static void print_prob_tree(vp8_prob
fprintf(f, "}\n");
fclose(f);
}
+#endif
static void sum_probs_over_prev_coef_context(
const unsigned int probs[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
@@ -1327,7 +946,6 @@ static int default_coef_context_savings(VP8_COMP *cpi)
int t = 0; /* token/prob index */
-
vp8_tree_probs_from_distribution(
MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
cpi->frame_coef_probs [i][j][k],
@@ -1432,10 +1050,33 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
return savings;
}
-static void update_coef_probs(VP8_COMP *cpi)
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+int vp8_update_coef_context(VP8_COMP *cpi)
+{
+ int savings = 0;
+
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ /* Reset to default counts/probabilities at key frames */
+ vp8_copy(cpi->coef_counts, default_coef_counts);
+ }
+
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ savings += independent_coef_context_savings(cpi);
+ else
+ savings += default_coef_context_savings(cpi);
+
+ return savings;
+}
+#endif
+
+void vp8_update_coef_probs(VP8_COMP *cpi)
{
int i = 0;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
vp8_writer *const w = cpi->bc;
+#endif
int savings = 0;
vp8_clear_system_state(); //__asm emms;
@@ -1515,7 +1156,11 @@ static void update_coef_probs(VP8_COMP *cpi)
cpi->common.frame_type == KEY_FRAME && newp != *Pold)
u = 1;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ cpi->update_probs[i][j][k][t] = u;
+#else
vp8_write(w, u, upd);
+#endif
#ifdef ENTROPY_STATS
@@ -1527,7 +1172,9 @@ static void update_coef_probs(VP8_COMP *cpi)
/* send/use new probability */
*Pold = newp;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
vp8_write_literal(w, newp, 8);
+#endif
savings += s;
@@ -1556,6 +1203,50 @@ static void update_coef_probs(VP8_COMP *cpi)
while (++i < BLOCK_TYPES);
}
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+static void pack_coef_probs(VP8_COMP *cpi)
+{
+ int i = 0;
+ vp8_writer *const w = cpi->bc;
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ int k = 0;
+
+ do
+ {
+ int t = 0; /* token/prob index */
+
+ do
+ {
+ const vp8_prob newp = cpi->common.fc.coef_probs [i][j][k][t];
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+ const char u = cpi->update_probs[i][j][k][t] ;
+
+ vp8_write(w, u, upd);
+
+ if (u)
+ {
+ /* send/use new probability */
+ vp8_write_literal(w, newp, 8);
+ }
+ }
+ while (++t < ENTROPY_NODES);
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+}
+#endif
+
#ifdef PACKET_TESTING
FILE *vpxlogc = 0;
#endif
@@ -1818,6 +1509,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
}
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
{
if (pc->frame_type == KEY_FRAME)
@@ -1825,6 +1517,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
else
pc->refresh_entropy_probs = 0;
}
+#endif
vp8_write_bit(bc, pc->refresh_entropy_probs);
@@ -1842,13 +1535,17 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
vp8_clear_system_state(); //__asm emms;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ pack_coef_probs(cpi);
+#else
if (pc->refresh_entropy_probs == 0)
{
// save a copy for later refresh
vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
}
- update_coef_probs(cpi);
+ vp8_update_coef_probs(cpi);
+#endif
#ifdef ENTROPY_STATS
active_section = 2;
@@ -1896,6 +1593,45 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
cpi->partition_sz[0] = *size;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ const int num_part = (1 << pc->multi_token_partition);
+ unsigned char * dp = cpi->partition_d[0] + cpi->partition_sz[0];
+
+ if (num_part > 1)
+ {
+ /* write token part sizes (all but last) if more than 1 */
+ validate_buffer(dp, 3 * (num_part - 1), cpi->partition_d_end[0],
+ &pc->error);
+
+ cpi->partition_sz[0] += 3*(num_part-1);
+
+ for(i = 1; i < num_part; i++)
+ {
+ write_partition_size(dp, cpi->partition_sz[i]);
+ dp += 3;
+ }
+ }
+
+ if (!cpi->output_partition)
+ {
+ /* concatenate partition buffers */
+ for(i = 0; i < num_part; i++)
+ {
+ vpx_memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
+ cpi->partition_d[i+1] = dp;
+ dp += cpi->partition_sz[i+1];
+ }
+ }
+
+ /* update total size */
+ *size = 0;
+ for(i = 0; i < num_part+1; i++)
+ {
+ *size += cpi->partition_sz[i];
+ }
+ }
+#else
if (pc->multi_token_partition != ONE_PARTITION)
{
int num_part = 1 << pc->multi_token_partition;
@@ -1945,6 +1681,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
*size += cpi->bc[1].pos;
cpi->partition_sz[1] = cpi->bc[1].pos;
}
+#endif
}
#ifdef ENTROPY_STATS
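
In the skip-probability change above, prob_skip_false is now derived from the macroblock total rather than from a separately tracked skip_false_count (which the encodeframe.c change later in this patch stops maintaining). A worked example with illustrative numbers, not taken from the source:

/* hypothetical 320x240 frame: total_mbs = 20 * 15 = 300
 * skip_true_count = 90  ->  prob_skip_false = (300 - 90) * 256 / 300 = 179
 * (the code that follows then floors the result at 1) */
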
diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h
index 736b9d494..e0bbdc4f3 100644
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h
@@ -35,7 +35,10 @@ void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
# define pack_mb_row_tokens(a,b) \
vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
#else
-# define pack_tokens(a,b,c) pack_tokens_c(a,b,c)
+
+void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount);
+
+# define pack_tokens(a,b,c) vp8_pack_tokens_c(a,b,c)
# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d)
# define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b)
#endif
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
new file mode 100644
index 000000000..6773a659d
--- /dev/null
+++ b/vp8/encoder/denoising.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "denoising.h"
+
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+
+const unsigned int NOISE_MOTION_THRESHOLD = 20*20;
+const unsigned int NOISE_DIFF2_THRESHOLD = 75;
+// SSE_DIFF_THRESHOLD is selected as ~95% confidence assuming var(noise) ~= 100.
+const unsigned int SSE_DIFF_THRESHOLD = 16*16*20;
+const unsigned int SSE_THRESHOLD = 16*16*40;
+
+static __inline uint8_t blend(uint8_t state, uint8_t sample, uint8_t factor_q8)
+{
+ return (uint8_t)(
+ (((uint16_t)factor_q8 * ((uint16_t)state) + // Q8
+ (uint16_t)(256 - factor_q8) * ((uint16_t)sample)) + 128) // Q8
+ >> 8);
+}
+
+static unsigned int denoiser_motion_compensate(YV12_BUFFER_CONFIG* src,
+ YV12_BUFFER_CONFIG* dst,
+ MACROBLOCK* x,
+ unsigned int best_sse,
+ unsigned int zero_mv_sse,
+ int recon_yoffset,
+ int recon_uvoffset)
+{
+ MACROBLOCKD filter_xd = x->e_mbd;
+ int mv_col;
+ int mv_row;
+ int sse_diff = zero_mv_sse - best_sse;
+ // Compensate the running average.
+ filter_xd.pre.y_buffer = src->y_buffer + recon_yoffset;
+ filter_xd.pre.u_buffer = src->u_buffer + recon_uvoffset;
+ filter_xd.pre.v_buffer = src->v_buffer + recon_uvoffset;
+ // Write the compensated running average to the destination buffer.
+ filter_xd.dst.y_buffer = dst->y_buffer + recon_yoffset;
+ filter_xd.dst.u_buffer = dst->u_buffer + recon_uvoffset;
+ filter_xd.dst.v_buffer = dst->v_buffer + recon_uvoffset;
+ // Use the best MV for the compensation.
+ filter_xd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
+ filter_xd.mode_info_context->mbmi.mode = filter_xd.best_sse_inter_mode;
+ filter_xd.mode_info_context->mbmi.mv = filter_xd.best_sse_mv;
+ filter_xd.mode_info_context->mbmi.need_to_clamp_mvs =
+ filter_xd.need_to_clamp_best_mvs;
+ mv_col = filter_xd.best_sse_mv.as_mv.col;
+ mv_row = filter_xd.best_sse_mv.as_mv.row;
+ if (filter_xd.mode_info_context->mbmi.mode <= B_PRED ||
+ (mv_row*mv_row + mv_col*mv_col <= NOISE_MOTION_THRESHOLD &&
+ sse_diff < SSE_DIFF_THRESHOLD))
+ {
+ // Handle intra blocks as referring to last frame with zero motion and
+ // let the absolute pixel difference affect the filter factor.
+ // Also treat a small amount of motion as a random walk due to noise,
+ // provided it does not lead to a much bigger error.
+ // Note that any changes to the mode info only affect the denoising.

+ filter_xd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
+ filter_xd.mode_info_context->mbmi.mode = ZEROMV;
+ filter_xd.mode_info_context->mbmi.mv.as_int = 0;
+ x->e_mbd.best_sse_inter_mode = ZEROMV;
+ x->e_mbd.best_sse_mv.as_int = 0;
+ best_sse = zero_mv_sse;
+ }
+ if (!x->skip)
+ {
+ vp8_build_inter_predictors_mb(&filter_xd);
+ }
+ else
+ {
+ vp8_build_inter16x16_predictors_mb(&filter_xd,
+ filter_xd.dst.y_buffer,
+ filter_xd.dst.u_buffer,
+ filter_xd.dst.v_buffer,
+ filter_xd.dst.y_stride,
+ filter_xd.dst.uv_stride);
+ }
+ return best_sse;
+}
+
+static void denoiser_filter(YV12_BUFFER_CONFIG* mc_running_avg,
+ YV12_BUFFER_CONFIG* running_avg,
+ MACROBLOCK* signal,
+ unsigned int motion_magnitude2,
+ int y_offset,
+ int uv_offset)
+{
+ unsigned char* sig = signal->thismb;
+ int sig_stride = 16;
+ unsigned char* mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
+ int mc_avg_y_stride = mc_running_avg->y_stride;
+ unsigned char* running_avg_y = running_avg->y_buffer + y_offset;
+ int avg_y_stride = running_avg->y_stride;
+ int r, c;
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ int diff;
+ int absdiff = 0;
+ unsigned int filter_coefficient;
+ absdiff = sig[c] - mc_running_avg_y[c];
+ absdiff = absdiff > 0 ? absdiff : -absdiff;
+ assert(absdiff >= 0 && absdiff < 256);
+ filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
+ // Allow some additional filtering of static blocks, or blocks with very
+ // small motion vectors.
+ filter_coefficient += filter_coefficient / (3 + (motion_magnitude2 >> 3));
+ filter_coefficient = filter_coefficient > 255 ? 255 : filter_coefficient;
+
+ running_avg_y[c] = blend(mc_running_avg_y[c], sig[c], filter_coefficient);
+ diff = sig[c] - running_avg_y[c];
+
+ if (diff * diff < NOISE_DIFF2_THRESHOLD)
+ {
+ // Replace with mean to suppress the noise.
+ sig[c] = running_avg_y[c];
+ }
+ else
+ {
+ // Replace the filter state with the signal since the change in this
+ // pixel isn't classified as noise.
+ running_avg_y[c] = sig[c];
+ }
+ }
+ sig += sig_stride;
+ mc_running_avg_y += mc_avg_y_stride;
+ running_avg_y += avg_y_stride;
+ }
+}
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
+{
+ assert(denoiser);
+ denoiser->yv12_running_avg.flags = 0;
+ if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_running_avg), width,
+ height, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_denoiser_free(denoiser);
+ return 1;
+ }
+ denoiser->yv12_mc_running_avg.flags = 0;
+ if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_mc_running_avg), width,
+ height, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_denoiser_free(denoiser);
+ return 1;
+ }
+ vpx_memset(denoiser->yv12_running_avg.buffer_alloc, 0,
+ denoiser->yv12_running_avg.frame_size);
+ vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
+ denoiser->yv12_mc_running_avg.frame_size);
+ return 0;
+}
+
+void vp8_denoiser_free(VP8_DENOISER *denoiser)
+{
+ assert(denoiser);
+ vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg);
+ vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+}
+
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+ MACROBLOCK *x,
+ unsigned int best_sse,
+ unsigned int zero_mv_sse,
+ int recon_yoffset,
+ int recon_uvoffset) {
+ int mv_row;
+ int mv_col;
+ unsigned int motion_magnitude2;
+ // Motion compensate the running average.
+ best_sse = denoiser_motion_compensate(&denoiser->yv12_running_avg,
+ &denoiser->yv12_mc_running_avg,
+ x,
+ best_sse,
+ zero_mv_sse,
+ recon_yoffset,
+ recon_uvoffset);
+
+ mv_row = x->e_mbd.best_sse_mv.as_mv.row;
+ mv_col = x->e_mbd.best_sse_mv.as_mv.col;
+ motion_magnitude2 = mv_row*mv_row + mv_col*mv_col;
+ if (best_sse > SSE_THRESHOLD ||
+ motion_magnitude2 > 8 * NOISE_MOTION_THRESHOLD)
+ {
+ // No filtering of this block since it differs too much from the predictor,
+ // or the motion vector magnitude is considered too big.
+ vp8_copy_mem16x16(x->thismb, 16,
+ denoiser->yv12_running_avg.y_buffer + recon_yoffset,
+ denoiser->yv12_running_avg.y_stride);
+ return;
+ }
+ // Filter.
+ denoiser_filter(&denoiser->yv12_mc_running_avg,
+ &denoiser->yv12_running_avg,
+ x,
+ motion_magnitude2,
+ recon_yoffset,
+ recon_uvoffset);
+}
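
A rough worked example of the filter arithmetic above, using illustrative pixel values: with absdiff = 8 and zero motion, filter_coefficient = (255 << 8) / (256 + ((8 * 330) >> 3)) = 65280 / 586 = 111, boosted by 111 / 3 = 37 to 148. Blending a running-average sample of 100 with a signal sample of 108 then gives (148 * 100 + (256 - 148) * 108 + 128) >> 8 = 103, and since (108 - 103)^2 = 25 is below NOISE_DIFF2_THRESHOLD (75), the signal pixel is replaced by the filtered value 103.
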
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
new file mode 100644
index 000000000..343531bb1
--- /dev/null
+++ b/vp8/encoder/denoising.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_DENOISING_H_
+#define VP8_ENCODER_DENOISING_H_
+
+#include "block.h"
+
+typedef struct vp8_denoiser
+{
+ YV12_BUFFER_CONFIG yv12_running_avg;
+ YV12_BUFFER_CONFIG yv12_mc_running_avg;
+} VP8_DENOISER;
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height);
+
+void vp8_denoiser_free(VP8_DENOISER *denoiser);
+
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+ MACROBLOCK *x,
+ unsigned int best_sse,
+ unsigned int zero_mv_sse,
+ int recon_yoffset,
+ int recon_uvoffset);
+
+#endif // VP8_ENCODER_DENOISING_H_
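
A minimal usage sketch for this interface, assuming an encoder-side caller (variable names below are illustrative) that already has the best-mode SSE, the zero-MV SSE and the macroblock's reconstruction offsets from mode search:

VP8_DENOISER denoiser;

if (vp8_denoiser_allocate(&denoiser, width, height))
    return;                                   /* allocation failed */

/* per inter macroblock, after mode selection: */
vp8_denoiser_denoise_mb(&denoiser, x, best_sse, zero_mv_sse,
                        recon_yoffset, recon_uvoffset);

/* on encoder teardown: */
vp8_denoiser_free(&denoiser);
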
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 10f56078b..962a719c8 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -28,6 +28,9 @@
#include <limits.h>
#include "vp8/common/invtrans.h"
#include "vpx_ports/vpx_timer.h"
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+#include "bitstream.h"
+#endif
extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
@@ -373,10 +376,17 @@ void encode_mb_row(VP8_COMP *cpi,
int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
int map_index = (mb_row * cpi->common.mb_cols);
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ const int num_part = (1 << cm->multi_token_partition);
+ TOKENEXTRA * tp_start = cpi->tok;
+ vp8_writer *w;
+#endif
+
#if CONFIG_MULTITHREAD
const int nsync = cpi->mt_sync_range;
- const int rightmost_col = cm->mb_cols - 1;
+ const int rightmost_col = cm->mb_cols + nsync;
volatile const int *last_row_current_mb_col;
+ volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
@@ -384,6 +394,13 @@ void encode_mb_row(VP8_COMP *cpi,
last_row_current_mb_col = &rightmost_col;
#endif
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if(num_part > 1)
+ w= &cpi->bc[1 + (mb_row % num_part)];
+ else
+ w = &cpi->bc[1];
+#endif
+
// reset above block coeffs
xd->above_context = cm->above_context;
@@ -411,6 +428,10 @@ void encode_mb_row(VP8_COMP *cpi,
// for each macroblock col in image
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ *tp = cpi->tok;
+#endif
// Distance of Mb to the left & right edges, specified in
// 1/8th pel units as they are always compared to values
// that are in 1/8th pel units
@@ -435,12 +456,13 @@ void encode_mb_row(VP8_COMP *cpi,
vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
#if CONFIG_MULTITHREAD
- if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
+ if (cpi->b_multi_threaded != 0)
{
+ *current_mb_col = mb_col - 1; // set previous MB done
+
if ((mb_col & (nsync - 1)) == 0)
{
- while (mb_col > (*last_row_current_mb_col - nsync)
- && (*last_row_current_mb_col) != (cm->mb_cols - 1))
+ while (mb_col > (*last_row_current_mb_col - nsync))
{
x86_pause_hint();
thread_sleep(0);
@@ -495,13 +517,13 @@ void encode_mb_row(VP8_COMP *cpi,
#endif
- // Count of last ref frame 0,0 useage
+ // Count of last ref frame 0,0 usage
if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
cpi->inter_zz_count ++;
// Special case code for cyclic refresh
// If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
- // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
+ // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
if ((cpi->current_layer == 0) &&
(cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled))
{
@@ -525,7 +547,14 @@ void encode_mb_row(VP8_COMP *cpi,
cpi->tplist[mb_row].stop = *tp;
- // Increment pointer into gf useage flags structure.
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ /* pack tokens for this MB */
+ {
+ int tok_count = *tp - tp_start;
+ pack_tokens(w, tp_start, tok_count);
+ }
+#endif
+ // Increment pointer into gf usage flags structure.
x->gf_active_ptr++;
// Increment the activity mask pointers.
@@ -539,39 +568,29 @@ void encode_mb_row(VP8_COMP *cpi,
recon_yoffset += 16;
recon_uvoffset += 8;
- // Keep track of segment useage
+ // Keep track of segment usage
segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
// skip to next mb
xd->mode_info_context++;
x->partition_info++;
-
xd->above_context++;
-#if CONFIG_MULTITHREAD
- if (cpi->b_multi_threaded != 0)
- {
- cpi->mt_current_mb_col[mb_row] = mb_col;
- }
-#endif
}
//extend the recon for intra prediction
- vp8_extend_mb_row(
- &cm->yv12_fb[dst_fb_idx],
- xd->dst.y_buffer + 16,
- xd->dst.u_buffer + 8,
- xd->dst.v_buffer + 8);
+ vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+ xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8,
+ xd->dst.v_buffer + 8);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded != 0)
+ *current_mb_col = rightmost_col;
+#endif
// this is to account for the border
xd->mode_info_context++;
x->partition_info++;
-
-#if CONFIG_MULTITHREAD
- if ((cpi->b_multi_threaded != 0) && (mb_row == cm->mb_rows - 1))
- {
- sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
- }
-#endif
}
void init_encode_frame_mb_context(VP8_COMP *cpi)
@@ -599,7 +618,7 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)
if (cm->frame_type == KEY_FRAME)
vp8_init_mbmode_probs(cm);
- // Copy data over into macro block data sturctures.
+ // Copy data over into macro block data structures.
x->src = * cpi->Source;
xd->pre = cm->yv12_fb[cm->lst_fb_idx];
xd->dst = cm->yv12_fb[cm->new_fb_idx];
@@ -656,10 +675,13 @@ void vp8_encode_frame(VP8_COMP *cpi)
MACROBLOCK *const x = & cpi->mb;
VP8_COMMON *const cm = & cpi->common;
MACROBLOCKD *const xd = & x->e_mbd;
-
TOKENEXTRA *tp = cpi->tok;
int segment_counts[MAX_MB_SEGMENTS];
int totalrate;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ BOOL_CODER * bc = &cpi->bc[1]; // bc[0] is for control partition
+ const int num_part = (1 << cm->multi_token_partition);
+#endif
vpx_memset(segment_counts, 0, sizeof(segment_counts));
totalrate = 0;
@@ -688,15 +710,13 @@ void vp8_encode_frame(VP8_COMP *cpi)
xd->subpixel_predict16x16 = vp8_bilinear_predict16x16;
}
- // Reset frame count of inter 0,0 motion vector useage.
+ // Reset frame count of inter 0,0 motion vector usage.
cpi->inter_zz_count = 0;
- vpx_memset(segment_counts, 0, sizeof(segment_counts));
-
cpi->prediction_error = 0;
cpi->intra_error = 0;
cpi->skip_true_count = 0;
- cpi->skip_false_count = 0;
+ cpi->tok_count = 0;
#if 0
// Experimental code
@@ -707,6 +727,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
xd->mode_info_context = cm->mi;
vp8_zero(cpi->MVcount);
+
vp8_zero(cpi->coef_counts);
vp8cx_frame_init_quantizer(cpi);
@@ -725,9 +746,22 @@ void vp8_encode_frame(VP8_COMP *cpi)
build_activity_map(cpi);
}
- // re-initencode frame context.
+ // re-init encode frame context.
init_encode_frame_mb_context(cpi);
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ int i;
+ for(i = 0; i < num_part; i++)
+ {
+ vp8_start_encode(&bc[i], cpi->partition_d[i + 1],
+ cpi->partition_d_end[i + 1]);
+ bc[i].error = &cm->error;
+ }
+ }
+
+#endif
+
{
struct vpx_usec_timer emr_timer;
vpx_usec_timer_start(&emr_timer);
@@ -751,7 +785,11 @@ void vp8_encode_frame(VP8_COMP *cpi)
{
vp8_zero(cm->left_context)
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ tp = cpi->tok;
+#else
tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+#endif
encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
@@ -764,12 +802,14 @@ void vp8_encode_frame(VP8_COMP *cpi)
x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
+ if(mb_row == cm->mb_rows - 1)
+ {
+ sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+ }
}
sem_wait(&cpi->h_event_end_encoding); /* wait for other threads to finish */
- cpi->tok_count = 0;
-
for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
{
cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
@@ -802,9 +842,12 @@ void vp8_encode_frame(VP8_COMP *cpi)
// for each macroblock row in image
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
{
-
vp8_zero(cm->left_context)
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ tp = cpi->tok;
+#endif
+
encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
// adjust to the next row of mbs
@@ -814,16 +857,25 @@ void vp8_encode_frame(VP8_COMP *cpi)
}
cpi->tok_count = tp - cpi->tok;
+ }
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ int i;
+ for(i = 0; i < num_part; i++)
+ {
+ vp8_stop_encode(&bc[i]);
+ cpi->partition_sz[i+1] = bc[i].pos;
+ }
}
+#endif
vpx_usec_timer_mark(&emr_timer);
cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
-
}
- // Work out the segment probabilites if segmentation is enabled
+ // Work out the segment probabilities if segmentation is enabled
if (xd->segmentation_enabled)
{
int tot_count;
@@ -911,20 +963,16 @@ void vp8_encode_frame(VP8_COMP *cpi)
}
#endif
- // Adjust the projected reference frame useage probability numbers to reflect
- // what we have just seen. This may be usefull when we make multiple itterations
+#if ! CONFIG_REALTIME_ONLY
+ // Adjust the projected reference frame usage probability numbers to reflect
+ // what we have just seen. This may be useful when we make multiple iterations
// of the recode loop rather than continuing to use values from the previous frame.
if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
(!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)))
{
vp8_convert_rfct_to_prob(cpi);
}
-
-#if 0
- // Keep record of the total distortion this time around for future use
- cpi->last_frame_distortion = cpi->frame_distortion;
#endif
-
}
void vp8_setup_block_ptrs(MACROBLOCK *x)
{
@@ -1094,6 +1142,7 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
vp8_encode_intra16x16mbuv(x);
sum_intra_stats(cpi, x);
+
vp8_tokenize_mb(cpi, &x->e_mbd, t);
if (xd->mode_info_context->mbmi.mode != B_PRED)
@@ -1130,6 +1179,13 @@ int vp8cx_encode_inter_macroblock
else
x->encode_breakout = cpi->oxcf.encode_breakout;
+#if CONFIG_TEMPORAL_DENOISING
+ // Reset the best sse mode/mv for each macroblock.
+ x->e_mbd.best_sse_inter_mode = 0;
+ x->e_mbd.best_sse_mv.as_int = 0;
+ x->e_mbd.need_to_clamp_best_mvs = 0;
+#endif
+
if (cpi->sf.RD)
{
int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
@@ -1260,11 +1316,6 @@ int vp8cx_encode_inter_macroblock
if (!x->skip)
{
vp8_encode_inter16x16(x);
-
- // Clear mb_skip_coeff if mb_no_coeff_skip is not set
- if (!cpi->common.mb_no_coeff_skip)
- xd->mode_info_context->mbmi.mb_skip_coeff = 0;
-
}
else
vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
@@ -1287,17 +1338,17 @@ int vp8cx_encode_inter_macroblock
}
else
{
+ /* always set mb_skip_coeff as it is needed by the loopfilter */
+ xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+
if (cpi->common.mb_no_coeff_skip)
{
- xd->mode_info_context->mbmi.mb_skip_coeff = 1;
cpi->skip_true_count ++;
vp8_fix_contexts(xd);
}
else
{
vp8_stuff_mb(cpi, xd, t);
- xd->mode_info_context->mbmi.mb_skip_coeff = 0;
- cpi->skip_false_count ++;
}
}
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index d481c99d3..f73bcc59d 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -82,8 +82,8 @@ void vp8_encode_intra4x4mby(MACROBLOCK *mb)
{
int i;
- MACROBLOCKD *x = &mb->e_mbd;
- vp8_intra_prediction_down_copy(x);
+ MACROBLOCKD *xd = &mb->e_mbd;
+ intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
for (i = 0; i < 16; i++)
vp8_encode_intra4x4block(mb, i);
@@ -95,7 +95,11 @@ void vp8_encode_intra16x16mby(MACROBLOCK *x)
BLOCK *b = &x->block[0];
MACROBLOCKD *xd = &x->e_mbd;
- vp8_build_intra_predictors_mby_s(&x->e_mbd);
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->dst.y_buffer);
vp8_subtract_mby(x->src_diff, *(b->base_src),
b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);
@@ -112,7 +116,12 @@ void vp8_encode_intra16x16mbuv(MACROBLOCK *x)
{
MACROBLOCKD *xd = &x->e_mbd;
- vp8_build_intra_predictors_mbuv_s(&x->e_mbd);
+ vp8_build_intra_predictors_mbuv_s(xd, xd->dst.u_buffer - xd->dst.uv_stride,
+ xd->dst.v_buffer - xd->dst.uv_stride,
+ xd->dst.u_buffer - 1,
+ xd->dst.v_buffer - 1,
+ xd->dst.uv_stride,
+ xd->dst.u_buffer, xd->dst.v_buffer);
vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 7740e5db7..b549a7dca 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -13,6 +13,8 @@
#include "vp8/common/common.h"
#include "vp8/common/extend.h"
+#include "bitstream.h"
+
#if CONFIG_MULTITHREAD
extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
@@ -24,9 +26,9 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip
extern void vp8_build_block_offsets(MACROBLOCK *x);
extern void vp8_setup_block_ptrs(MACROBLOCK *x);
-extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
+extern void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
-static THREAD_FUNCTION loopfilter_thread(void *p_data)
+static THREAD_FUNCTION thread_loopfilter(void *p_data)
{
VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
VP8_COMMON *cm = &cpi->common;
@@ -41,7 +43,7 @@ static THREAD_FUNCTION loopfilter_thread(void *p_data)
if (cpi->b_multi_threaded == 0) // we're shutting down
break;
- loopfilter_frame(cpi, cm);
+ vp8_loopfilter_frame(cpi, cm);
sem_post(&cpi->h_event_end_lpf);
}
@@ -74,6 +76,10 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
MACROBLOCK *x = &mbri->mb;
MACROBLOCKD *xd = &x->e_mbd;
TOKENEXTRA *tp ;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ TOKENEXTRA *tp_start = cpi->tok + (1 + ithread) * (16 * 24);
+ const int num_part = (1 << cm->multi_token_partition);
+#endif
int *segment_counts = mbri->segment_counts;
int *totalrate = &mbri->totalrate;
@@ -91,9 +97,15 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
int map_index = (mb_row * cm->mb_cols);
- volatile int *last_row_current_mb_col;
+ volatile const int *last_row_current_mb_col;
+ volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
+#else
tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));
+ cpi->tplist[mb_row].start = tp;
+#endif
last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
@@ -107,25 +119,27 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
recon_yoffset = (mb_row * recon_y_stride * 16);
recon_uvoffset = (mb_row * recon_uv_stride * 8);
- cpi->tplist[mb_row].start = tp;
-
- //printf("Thread mb_row = %d\n", mb_row);
-
// Set the mb activity pointer to the start of the row.
x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
// for each macroblock col in image
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
+ *current_mb_col = mb_col - 1;
+
if ((mb_col & (nsync - 1)) == 0)
{
- while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != cm->mb_cols - 1)
+ while (mb_col > (*last_row_current_mb_col - nsync))
{
x86_pause_hint();
thread_sleep(0);
}
}
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ tp = tp_start;
+#endif
+
// Distance of Mb to the various image edges.
// These specified to 8th pel as they are always compared to values that are in 1/8th pel units
xd->mb_to_left_edge = -((mb_col * 16) << 3);
@@ -154,7 +168,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
vp8_activity_masking(cpi, x);
// Is segmentation enabled
- // MB level adjutment to quantizer
+ // MB level adjustment to quantizer
if (xd->segmentation_enabled)
{
// Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
@@ -196,13 +210,13 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
#endif
- // Count of last ref frame 0,0 useage
+ // Count of last ref frame 0,0 usage
if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
cpi->inter_zz_count++;
// Special case code for cyclic refresh
// If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
- // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
+ // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
{
const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
@@ -223,9 +237,17 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
}
}
- cpi->tplist[mb_row].stop = tp;
- // Increment pointer into gf useage flags structure.
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ /* pack tokens for this MB */
+ {
+ int tok_count = tp - tp_start;
+ pack_tokens(w, tp_start, tok_count);
+ }
+#else
+ cpi->tplist[mb_row].stop = tp;
+#endif
+ // Increment pointer into gf usage flags structure.
x->gf_active_ptr++;
// Increment the activity mask pointers.
@@ -239,23 +261,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
recon_yoffset += 16;
recon_uvoffset += 8;
- // Keep track of segment useage
+ // Keep track of segment usage
segment_counts[xd->mode_info_context->mbmi.segment_id]++;
// skip to next mb
xd->mode_info_context++;
x->partition_info++;
xd->above_context++;
-
- cpi->mt_current_mb_col[mb_row] = mb_col;
}
- //extend the recon for intra prediction
- vp8_extend_mb_row(
- &cm->yv12_fb[dst_fb_idx],
- xd->dst.y_buffer + 16,
- xd->dst.u_buffer + 8,
- xd->dst.v_buffer + 8);
+ vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+ xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8,
+ xd->dst.v_buffer + 8);
+
+ *current_mb_col = mb_col + nsync;
// this is to account for the border
xd->mode_info_context++;
@@ -271,7 +291,6 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
if (mb_row == cm->mb_rows - 1)
{
- //SetEvent(cpi->h_event_main);
sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
}
}
@@ -468,6 +487,7 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
cpi->b_multi_threaded = 0;
cpi->encoding_thread_count = 0;
+ cpi->b_lpf_running = 0;
if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
{
@@ -526,7 +546,7 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
sem_init(&cpi->h_event_end_lpf, 0, 0);
lpfthd->ptr1 = (void *)cpi;
- pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd);
+ pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd);
}
}
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index b3b06ee70..cb8fd3e89 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -12,7 +12,7 @@
#include "limits.h"
#include "block.h"
#include "onyx_int.h"
-#include "variance.h"
+#include "vp8/common/variance.h"
#include "encodeintra.h"
#include "vp8/common/setupintrarecon.h"
#include "mcomp.h"
@@ -2953,10 +2953,26 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// We do three calculations for kf size.
// The first is based on the error score for the whole kf group.
- // The second (optionaly) on the key frames own error if this is smaller than the average for the group.
- // The final one insures that the frame receives at least the allocation it would have received based on its own error score vs the error score remaining
-
- allocation_chunks = ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost; // cpi->twopass.frames_to_key-1 because key frame itself is taken care of by kf_boost
+ // The second (optionally) on the key frame's own error if this is
+ // smaller than the average for the group.
+ // The final one ensures that the frame receives at least the
+ // allocation it would have received based on its own error score vs
+ // the error score remaining.
+ // Special case if the sequence appears almost totally static
+ // as measured by the decay accumulator. In this case we want to
+ // spend almost all of the bits on the key frame.
+ // cpi->twopass.frames_to_key-1 because key frame itself is taken
+ // care of by kf_boost.
+ if ( decay_accumulator >= 0.99 )
+ {
+ allocation_chunks =
+ ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
+ }
+ else
+ {
+ allocation_chunks =
+ ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
+ }
// Normalize Altboost and allocations chunck down to prevent overflow
while (kf_boost > 1000)
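
The firstpass.c hunk above switches the per-frame weight in the key-frame bit allocation from 100 down to 10 when the decay accumulator indicates a near-static sequence, so that kf_boost dominates the split. A minimal standalone sketch of that arithmetic, with made-up frame counts and boost values:

    #include <stdio.h>

    /* Illustrative only: mirrors the allocation_chunks selection above. */
    static long allocation_chunks(int frames_to_key, int kf_boost,
                                  double decay_accumulator)
    {
        if (decay_accumulator >= 0.99)              /* almost totally static */
            return (long)(frames_to_key - 1) * 10 + kf_boost;
        return (long)(frames_to_key - 1) * 100 + kf_boost;
    }

    int main(void)
    {
        /* Assumed values: 150 frames to the next key frame, kf_boost of 2000. */
        printf("moving clip : %ld chunks\n", allocation_chunks(150, 2000, 0.50));
        printf("static clip : %ld chunks\n", allocation_chunks(150, 2000, 0.995));
        return 0;
    }

For the moving clip the key frame gets 2000 of 16900 chunks; for the static clip it gets 2000 of 3490, i.e. most of the group's bits.
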
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index e6ab1648c..cdb0cb63c 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -13,7 +13,7 @@
#define __INC_MCOMP_H
#include "block.h"
-#include "variance.h"
+#include "vp8/common/variance.h"
#ifdef ENTROPY_STATS
extern void init_mv_ref_counts();
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index a36c6e757..d7b1bc1c8 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -42,6 +42,11 @@
#include <stdio.h>
#include <limits.h>
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+extern int vp8_update_coef_context(VP8_COMP *cpi);
+extern void vp8_update_coef_probs(VP8_COMP *cpi);
+#endif
+
extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
@@ -323,6 +328,9 @@ static void setup_features(VP8_COMP *cpi)
}
+static void dealloc_raw_frame_buffers(VP8_COMP *cpi);
+
+
static void dealloc_compressor_data(VP8_COMP *cpi)
{
vpx_free(cpi->tplist);
@@ -349,10 +357,7 @@ static void dealloc_compressor_data(VP8_COMP *cpi)
vp8_yv12_de_alloc_frame_buffer(&cpi->pick_lf_lvl_frame);
vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
-#if VP8_TEMPORAL_ALT_REF
- vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
-#endif
- vp8_lookahead_destroy(cpi->lookahead);
+ dealloc_raw_frame_buffers(cpi);
vpx_free(cpi->tok);
cpi->tok = 0;
@@ -1044,6 +1049,16 @@ static void alloc_raw_frame_buffers(VP8_COMP *cpi)
#endif
}
+
+static void dealloc_raw_frame_buffers(VP8_COMP *cpi)
+{
+#if VP8_TEMPORAL_ALT_REF
+ vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+#endif
+ vp8_lookahead_destroy(cpi->lookahead);
+}
+
+
static int vp8_alloc_partition_data(VP8_COMP *cpi)
{
vpx_free(cpi->mb.pip);
@@ -1096,8 +1111,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
vpx_free(cpi->tok);
{
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ unsigned int tokens = 8 * 24 * 16; /* one MB for each thread */
+#else
unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
-
+#endif
CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
}
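
With on-the-fly bitpacking enabled, the hunk above allocates only one macroblock's worth of tokens per encoding thread (8 * 24 * 16) instead of a whole frame's worth. A rough sketch of the difference in buffer size, assuming a 640x480 frame and eight threads:

    #include <stdio.h>

    int main(void)
    {
        const unsigned int mb_rows = 30, mb_cols = 40;   /* 640x480 in 16x16 MBs */
        const unsigned int tokens_per_mb = 24 * 16;
        const unsigned int threads = 8;                  /* assumed, as in the hunk */

        unsigned int whole_frame = mb_rows * mb_cols * tokens_per_mb;
        unsigned int on_the_fly  = threads * tokens_per_mb;

        printf("whole-frame token buffer: %u entries\n", whole_frame);  /* 460800 */
        printf("on-the-fly token buffer : %u entries\n", on_the_fly);   /* 3072 */
        return 0;
    }
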
@@ -1387,6 +1405,7 @@ void update_layer_contexts (VP8_COMP *cpi)
void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
{
VP8_COMMON *cm = &cpi->common;
+ int last_w, last_h;
if (!cpi)
return;
@@ -1503,6 +1522,10 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cm->refresh_last_frame = 1;
cm->refresh_entropy_probs = 1;
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ cpi->oxcf.token_partitions = 3;
+#endif
+
if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
cm->multi_token_partition =
(TOKEN_PARTITION) cpi->oxcf.token_partitions;
@@ -1594,6 +1617,10 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+ last_w = cm->Width;
+ last_h = cm->Height;
+
cm->Width = cpi->oxcf.Width;
cm->Height = cpi->oxcf.Height;
@@ -1619,6 +1646,9 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
}
+ if (last_w != cm->Width || last_h != cm->Height)
+ cpi->force_next_frame_intra = 1;
+
if (((cm->Width + 15) & 0xfffffff0) !=
cm->yv12_fb[cm->lst_fb_idx].y_width ||
((cm->Height + 15) & 0xfffffff0) !=
@@ -1650,6 +1680,17 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cpi->alt_ref_source = NULL;
cpi->is_src_frame_alt_ref = 0;
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ if (!cpi->denoiser.yv12_mc_running_avg.buffer_alloc)
+ {
+ int width = (cpi->oxcf.Width + 15) & ~15;
+ int height = (cpi->oxcf.Height + 15) & ~15;
+ vp8_denoiser_allocate(&cpi->denoiser, width, height);
+ }
+ }
+#endif
#if 0
// Experimental RD Code
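
The denoiser buffer added above is allocated with width and height rounded up to the next multiple of 16 via (x + 15) & ~15. A small sketch of that rounding, using assumed source dimensions:

    #include <stdio.h>

    static int align16(int x)
    {
        return (x + 15) & ~15;   /* round up to a multiple of 16 pixels */
    }

    int main(void)
    {
        /* Assumed source dimensions. */
        printf("354 -> %d\n", align16(354));  /* 368 */
        printf("288 -> %d\n", align16(288));  /* 288, already aligned */
        return 0;
    }
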
@@ -2284,6 +2325,9 @@ void vp8_remove_compressor(VP8_COMP **ptr)
vp8cx_remove_encoder_threads(cpi);
#endif
+#if CONFIG_TEMPORAL_DENOISING
+ vp8_denoiser_free(&cpi->denoiser);
+#endif
dealloc_compressor_data(cpi);
vpx_free(cpi->mb.ss);
vpx_free(cpi->tok);
@@ -3068,7 +3112,7 @@ void update_reference_frames(VP8_COMMON *cm)
}
}
-void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
+void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
{
if (cm->no_lpf)
{
@@ -3103,7 +3147,12 @@ void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
}
vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ vp8_yv12_extend_frame_borders(&cpi->denoiser.yv12_running_avg);
+ }
+#endif
}
static void encode_frame_to_data_rate
@@ -3147,15 +3196,9 @@ static void encode_frame_to_data_rate
// Test code for segmentation of gf/arf (0,0)
//segmentation_test_function( cpi);
- if (cpi->compressor_speed == 2)
+ if(cpi->force_next_frame_intra)
{
- if(cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
- {
- if(cpi->force_next_frame_intra)
- {
- cm->frame_type = KEY_FRAME; /* delayed intra frame */
- }
- }
+ cm->frame_type = KEY_FRAME; /* delayed intra frame */
cpi->force_next_frame_intra = 0;
}
@@ -3565,7 +3608,7 @@ static void encode_frame_to_data_rate
scale_and_extend_source(cpi->un_scaled_source, cpi);
-#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC
+#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC && !(CONFIG_TEMPORAL_DENOISING)
if (cpi->oxcf.noise_sensitivity > 0)
{
@@ -3704,12 +3747,49 @@ static void encode_frame_to_data_rate
vp8_setup_key_frame(cpi);
}
+#if CONFIG_MULTITHREAD
+ /* wait for the last picture loopfilter thread done */
+ if (cpi->b_lpf_running)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ if(cpi->oxcf.error_resilient_mode)
+ cm->refresh_entropy_probs = 0;
+
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ {
+ if (cm->frame_type == KEY_FRAME)
+ cm->refresh_entropy_probs = 1;
+ }
+
+ if (cm->refresh_entropy_probs == 0)
+ {
+ // save a copy for later refresh
+ vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+ }
+
+ vp8_update_coef_context(cpi);
+
+ vp8_update_coef_probs(cpi);
+
+ // transform / motion compensation build reconstruction frame
+ // +pack coef partitions
+ vp8_encode_frame(cpi);
+
+ /* cpi->projected_frame_size is not needed for RT mode */
+ }
+#else
// transform / motion compensation build reconstruction frame
vp8_encode_frame(cpi);
cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
-
+#endif
vp8_clear_system_state(); //__asm emms;
// Test to see if the stats generated for this frame indicate that we should have coded a key frame
@@ -4062,19 +4142,22 @@ static void encode_frame_to_data_rate
if (cpi->b_multi_threaded)
{
sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */
+ cpi->b_lpf_running = 1;
}
else
#endif
{
- loopfilter_frame(cpi, cm);
+ vp8_loopfilter_frame(cpi, cm);
}
update_reference_frames(cm);
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
if (cpi->oxcf.error_resilient_mode)
{
cm->refresh_entropy_probs = 0;
}
+#endif
#if CONFIG_MULTITHREAD
/* wait that filter_level is picked so that we can continue with stream packing */
@@ -4086,10 +4169,11 @@ static void encode_frame_to_data_rate
vp8_pack_bitstream(cpi, dest, dest_end, size);
#if CONFIG_MULTITHREAD
- /* wait for loopfilter thread done */
- if (cpi->b_multi_threaded)
+ /* if PSNR packets are generated we have to wait for the lpf */
+ if (cpi->b_lpf_running && cpi->b_calculate_psnr)
{
sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
}
#endif
@@ -4550,6 +4634,15 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
#endif
vpx_usec_timer_start(&timer);
+
+ /* Reinit the lookahead buffer if the frame size changes */
+ if (sd->y_width != cpi->oxcf.Width || sd->y_height != cpi->oxcf.Height)
+ {
+ assert(cpi->oxcf.lag_in_frames < 2);
+ dealloc_raw_frame_buffers(cpi);
+ alloc_raw_frame_buffers(cpi);
+ }
+
if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL))
res = -1;
@@ -4787,6 +4880,29 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
vpx_usec_timer_start(&ticktimer);
}
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ int i;
+ const int num_part = (1 << cm->multi_token_partition);
+ /* the available bytes in dest */
+ const unsigned long dest_size = dest_end - dest;
+ const int tok_part_buff_size = (dest_size * 9) / (10 * num_part);
+
+ unsigned char *dp = dest;
+
+ cpi->partition_d[0] = dp;
+ dp += dest_size/10; /* reserve 1/10 for control partition */
+ cpi->partition_d_end[0] = dp;
+
+ for(i = 0; i < num_part; i++)
+ {
+ cpi->partition_d[i + 1] = dp;
+ dp += tok_part_buff_size;
+ cpi->partition_d_end[i + 1] = dp;
+ }
+ }
+#endif
+
// start with a 0 size frame
*size = 0;
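
The block above carves the caller's output buffer into a control partition plus one buffer per token partition before encoding starts: roughly 1/10 of the space is reserved for the control partition and the remaining 9/10 is split evenly across the token partitions. A self-contained sketch of that layout, with an assumed buffer size and partition count:

    #include <stdio.h>

    int main(void)
    {
        unsigned char dest[10000];                     /* assumed output buffer */
        const unsigned long dest_size = sizeof(dest);
        const int num_part = 1 << 3;                   /* e.g. 8 token partitions */
        const unsigned long tok_part_size = (dest_size * 9) / (10 * num_part);
        unsigned char *dp = dest;
        int i;

        /* partition 0: control partition, ~1/10 of the buffer */
        unsigned char *ctrl_start = dp;
        dp += dest_size / 10;
        printf("control partition: %lu bytes\n", (unsigned long)(dp - ctrl_start));

        /* partitions 1..num_part: token partitions, ~9/10 split evenly */
        for (i = 0; i < num_part; i++)
        {
            unsigned char *start = dp;
            dp += tok_part_size;
            printf("token partition %d: %lu bytes\n", i + 1,
                   (unsigned long)(dp - start));
        }
        return 0;
    }

With these assumed numbers the control partition gets 1000 bytes and each of the eight token partitions gets 1125 bytes, filling the 10000-byte buffer exactly.
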
@@ -5086,6 +5202,15 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppfla
else
{
int ret;
+
+#if CONFIG_MULTITHREAD
+ if(cpi->b_lpf_running)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
#if CONFIG_POSTPROC
ret = vp8_post_proc_frame(&cpi->common, dest, flags);
#else
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 6920fc316..4b0060ed6 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -18,7 +18,7 @@
#include "treewriter.h"
#include "tokenize.h"
#include "vp8/common/onyxc_int.h"
-#include "variance.h"
+#include "vp8/common/variance.h"
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/entropy.h"
@@ -28,6 +28,9 @@
#include "mcomp.h"
#include "vp8/common/findnearmv.h"
#include "lookahead.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "vp8/encoder/denoising.h"
+#endif
//#define SPEEDSTATS 1
#define MIN_GF_INTERVAL 4
@@ -443,9 +446,12 @@ typedef struct VP8_COMP
unsigned int MVcount [2] [MVvals]; /* (row,col) MV cts this frame */
unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
+
//DECLARE_ALIGNED(16, int, coef_counts_backup [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]); //not used any more
//save vp8_tree_probs_from_distribution result for each frame to avoid repeat calculation
vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ char update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
int gfu_boost;
@@ -504,7 +510,6 @@ typedef struct VP8_COMP
int gf_bad_count;
int gf_update_recommended;
int skip_true_count;
- int skip_false_count;
unsigned char *segmentation_map;
signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; // Segment data (can be deltas or absolute values)
@@ -526,6 +531,7 @@ typedef struct VP8_COMP
int mt_sync_range;
int b_multi_threaded;
int encoding_thread_count;
+ int b_lpf_running;
pthread_t *h_encoding_thread;
pthread_t h_filter_thread;
@@ -543,6 +549,8 @@ typedef struct VP8_COMP
TOKENLIST *tplist;
unsigned int partition_sz[MAX_PARTITIONS];
+ unsigned char *partition_d[MAX_PARTITIONS];
+ unsigned char *partition_d_end[MAX_PARTITIONS];
// end of multithread data
@@ -656,6 +664,10 @@ typedef struct VP8_COMP
int droppable;
+#if CONFIG_TEMPORAL_DENOISING
+ VP8_DENOISER denoiser;
+#endif
+
// Coding layer state variables
unsigned int current_layer;
LAYER_CONTEXT layer_context[MAX_LAYERS];
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 0fc72d7eb..24e041f8d 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -20,10 +20,13 @@
#include "encodemb.h"
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra4x4.h"
-#include "variance.h"
+#include "vp8/common/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
extern int VP8_UVSSE(MACROBLOCK *x);
@@ -184,7 +187,7 @@ static int pick_intra4x4mby_modes
int distortion = 0;
unsigned int *bmode_costs;
- vp8_intra_prediction_down_copy(xd);
+ intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
bmode_costs = mb->inter_bmode_costs;
@@ -450,6 +453,48 @@ void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, int *dissim,
}
#endif
+static void check_for_encode_breakout(unsigned int sse, MACROBLOCK* x)
+{
+ if (sse < x->encode_breakout)
+ {
+ // Check u and v to make sure skip is ok
+ int sse2 = 0;
+
+ sse2 = VP8_UVSSE(x);
+
+ if (sse2 * 2 < x->encode_breakout)
+ x->skip = 1;
+ else
+ x->skip = 0;
+ }
+}
+
+static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2, VP8_COMP *cpi, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+ int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
+ int this_rd;
+ /* Exit early and don't compute the distortion if this macroblock
+ * is marked inactive. */
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+ {
+ *sse = 0;
+ *distortion2 = 0;
+ x->skip = 1;
+ return INT_MAX;
+ }
+
+ if((this_mode != NEWMV) ||
+ !(cpi->sf.half_pixel_search) || cpi->common.full_pixel==1)
+ *distortion2 = get_inter_mbpred_error(x,
+ &cpi->fn_ptr[BLOCK_16X16],
+ sse, mv);
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
+
+ check_for_encode_breakout(*sse, x);
+ return this_rd;
+}
void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int recon_uvoffset, int *returnrate,
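
check_for_encode_breakout(), added above, factors out the early-skip test: when the luma SSE is under the breakout threshold, the chroma SSE is checked as well, and the block is skipped only if twice the chroma SSE also stays under the threshold. A simplified standalone version of that decision (the SSE values and threshold below are assumptions):

    #include <stdio.h>

    /* Simplified: the real code sets or clears x->skip in place and leaves it
     * untouched when the luma SSE is already above the threshold. */
    static int breakout_skip(unsigned int y_sse, unsigned int uv_sse,
                             unsigned int encode_breakout)
    {
        if (y_sse < encode_breakout)
            return (uv_sse * 2 < encode_breakout) ? 1 : 0;
        return 0;
    }

    int main(void)
    {
        printf("skip? %d\n", breakout_skip(400, 100, 1000));  /* 1: both small */
        printf("skip? %d\n", breakout_skip(400, 700, 1000));  /* 0: chroma too large */
        return 0;
    }
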
@@ -476,7 +521,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int distortion2;
int bestsme = INT_MAX;
int best_mode_index = 0;
- unsigned int sse = INT_MAX, best_sse = INT_MAX;
+ unsigned int sse = INT_MAX, best_rd_sse = INT_MAX;
+#if CONFIG_TEMPORAL_DENOISING
+ unsigned int zero_mv_sse = 0, best_sse = INT_MAX;
+#endif
int_mv mvp;
@@ -488,9 +536,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int ref_frame_map[4];
int sign_bias = 0;
- int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode,
- when Speed >= 15, no sub-pixel search. */
-
#if CONFIG_MULTI_RES_ENCODING
int dissim = INT_MAX;
int parent_ref_frame = 0;
@@ -657,7 +702,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
{
case B_PRED:
/* Pass best so far to pick_intra4x4mby_modes to use as breakout */
- distortion2 = best_sse;
+ distortion2 = best_rd_sse;
pick_intra4x4mby_modes(x, &rate, &distortion2);
if (distortion2 == INT_MAX)
@@ -905,43 +950,38 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
x->e_mbd.mode_info_context->mbmi.mv.as_int =
mode_mv[this_mode].as_int;
-
- /* Exit early and don't compute the distortion if this macroblock
- * is marked inactive. */
- if (cpi->active_map_enabled && x->active_ptr[0] == 0)
- {
- sse = 0;
- distortion2 = 0;
- x->skip = 1;
- break;
- }
-
- if((this_mode != NEWMV) ||
- !(have_subp_search) || cpi->common.full_pixel==1)
- distortion2 = get_inter_mbpred_error(x,
- &cpi->fn_ptr[BLOCK_16X16],
- &sse, mode_mv[this_mode]);
-
- this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-
- if (sse < x->encode_breakout)
- {
- // Check u and v to make sure skip is ok
- int sse2 = 0;
-
- sse2 = VP8_UVSSE(x);
-
- if (sse2 * 2 < x->encode_breakout)
- x->skip = 1;
- else
- x->skip = 0;
- }
+ this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x);
break;
default:
break;
}
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ // Store for later use by denoiser.
+ if (this_mode == ZEROMV &&
+ x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
+ {
+ zero_mv_sse = sse;
+ }
+
+ // Store the best NEWMV in x for later use in the denoiser.
+ // We are restricted to the LAST_FRAME since the denoiser only keeps
+ // one filter state.
+ if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+ x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
+ {
+ best_sse = sse;
+ x->e_mbd.best_sse_inter_mode = NEWMV;
+ x->e_mbd.best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+ x->e_mbd.need_to_clamp_best_mvs =
+ x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+ }
+ }
+#endif
+
if (this_rd < best_rd || x->skip)
{
// Note index of best mode
@@ -949,7 +989,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
*returnrate = rate2;
*returndistortion = distortion2;
- best_sse = sse;
+ best_rd_sse = sse;
best_rd = this_rd;
vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
sizeof(MB_MODE_INFO));
@@ -1011,6 +1051,43 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
cpi->error_bins[this_rdbin] ++;
}
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ if (x->e_mbd.best_sse_inter_mode == DC_PRED) {
+ // No best MV found.
+ x->e_mbd.best_sse_inter_mode = best_mbmode.mode;
+ x->e_mbd.best_sse_mv = best_mbmode.mv;
+ x->e_mbd.need_to_clamp_best_mvs = best_mbmode.need_to_clamp_mvs;
+ best_sse = best_rd_sse;
+ }
+ vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+ recon_yoffset, recon_uvoffset);
+
+ // Reevaluate ZEROMV after denoising.
+ if (best_mbmode.ref_frame == INTRA_FRAME)
+ {
+ int this_rd = 0;
+ rate2 = 0;
+ distortion2 = 0;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
+ rate2 += x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+ this_mode = ZEROMV;
+ rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+ x->e_mbd.mode_info_context->mbmi.mode = this_mode;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x);
+
+ if (this_rd < best_rd || x->skip)
+ {
+ vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
+ }
+ }
+ }
+#endif
+
if (cpi->is_src_frame_alt_ref &&
(best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
{
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 1c43c1171..472e85f2b 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -392,10 +392,17 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY)
? cpi->avg_frame_qindex : cpi->ni_av_qi;
- // Boost depends somewhat on frame rate
- kf_boost = (int)(2 * cpi->output_frame_rate - 16);
+ int initial_boost = 24; // Corresponds to: |2.5 * per_frame_bandwidth|
+ // Boost depends somewhat on frame rate: only used for 1 layer case.
+ if (cpi->oxcf.number_of_layers == 1) {
+ kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16));
+ }
+ else {
+ // Initial factor: set target size to: |2.5 * per_frame_bandwidth|.
+ kf_boost = initial_boost;
+ }
- // adjustment up based on q
+ // adjustment up based on q: this factor ranges from ~1.2 to 2.2.
kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;
// frame separation adjustment ( down)
@@ -403,6 +410,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
kf_boost = (int)(kf_boost
* cpi->frames_since_key / (cpi->output_frame_rate / 2));
+ // Minimum target size is |2 * per_frame_bandwidth|.
if (kf_boost < 16)
kf_boost = 16;
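
calc_iframe_target_size() above now starts from a floor of 24 (about 2.5x the per-frame bandwidth), uses the frame-rate term only in the single-layer case, then applies the Q-dependent adjustment and a final floor of 16. A sketch of that sequence with assumed inputs; the 1.5 factor stands in for kf_boost_qadjustment[Q] / 100 and the frame-separation scaling is omitted:

    #include <stdio.h>
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void)
    {
        const double output_frame_rate = 30.0;   /* assumed */
        const int number_of_layers = 1;          /* assumed */
        const int initial_boost = 24;            /* ~2.5 * per_frame_bandwidth */
        int kf_boost;

        if (number_of_layers == 1)
            kf_boost = MAX(initial_boost, (int)(2 * output_frame_rate - 16));
        else
            kf_boost = initial_boost;            /* multi-layer: fixed initial factor */

        kf_boost = kf_boost * 150 / 100;         /* assumed Q adjustment of 1.5 */

        if (kf_boost < 16)                       /* ~2 * per_frame_bandwidth minimum */
            kf_boost = 16;

        printf("kf_boost = %d\n", kf_boost);     /* 66 for these inputs */
        return 0;
    }
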
@@ -1525,6 +1533,15 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
}
}
}
+
+ // For very small rate targets, where the fractional adjustment
+ // (e.g. * 7/8) may be tiny, make sure there is at least a minimum
+ // range.
+ *frame_over_shoot_limit += 200;
+ *frame_under_shoot_limit -= 200;
+ if ( *frame_under_shoot_limit < 0 )
+ *frame_under_shoot_limit = 0;
+
}
}
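
The change above widens the over/undershoot window by 200 bits on each side and clamps the lower bound at zero, so a fractional limit such as 7/8 of a tiny target still leaves a usable range. A small sketch, with an assumed target size and assumed 9/8 and 7/8 pre-adjustment limits:

    #include <stdio.h>

    int main(void)
    {
        const int target = 300;                       /* assumed tiny frame target */
        int frame_over_shoot_limit  = target * 9 / 8; /* assumed pre-adjust limits */
        int frame_under_shoot_limit = target * 7 / 8;

        frame_over_shoot_limit  += 200;
        frame_under_shoot_limit -= 200;
        if (frame_under_shoot_limit < 0)
            frame_under_shoot_limit = 0;

        printf("allowed size range: [%d, %d] bits\n",
               frame_under_shoot_limit, frame_over_shoot_limit);  /* [62, 537] */
        return 0;
    }
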
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 433412d73..fb5b0f49f 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -16,7 +16,6 @@
#include "vpx_config.h"
#include "vpx_rtcd.h"
#include "vp8/common/pragmas.h"
-
#include "tokenize.h"
#include "treewriter.h"
#include "onyx_int.h"
@@ -26,18 +25,41 @@
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/findnearmv.h"
+#include "vp8/common/quant_common.h"
#include "encodemb.h"
#include "quantize.h"
-#include "variance.h"
+#include "vp8/common/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/systemdependent.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
#define MAXF(a,b) (((a) > (b)) ? (a) : (b))
+typedef struct rate_distortion_struct
+{
+ int rate2;
+ int rate_y;
+ int rate_uv;
+ int distortion2;
+ int distortion_uv;
+} RATE_DISTORTION;
+
+typedef struct best_mode_struct
+{
+ int yrd;
+ int rd;
+ int intra_rd;
+ MB_MODE_INFO mbmode;
+ union b_mode_info bmodes[16];
+ PARTITION_INFO partition;
+} BEST_MODE;
+
static const int auto_speed_thresh[17] =
{
1000,
@@ -137,9 +159,11 @@ static void fill_token_costs(
for (i = 0; i < BLOCK_TYPES; i++)
for (j = 0; j < COEF_BANDS; j++)
for (k = 0; k < PREV_COEF_CONTEXTS; k++)
-
- vp8_cost_tokens((int *)(c [i][j][k]), p [i][j][k], vp8_coef_tree);
-
+ // check for pt==0 and band > 1 if block type 0, band > 0 if block type 1
+ if(k==0 && j>(i==0) )
+ vp8_cost_tokens2((int *)(c [i][j][k]), p [i][j][k], vp8_coef_tree,2);
+ else
+ vp8_cost_tokens((int *)(c [i][j][k]), p [i][j][k], vp8_coef_tree);
}
static int rd_iifactor [ 32 ] = { 4, 4, 3, 2, 1, 0, 0, 0,
@@ -199,7 +223,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
int q;
int i;
double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0;
- double rdconst = 2.70;
+ double rdconst = 2.80;
vp8_clear_system_state(); //__asm emms;
@@ -701,7 +725,7 @@ static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
- vp8_intra_prediction_down_copy(xd);
+ intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
bmode_costs = mb->inter_bmode_costs;
@@ -739,7 +763,7 @@ static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
return INT_MAX;
*Rate = cost;
- *rate_y += tot_rate_y;
+ *rate_y = tot_rate_y;
*Distortion = distortion;
return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
@@ -1709,6 +1733,181 @@ static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
}
}
+static int evaluate_inter_mode_rd(int mdcounts[4],
+ RATE_DISTORTION* rd,
+ int* disable_skip,
+ VP8_COMP *cpi, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+ BLOCK *b = &x->block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+ int distortion;
+ vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);
+
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
+ x->skip = 1;
+ }
+ else if (x->encode_breakout)
+ {
+ unsigned int sse;
+ unsigned int var;
+ int threshold = (xd->block[0].dequant[1]
+ * xd->block[0].dequant[1] >>4);
+
+ if(threshold < x->encode_breakout)
+ threshold = x->encode_breakout;
+
+ var = vp8_variance16x16
+ (*(b->base_src), b->src_stride,
+ x->e_mbd.predictor, 16, &sse);
+
+ if (sse < threshold)
+ {
+ unsigned int q2dc = xd->block[24].dequant[0];
+ /* If there is no codeable 2nd order dc
or a very small uniform pixel change */
+ if ((sse - var < q2dc * q2dc >>4) ||
+ (sse /2 > var && sse-var < 64))
+ {
+ // Check u and v to make sure skip is ok
+ int sse2= VP8_UVSSE(x);
+ if (sse2 * 2 < threshold)
+ {
+ x->skip = 1;
+ rd->distortion2 = sse + sse2;
+ rd->rate2 = 500;
+
+ /* for best_yrd calculation */
+ rd->rate_uv = 0;
+ rd->distortion_uv = sse2;
+
+ *disable_skip = 1;
+ return RDCOST(x->rdmult, x->rddiv, rd->rate2,
+ rd->distortion2);
+ }
+ }
+ }
+ }
+
+
+ //intermodecost[mode_index] = vp8_cost_mv_ref(this_mode, mdcounts); // Experimental debug code
+
+ // Add in the Mv/mode cost
+ rd->rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+
+ // Y cost and distortion
+ macro_block_yrd(x, &rd->rate_y, &distortion);
+ rd->rate2 += rd->rate_y;
+ rd->distortion2 += distortion;
+
+ // UV cost and distortion
+ rd_inter16x16_uv(cpi, x, &rd->rate_uv, &rd->distortion_uv,
+ cpi->common.full_pixel);
+ rd->rate2 += rd->rate_uv;
+ rd->distortion2 += rd->distortion_uv;
+ return INT_MAX;
+}
+
+static int calculate_final_rd_costs(int this_rd,
+ RATE_DISTORTION* rd,
+ int* other_cost,
+ int disable_skip,
+ int uv_intra_tteob,
+ int intra_rd_penalty,
+ VP8_COMP *cpi, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+ // Where skip is allowable, add in the default per-mb cost for the no-skip case.
+ // Where we then decide to skip, we have to delete this and replace it with the
+ // cost of signalling a skip.
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ *other_cost += vp8_cost_bit(cpi->prob_skip_false, 0);
+ rd->rate2 += *other_cost;
+ }
+
+ /* Estimate the reference frame signaling cost and add it
+ * to the rolling cost variable.
+ */
+ rd->rate2 +=
+ x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+ if (!disable_skip)
+ {
+ // Test for the condition where the skip block will be activated because there are no non-zero coefficients, and make any necessary adjustment for rate
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ int i;
+ int tteob;
+ int has_y2_block = (this_mode!=SPLITMV && this_mode!=B_PRED);
+
+ tteob = 0;
+ if(has_y2_block)
+ tteob += x->e_mbd.eobs[24];
+
+ for (i = 0; i < 16; i++)
+ tteob += (x->e_mbd.eobs[i] > has_y2_block);
+
+ if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+ {
+ for (i = 16; i < 24; i++)
+ tteob += x->e_mbd.eobs[i];
+ }
+ else
+ tteob += uv_intra_tteob;
+
+ if (tteob == 0)
+ {
+ rd->rate2 -= (rd->rate_y + rd->rate_uv);
+ //for best_yrd calculation
+ rd->rate_uv = 0;
+
+ // Back out no skip flag costing and add in skip flag costing
+ if (cpi->prob_skip_false)
+ {
+ int prob_skip_cost;
+
+ prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
+ prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
+ rd->rate2 += prob_skip_cost;
+ *other_cost += prob_skip_cost;
+ }
+ }
+ }
+ // Calculate the final RD estimate for this mode
+ this_rd = RDCOST(x->rdmult, x->rddiv, rd->rate2, rd->distortion2);
+ if (this_rd < INT_MAX && x->e_mbd.mode_info_context->mbmi.ref_frame
+ == INTRA_FRAME)
+ this_rd += intra_rd_penalty;
+ }
+ return this_rd;
+}
+
+static void update_best_mode(BEST_MODE* best_mode, int this_rd,
+ RATE_DISTORTION* rd, int other_cost, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+
+ other_cost +=
+ x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+ /* Calculate the final y RD estimate for this mode */
+ best_mode->yrd = RDCOST(x->rdmult, x->rddiv, (rd->rate2-rd->rate_uv-other_cost),
+ (rd->distortion2-rd->distortion_uv));
+
+ best_mode->rd = this_rd;
+ vpx_memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+ vpx_memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
+
+ if ((this_mode == B_PRED) || (this_mode == SPLITMV))
+ {
+ int i;
+ for (i = 0; i < 16; i++)
+ {
+ best_mode->bmodes[i] = x->e_mbd.block[i].bmi;
+ }
+ }
+}
void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int recon_uvoffset, int *returnrate,
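
evaluate_inter_mode_rd(), factored out above, raises the encode-breakout threshold to at least dequant[1]^2 / 16 so the breakout scales with the quantizer. A small illustration of that effective threshold; the dequant and breakout values here are assumptions:

    #include <stdio.h>

    int main(void)
    {
        const unsigned int encode_breakout = 800;  /* assumed static breakout */
        const int dequant_ac = 120;                /* assumed xd->block[0].dequant[1] */
        int threshold = (dequant_ac * dequant_ac) >> 4;

        if (threshold < (int)encode_breakout)
            threshold = encode_breakout;

        printf("effective threshold = %d\n", threshold);  /* 900 at this Q */
        return 0;
    }
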
@@ -1717,9 +1916,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
BLOCK *b = &x->block[0];
BLOCKD *d = &x->e_mbd.block[0];
MACROBLOCKD *xd = &x->e_mbd;
- union b_mode_info best_bmodes[16];
- MB_MODE_INFO best_mbmode;
- PARTITION_INFO best_partition;
int_mv best_ref_mv_sb[2];
int_mv mode_mv_sb[2][MB_MODE_COUNT];
int_mv best_ref_mv;
@@ -1727,21 +1923,18 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
MB_PREDICTION_MODE this_mode;
int num00;
int best_mode_index = 0;
+ BEST_MODE best_mode;
int i;
int mode_index;
int mdcounts[4];
int rate;
- int distortion;
- int best_rd = INT_MAX;
- int best_intra_rd = INT_MAX;
- int rate2, distortion2;
+ RATE_DISTORTION rd;
int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
- int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
- int distortion_uv;
- int best_yrd = INT_MAX;
+ int uv_intra_tteob = 0;
+ int uv_intra_done = 0;
- MB_PREDICTION_MODE uv_intra_mode;
+ MB_PREDICTION_MODE uv_intra_mode = 0;
int_mv mvp;
int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
int saddone=0;
@@ -1751,11 +1944,17 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int ref_frame_map[4];
int sign_bias = 0;
+ int intra_rd_penalty = 10* vp8_dc_quant(cpi->common.base_qindex,
+ cpi->common.y1dc_delta_q);
+
mode_mv = mode_mv_sb[sign_bias];
best_ref_mv.as_int = 0;
+ best_mode.rd = INT_MAX;
+ best_mode.yrd = INT_MAX;
+ best_mode.intra_rd = INT_MAX;
vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
- vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
- vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
+ vpx_memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
+ vpx_memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
/* Setup search priorities */
get_reference_search_order(cpi, ref_frame_map);
@@ -1784,10 +1983,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->skip = 0;
- x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
- rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);
- uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
-
for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
{
int this_rd = INT_MAX;
@@ -1796,20 +1991,19 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
// Test best rd so far against threshold for trying this mode.
- if (best_rd <= cpi->rd_threshes[mode_index])
+ if (best_mode.rd <= cpi->rd_threshes[mode_index])
continue;
if (this_ref_frame < 0)
continue;
// These variables hold are rolling total cost and distortion for this mode
- rate2 = 0;
- distortion2 = 0;
+ rd.rate2 = 0;
+ rd.distortion2 = 0;
this_mode = vp8_mode_order[mode_index];
x->e_mbd.mode_info_context->mbmi.mode = this_mode;
- x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
// Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
@@ -1880,6 +2074,24 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
vp8_update_zbin_extra(cpi, x);
}
+ if(!uv_intra_done && this_ref_frame == INTRA_FRAME)
+ {
+ rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
+ &uv_intra_rate_tokenonly,
+ &uv_intra_distortion);
+ uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
+
+ /*
+ * Total of the eobs is used later to further adjust rate2. Since uv
+ * block's intra eobs will be overwritten when we check inter modes,
+ * we need to save uv_intra_tteob here.
+ */
+ for (i = 16; i < 24; i++)
+ uv_intra_tteob += x->e_mbd.eobs[i];
+
+ uv_intra_done = 1;
+ }
+
switch (this_mode)
{
case B_PRED:
@@ -1887,16 +2099,17 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int tmp_rd;
// Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];
- tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd);
- rate2 += rate;
- distortion2 += distortion;
+ int distortion;
+ tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
+ rd.rate2 += rate;
+ rd.distortion2 += distortion;
- if(tmp_rd < best_yrd)
+ if(tmp_rd < best_mode.yrd)
{
- rate2 += uv_intra_rate;
- rate_uv = uv_intra_rate_tokenonly;
- distortion2 += uv_intra_distortion;
- distortion_uv = uv_intra_distortion;
+ rd.rate2 += uv_intra_rate;
+ rd.rate_uv = uv_intra_rate_tokenonly;
+ rd.distortion2 += uv_intra_distortion;
+ rd.distortion_uv = uv_intra_distortion;
}
else
{
@@ -1910,24 +2123,25 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
{
int tmp_rd;
int this_rd_thresh;
+ int distortion;
this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ? cpi->rd_threshes[THR_NEW1] : cpi->rd_threshes[THR_NEW3];
this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ? cpi->rd_threshes[THR_NEW2] : this_rd_thresh;
tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
- best_yrd, mdcounts,
- &rate, &rate_y, &distortion, this_rd_thresh) ;
+ best_mode.yrd, mdcounts,
+ &rate, &rd.rate_y, &distortion, this_rd_thresh) ;
- rate2 += rate;
- distortion2 += distortion;
+ rd.rate2 += rate;
+ rd.distortion2 += distortion;
// If even the 'Y' rd value of split is higher than best so far then dont bother looking at UV
- if (tmp_rd < best_yrd)
+ if (tmp_rd < best_mode.yrd)
{
// Now work out UV cost and add it in
- rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel);
- rate2 += rate_uv;
- distortion2 += distortion_uv;
+ rd_inter4x4_uv(cpi, x, &rd.rate_uv, &rd.distortion_uv, cpi->common.full_pixel);
+ rd.rate2 += rd.rate_uv;
+ rd.distortion2 += rd.distortion_uv;
}
else
{
@@ -1940,18 +2154,21 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
case V_PRED:
case H_PRED:
case TM_PRED:
+ {
+ int distortion;
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
vp8_build_intra_predictors_mby
(&x->e_mbd);
- macro_block_yrd(x, &rate_y, &distortion) ;
- rate2 += rate_y;
- distortion2 += distortion;
- rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
- rate2 += uv_intra_rate;
- rate_uv = uv_intra_rate_tokenonly;
- distortion2 += uv_intra_distortion;
- distortion_uv = uv_intra_distortion;
- break;
+ macro_block_yrd(x, &rd.rate_y, &distortion) ;
+ rd.rate2 += rd.rate_y;
+ rd.distortion2 += distortion;
+ rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+ rd.rate2 += uv_intra_rate;
+ rd.rate_uv = uv_intra_rate_tokenonly;
+ rd.distortion2 += uv_intra_distortion;
+ rd.distortion_uv = uv_intra_distortion;
+ }
+ break;
case NEWMV:
{
@@ -2094,7 +2311,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
// Add the new motion vector cost to our rolling cost variable
- rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
+ rd.rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
}
case NEARESTMV:
@@ -2116,165 +2333,57 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
continue;
vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
- vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);
-
- if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
- x->skip = 1;
- }
- else if (x->encode_breakout)
- {
- unsigned int sse;
- unsigned int var;
- int threshold = (xd->block[0].dequant[1]
- * xd->block[0].dequant[1] >>4);
-
- if(threshold < x->encode_breakout)
- threshold = x->encode_breakout;
-
- var = vp8_variance16x16
- (*(b->base_src), b->src_stride,
- x->e_mbd.predictor, 16, &sse);
-
- if (sse < threshold)
- {
- unsigned int q2dc = xd->block[24].dequant[0];
- /* If theres is no codeable 2nd order dc
- or a very small uniform pixel change change */
- if ((sse - var < q2dc * q2dc >>4) ||
- (sse /2 > var && sse-var < 64))
- {
- // Check u and v to make sure skip is ok
- int sse2= VP8_UVSSE(x);
- if (sse2 * 2 < threshold)
- {
- x->skip = 1;
- distortion2 = sse + sse2;
- rate2 = 500;
-
- /* for best_yrd calculation */
- rate_uv = 0;
- distortion_uv = sse2;
-
- disable_skip = 1;
- this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-
- break;
- }
- }
- }
- }
-
-
- //intermodecost[mode_index] = vp8_cost_mv_ref(this_mode, mdcounts); // Experimental debug code
-
- // Add in the Mv/mode cost
- rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
-
- // Y cost and distortion
- macro_block_yrd(x, &rate_y, &distortion);
- rate2 += rate_y;
- distortion2 += distortion;
-
- // UV cost and distortion
- rd_inter16x16_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel);
- rate2 += rate_uv;
- distortion2 += distortion_uv;
+ this_rd = evaluate_inter_mode_rd(mdcounts, &rd,
+ &disable_skip, cpi, x);
break;
default:
break;
}
- // Where skip is allowable add in the default per mb cost for the no skip case.
- // where we then decide to skip we have to delete this and replace it with the
- // cost of signallying a skip
- if (cpi->common.mb_no_coeff_skip)
- {
- other_cost += vp8_cost_bit(cpi->prob_skip_false, 0);
- rate2 += other_cost;
- }
-
- /* Estimate the reference frame signaling cost and add it
- * to the rolling cost variable.
- */
- rate2 +=
- x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+ this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+ disable_skip, uv_intra_tteob,
+ intra_rd_penalty, cpi, x);
- if (!disable_skip)
+ // Keep record of best intra distortion
+ if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+ (this_rd < best_mode.intra_rd) )
{
- // Test for the condition where skip block will be activated because there are no non zero coefficients and make any necessary adjustment for rate
- if (cpi->common.mb_no_coeff_skip)
- {
- int tteob;
-
- tteob = 0;
-
- for (i = 0; i <= 24; i++)
- {
- tteob += x->e_mbd.eobs[i];
- }
-
- if (tteob == 0)
- {
- rate2 -= (rate_y + rate_uv);
- //for best_yrd calculation
- rate_uv = 0;
-
- // Back out no skip flag costing and add in skip flag costing
- if (cpi->prob_skip_false)
- {
- int prob_skip_cost;
-
- prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
- prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
- rate2 += prob_skip_cost;
- other_cost += prob_skip_cost;
- }
- }
- }
- // Calculate the final RD estimate for this mode
- this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ best_mode.intra_rd = this_rd;
+ *returnintra = rd.distortion2 ;
}
- // Keep record of best intra distortion
- if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
- (this_rd < best_intra_rd) )
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
{
- best_intra_rd = this_rd;
- *returnintra = distortion2 ;
+ // Store the best NEWMV in x for later use in the denoiser.
+ // We are restricted to the LAST_FRAME since the denoiser only keeps
+ // one filter state.
+ if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+ x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
+ {
+ x->e_mbd.best_sse_inter_mode = NEWMV;
+ x->e_mbd.best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+ x->e_mbd.need_to_clamp_best_mvs =
+ x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+ }
}
+#endif
// Did this mode help.. i.i is it the new best mode
- if (this_rd < best_rd || x->skip)
+ if (this_rd < best_mode.rd || x->skip)
{
// Note index of best mode so far
best_mode_index = mode_index;
-
+ *returnrate = rd.rate2;
+ *returndistortion = rd.distortion2;
if (this_mode <= B_PRED)
{
x->e_mbd.mode_info_context->mbmi.uv_mode = uv_intra_mode;
/* required for left and above block mv */
x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
}
-
- other_cost +=
- x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
-
- /* Calculate the final y RD estimate for this mode */
- best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2-rate_uv-other_cost),
- (distortion2-distortion_uv));
-
- *returnrate = rate2;
- *returndistortion = distortion2;
- best_rd = this_rd;
- vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
- vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
- if ((this_mode == B_PRED) || (this_mode == SPLITMV))
- for (i = 0; i < 16; i++)
- {
- best_bmodes[i] = x->e_mbd.block[i].bmi;
- }
+ update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
// Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
@@ -2327,9 +2436,50 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
// Note how often each mode chosen as best
cpi->mode_chosen_counts[best_mode_index] ++;
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ if (x->e_mbd.best_sse_inter_mode == DC_PRED) {
+ // No best MV found.
+ x->e_mbd.best_sse_inter_mode = best_mode.mbmode.mode;
+ x->e_mbd.best_sse_mv = best_mode.mbmode.mv;
+ x->e_mbd.need_to_clamp_best_mvs = best_mode.mbmode.need_to_clamp_mvs;
+ }
+
+ // TODO(holmer): No SSEs are calculated in rdopt.c. What else can be used?
+ vp8_denoiser_denoise_mb(&cpi->denoiser, x, 0, 0,
+ recon_yoffset, recon_uvoffset);
+ // Reevaluate ZEROMV if the current mode is INTRA.
+ if (best_mode.mbmode.ref_frame == INTRA_FRAME)
+ {
+ int this_rd = INT_MAX;
+ int disable_skip = 0;
+ int other_cost = 0;
+ vpx_memset(&rd, 0, sizeof(rd));
+ x->e_mbd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
+ rd.rate2 += x->ref_frame_cost[LAST_FRAME];
+ rd.rate2 += vp8_cost_mv_ref(ZEROMV, mdcounts);
+ x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ this_rd = evaluate_inter_mode_rd(mdcounts, &rd, &disable_skip, cpi, x);
+ this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+ disable_skip, uv_intra_tteob,
+ intra_rd_penalty, cpi, x);
+ if (this_rd < best_mode.rd || x->skip)
+ {
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+ *returnrate = rd.rate2;
+ *returndistortion = rd.distortion2;
+ update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+ }
+ }
+ }
+#endif
if (cpi->is_src_frame_alt_ref &&
- (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
+ (best_mode.mbmode.mode != ZEROMV || best_mode.mbmode.ref_frame != ALTREF_FRAME))
{
x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
@@ -2338,26 +2488,25 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
(cpi->common.mb_no_coeff_skip);
x->e_mbd.mode_info_context->mbmi.partitioning = 0;
-
return;
}
// macroblock modes
- vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+ vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
- if (best_mbmode.mode == B_PRED)
+ if (best_mode.mbmode.mode == B_PRED)
{
for (i = 0; i < 16; i++)
- xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+ xd->mode_info_context->bmi[i].as_mode = best_mode.bmodes[i].as_mode;
}
- if (best_mbmode.mode == SPLITMV)
+ if (best_mode.mbmode.mode == SPLITMV)
{
for (i = 0; i < 16; i++)
- xd->mode_info_context->bmi[i].mv.as_int = best_bmodes[i].mv.as_int;
+ xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int;
- vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
+ vpx_memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
x->e_mbd.mode_info_context->mbmi.mv.as_int =
x->partition_info->bmi[15].mv.as_int;
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index 8bfc47f8f..967b6026a 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -378,30 +378,27 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x, has_y2_block);
if (x->mode_info_context->mbmi.mb_skip_coeff)
{
- cpi->skip_true_count++;
-
if (!cpi->common.mb_no_coeff_skip)
- vp8_stuff_mb(cpi, x, t) ;
+ {
+ vp8_stuff_mb(cpi, x, t);
+ }
else
{
vp8_fix_contexts(x);
+ cpi->skip_true_count++;
}
return;
}
- cpi->skip_false_count++;
-
plane_type = 3;
if(has_y2_block)
{
tokenize2nd_order_b(x, t, cpi);
plane_type = 0;
-
}
tokenize1st_order_b(x, t, plane_type, cpi);
-
}
diff --git a/vp8/encoder/treewriter.c b/vp8/encoder/treewriter.c
index 03967c835..ef25f670b 100644
--- a/vp8/encoder/treewriter.c
+++ b/vp8/encoder/treewriter.c
@@ -37,3 +37,7 @@ void vp8_cost_tokens(int *c, const vp8_prob *p, vp8_tree t)
{
cost(c, t, p, 0, 0);
}
+void vp8_cost_tokens2(int *c, const vp8_prob *p, vp8_tree t,int start)
+{
+ cost(c, t, p, start, 0);
+}
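
vp8_cost_tokens2() above runs the same recursive tree-cost walk as vp8_cost_tokens() but from a caller-supplied tree index, so costs are filled in only for the subtree reachable from that node; fill_token_costs above apparently uses this to start past tokens that cannot occur for certain contexts. A toy, self-contained version of the idea, with a made-up tree, made-up probabilities, and a stand-in bit cost:

    #include <stdio.h>

    typedef signed char tree_index;   /* >0: index of next node pair, <=0: -token */

    /* Toy 3-token tree: root pair at index 0, second pair at index 2. */
    static const tree_index toy_tree[4] = { 0, 2, -1, -2 };
    static const unsigned char toy_prob[2] = { 200, 120 };  /* made-up probabilities */

    static int bit_cost(int prob, int bit)   /* crude stand-in, not the real cost table */
    {
        return bit ? 256 - prob : prob;
    }

    static void cost(int *c, const tree_index *t, const unsigned char *p,
                     int i, int base)
    {
        int b;
        for (b = 0; b < 2; b++)
        {
            const int cc = base + bit_cost(p[i >> 1], b);
            const tree_index next = t[i + b];
            if (next <= 0)
                c[-next] = cc;                /* leaf: record accumulated cost */
            else
                cost(c, t, p, next, cc);
        }
    }

    int main(void)
    {
        int costs[3] = { -1, -1, -1 };
        cost(costs, toy_tree, toy_prob, 2, 0);   /* start at index 2, skip token 0 */
        /* token 0 stays at -1: unreachable from the non-root start index */
        printf("costs: %d %d %d\n", costs[0], costs[1], costs[2]);
        return 0;
    }
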
diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h
index c28a0fa37..0aa19431c 100644
--- a/vp8/encoder/treewriter.h
+++ b/vp8/encoder/treewriter.h
@@ -119,4 +119,8 @@ void vp8_cost_tokens(
int *Costs, const vp8_prob *, vp8_tree
);
+void vp8_cost_tokens2(
+ int *Costs, const vp8_prob *, vp8_tree, int
+);
+
#endif
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index d3d82e2f4..f68d007c1 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -30,6 +30,7 @@ VP8_COMMON_SRCS-yes += common/findnearmv.c
VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
VP8_COMMON_SRCS-yes += common/idct_blk.c
VP8_COMMON_SRCS-yes += common/idctllm.c
+VP8_COMMON_SRCS-yes += common/idctllm_test.cc
VP8_COMMON_SRCS-yes += common/alloccommon.h
VP8_COMMON_SRCS-yes += common/blockd.h
VP8_COMMON_SRCS-yes += common/common.h
@@ -63,8 +64,11 @@ VP8_COMMON_SRCS-yes += common/quant_common.c
VP8_COMMON_SRCS-yes += common/reconinter.c
VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
+VP8_COMMON_SRCS-yes += common/sad_c.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
+VP8_COMMON_SRCS-yes += common/variance_c.c
+VP8_COMMON_SRCS-yes += common/variance.h
@@ -80,22 +84,37 @@ VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx_test.cc
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/sad_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sad_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/sad_sse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/sad_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
+
ifeq ($(CONFIG_POSTPROC),yes)
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
endif
+
ifeq ($(ARCH_X86_64),yes)
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2.asm
endif
@@ -105,6 +124,7 @@ VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c
# common (media)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c
@@ -124,6 +144,12 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_sad16x16_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
# common (neon)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict4x4_neon$(ASM)
@@ -140,6 +166,8 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict4x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x8_neon$(ASM)
@@ -151,3 +179,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index f2f376a7c..683194a1d 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -66,7 +66,11 @@ static const struct extraconfig_map extracfg_map[] =
0, /* noise_sensitivity */
0, /* Sharpness */
0, /* static_thresh */
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ VP8_EIGHT_TOKENPARTITION,
+#else
VP8_ONE_TOKENPARTITION, /* token_partitions */
+#endif
0, /* arnr_max_frames */
3, /* arnr_strength */
3, /* arnr_type*/
@@ -179,14 +183,20 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_BOOL(vp8_cfg, enable_auto_alt_ref);
RANGE_CHECK(vp8_cfg, cpu_used, -16, 16);
-
+#if CONFIG_TEMPORAL_DENOISING
+ RANGE_CHECK(vp8_cfg, noise_sensitivity, 0, 1);
+#endif
#if !(CONFIG_REALTIME_ONLY)
RANGE_CHECK(vp8_cfg, encoding_mode, VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
+#if !(CONFIG_TEMPORAL_DENOISING)
RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6);
+#endif
#else
RANGE_CHECK(vp8_cfg, encoding_mode, VP8_REAL_TIME_ENCODING, VP8_REAL_TIME_ENCODING);
+#if !(CONFIG_TEMPORAL_DENOISING)
RANGE_CHECK(vp8_cfg, noise_sensitivity, 0, 0);
#endif
+#endif
RANGE_CHECK(vp8_cfg, token_partitions, VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
RANGE_CHECK_HI(vp8_cfg, Sharpness, 7);
@@ -241,6 +251,11 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, ts_layer_id[i], cfg->ts_number_layers-1);
}
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if(cfg->g_threads > (1 << vp8_cfg->token_partitions))
+ ERROR("g_threads cannot be bigger than number of token partitions");
+#endif
+
return VPX_CODEC_OK;
}
@@ -440,7 +455,8 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
{
vpx_codec_err_t res;
- if ((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
+ if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
+ && cfg->g_lag_in_frames > 1)
ERROR("Cannot change width or height after initialization");
/* Prevent increasing lag_in_frames. This check is stricter than it needs
@@ -918,16 +934,28 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
for (i = 0; i < num_partitions; ++i)
{
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ pkt.data.frame.buf = cpi->partition_d[i];
+#else
pkt.data.frame.buf = cx_data;
+ cx_data += cpi->partition_sz[i];
+ cx_data_sz -= cpi->partition_sz[i];
+#endif
pkt.data.frame.sz = cpi->partition_sz[i];
pkt.data.frame.partition_id = i;
/* don't set the fragment bit for the last partition */
if (i == (num_partitions - 1))
pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
- cx_data += cpi->partition_sz[i];
- cx_data_sz -= cpi->partition_sz[i];
}
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ /* In lagged mode the encoder can buffer multiple frames.
+ * We don't want this in partitioned output because
+ * partitions are spread all over the output buffer.
+ * So, force an exit!
+ */
+ cx_data_sz -= ctx->cx_data_sz / 2;
+#endif
}
else
{
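
For context on the packets produced by the ONTHEFLY_BITPACKING path above: with an encoder created with the VPX_CODEC_USE_OUTPUT_PARTITION flag, each token partition is returned as its own frame packet, and only the final packet of a frame has VPX_FRAME_IS_FRAGMENT cleared. A minimal consumer-side sketch (store_partition() and frame_complete() are hypothetical application helpers, not part of this change):

    const vpx_codec_cx_pkt_t *pkt;
    vpx_codec_iter_t iter = NULL;

    while ((pkt = vpx_codec_get_cx_data(&encoder, &iter)))
    {
        if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
            continue;

        /* Each partition carries its own buffer, size and partition_id. */
        store_partition(pkt->data.frame.buf, pkt->data.frame.sz,
                        pkt->data.frame.partition_id);

        /* The fragment flag is cleared only on the frame's last partition. */
        if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT))
            frame_complete();
    }
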
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index b96c9adab..2e940d787 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -47,6 +47,8 @@ VP8_CX_SRCS-yes += encoder/firstpass.c
VP8_CX_SRCS-yes += encoder/block.h
VP8_CX_SRCS-yes += encoder/boolhuff.h
VP8_CX_SRCS-yes += encoder/bitstream.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.c
VP8_CX_SRCS-yes += encoder/encodeintra.h
VP8_CX_SRCS-yes += encoder/encodemb.h
VP8_CX_SRCS-yes += encoder/encodemv.h
@@ -63,7 +65,6 @@ VP8_CX_SRCS-yes += encoder/ratectrl.h
VP8_CX_SRCS-yes += encoder/rdopt.h
VP8_CX_SRCS-yes += encoder/tokenize.h
VP8_CX_SRCS-yes += encoder/treewriter.h
-VP8_CX_SRCS-yes += encoder/variance.h
VP8_CX_SRCS-yes += encoder/mcomp.c
VP8_CX_SRCS-yes += encoder/modecosts.c
VP8_CX_SRCS-yes += encoder/onyx_if.c
@@ -73,13 +74,11 @@ VP8_CX_SRCS-yes += encoder/psnr.c
VP8_CX_SRCS-yes += encoder/quantize.c
VP8_CX_SRCS-yes += encoder/ratectrl.c
VP8_CX_SRCS-yes += encoder/rdopt.c
-VP8_CX_SRCS-yes += encoder/sad_c.c
VP8_CX_SRCS-yes += encoder/segmentation.c
VP8_CX_SRCS-yes += encoder/segmentation.h
VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
VP8_CX_SRCS-yes += encoder/tokenize.c
VP8_CX_SRCS-yes += encoder/treewriter.c
-VP8_CX_SRCS-yes += encoder/variance_c.c
VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
VP8_CX_SRCS-yes += encoder/temporal_filter.c
@@ -91,27 +90,16 @@ VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
endif
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 11ebb58aa..b16615d1b 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -15,7 +15,6 @@
# encoder
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.c
#File list for edsp
# encoder
@@ -31,27 +30,15 @@ VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_partitions_armv5
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
# encoder
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/sad8_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/sad16_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/variance_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk
index ce307b6d0..2cfd280cb 100644
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -60,7 +60,5 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h
VP8_DX_SRCS-yes += decoder/treereader.h
VP8_DX_SRCS-yes += decoder/onyxd_if.c
VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
-VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h
-VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c
VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index 6d0dba865..6b03ede75 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -190,10 +190,10 @@ extern "C" {
* time stamp) order. Frames produced will always be in PTS (presentation
* time stamp) order.
* If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled,
- * data and data_sz can contain a fragment of the encoded frame. Fragment #n
- * must contain at least partition #n, but can also contain subsequent
- * partitions (#n+1 - #n+i), and if so, fragments #n+1, .., #n+i must be
- * empty. When no more data is available, this function should be called
+ * data and data_sz can contain a fragment of the encoded frame. Fragment
+ * \#n must contain at least partition \#n, but can also contain subsequent
+ * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must
+ * be empty. When no more data is available, this function should be called
* with NULL as data and 0 as data_sz. The memory passed to this function
* must be available until the frame has been decoded.
*
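
The convention described above, sketched for a decoder initialized with VPX_CODEC_USE_INPUT_FRAGMENTS (frag[], frag_sz[] and n_fragments are hypothetical names for the partitions of one frame):

    int i;

    /* Feed each fragment; fragment #i must contain at least partition #i. */
    for (i = 0; i < n_fragments; i++)
        vpx_codec_decode(&decoder, frag[i], frag_sz[i], NULL, 0);

    /* Signal that no more fragments belong to this frame. */
    vpx_codec_decode(&decoder, NULL, 0, NULL, 0);
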
diff --git a/vpx_ports/vpx_timer.h b/vpx_ports/vpx_timer.h
index c8335a0a8..d07e08610 100644
--- a/vpx_ports/vpx_timer.h
+++ b/vpx_ports/vpx_timer.h
@@ -11,6 +11,7 @@
#ifndef VPX_TIMER_H
#define VPX_TIMER_H
+#include "vpx/vpx_integer.h"
#if CONFIG_OS_SUPPORT
@@ -75,7 +76,7 @@ vpx_usec_timer_mark(struct vpx_usec_timer *t)
}
-static long
+static int64_t
vpx_usec_timer_elapsed(struct vpx_usec_timer *t)
{
#if defined(_WIN32)
@@ -83,15 +84,13 @@ vpx_usec_timer_elapsed(struct vpx_usec_timer *t)
diff.QuadPart = t->end.QuadPart - t->begin.QuadPart;
- if (QueryPerformanceFrequency(&freq) && diff.QuadPart < freq.QuadPart)
- return (long)(diff.QuadPart * 1000000 / freq.QuadPart);
-
- return 1000000;
+ QueryPerformanceFrequency(&freq);
+ return diff.QuadPart * 1000000 / freq.QuadPart;
#else
struct timeval diff;
timersub(&t->end, &t->begin, &diff);
- return diff.tv_sec ? 1000000 : diff.tv_usec;
+ return diff.tv_sec * 1000000 + diff.tv_usec;
#endif
}
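
With vpx_usec_timer_elapsed() now returning int64_t, the usual timing pattern keeps the full microsecond count instead of saturating at one second (do_work() stands in for whatever is being measured):

    struct vpx_usec_timer timer;
    int64_t usec;

    vpx_usec_timer_start(&timer);
    do_work();
    vpx_usec_timer_mark(&timer);
    usec = vpx_usec_timer_elapsed(&timer);
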
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index 7382a9134..cef6a0bf1 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -22,6 +22,8 @@
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
%else
%define ABI_IS_32BIT 0
%endif
@@ -314,6 +316,8 @@
%macro SECTION_RODATA 0
section .text
%endmacro
+%elifidn __OUTPUT_FORMAT__,aout
+%define SECTION_RODATA section .data
%else
%define SECTION_RODATA section .rodata
%endif
diff --git a/vpxenc.c b/vpxenc.c
index a33ca37c6..e8b82619b 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -84,22 +84,49 @@ static const struct codec_item
static void usage_exit();
+#define LOG_ERROR(label) do \
+{\
+ const char *l=label;\
+ va_list ap;\
+ va_start(ap, fmt);\
+ if(l)\
+ fprintf(stderr, "%s: ", l);\
+ vfprintf(stderr, fmt, ap);\
+ fprintf(stderr, "\n");\
+ va_end(ap);\
+} while(0)
+
void die(const char *fmt, ...)
{
- va_list ap;
- va_start(ap, fmt);
- vfprintf(stderr, fmt, ap);
- fprintf(stderr, "\n");
+ LOG_ERROR(NULL);
usage_exit();
}
-static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s)
+
+void fatal(const char *fmt, ...)
+{
+ LOG_ERROR("Fatal");
+ exit(EXIT_FAILURE);
+}
+
+
+void warn(const char *fmt, ...)
+{
+ LOG_ERROR("Warning");
+}
+
+
+static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...)
{
+ va_list ap;
+
+ va_start(ap, s);
if (ctx->err)
{
const char *detail = vpx_codec_error_detail(ctx);
- fprintf(stderr, "%s: %s\n", s, vpx_codec_error(ctx));
+ vfprintf(stderr, s, ap);
+ fprintf(stderr, ": %s\n", vpx_codec_error(ctx));
if (detail)
fprintf(stderr, " %s\n", detail);
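
die(), fatal() and warn() all route through LOG_ERROR, which deliberately picks up the enclosing function's fmt argument, so the wrappers differ only in their label and in whether they exit. Hypothetical call sites (the format arguments are illustrative):

    warn("Unrecognized IVF version! This file may not decode properly.");
    fatal("Failed to open %s", filename);   /* prints "Fatal: ..." and exits */
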
@@ -153,10 +180,7 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass)
stats->file = fopen(fpf, "rb");
if (fseek(stats->file, 0, SEEK_END))
- {
- fprintf(stderr, "First-pass stats file must be seekable!\n");
- exit(EXIT_FAILURE);
- }
+ fatal("First-pass stats file must be seekable!");
stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
rewind(stats->file);
@@ -164,11 +188,8 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass)
stats->buf.buf = malloc(stats->buf_alloc_sz);
if (!stats->buf.buf)
- {
- fprintf(stderr, "Failed to allocate first-pass stats buffer (%lu bytes)\n",
- (unsigned long)stats->buf_alloc_sz);
- exit(EXIT_FAILURE);
- }
+ fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+ (unsigned long)stats->buf_alloc_sz);
nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
res = (nbytes == stats->buf.sz);
@@ -240,11 +261,7 @@ void stats_write(stats_io_t *stats, const void *pkt, size_t len)
stats->buf_alloc_sz = new_sz;
}
else
- {
- fprintf(stderr,
- "\nFailed to realloc firstpass stats buffer.\n");
- exit(EXIT_FAILURE);
- }
+ fatal("Failed to realloc firstpass stats buffer.");
}
memcpy(stats->buf_ptr, pkt, len);
@@ -282,10 +299,27 @@ struct detect_buffer {
};
+struct input_state
+{
+ char *fn;
+ FILE *file;
+ y4m_input y4m;
+ struct detect_buffer detect;
+ enum video_file_type file_type;
+ unsigned int w;
+ unsigned int h;
+ struct vpx_rational framerate;
+ int use_i420;
+};
+
+
#define IVF_FRAME_HDR_SZ (4+8) /* 4 byte size + 8 byte timestamp */
-static int read_frame(FILE *f, vpx_image_t *img, unsigned int file_type,
- y4m_input *y4m, struct detect_buffer *detect)
+static int read_frame(struct input_state *input, vpx_image_t *img)
{
+ FILE *f = input->file;
+ enum video_file_type file_type = input->file_type;
+ y4m_input *y4m = &input->y4m;
+ struct detect_buffer *detect = &input->detect;
int plane = 0;
int shortread = 0;
@@ -369,14 +403,15 @@ unsigned int file_is_y4m(FILE *infile,
}
#define IVF_FILE_HDR_SZ (32)
-unsigned int file_is_ivf(FILE *infile,
- unsigned int *fourcc,
- unsigned int *width,
- unsigned int *height,
- struct detect_buffer *detect)
+unsigned int file_is_ivf(struct input_state *input,
+ unsigned int *fourcc)
{
char raw_hdr[IVF_FILE_HDR_SZ];
int is_ivf = 0;
+ FILE *infile = input->file;
+ unsigned int *width = &input->w;
+ unsigned int *height = &input->h;
+ struct detect_buffer *detect = &input->detect;
if(memcmp(detect->buf, "DKIF", 4) != 0)
return 0;
@@ -391,8 +426,8 @@ unsigned int file_is_ivf(FILE *infile,
is_ivf = 1;
if (mem_get_le16(raw_hdr + 4) != 0)
- fprintf(stderr, "Error: Unrecognized IVF version! This file may not"
- " decode properly.");
+ warn("Unrecognized IVF version! This file may not decode "
+ "properly.");
*fourcc = mem_get_le32(raw_hdr + 8);
}
@@ -454,6 +489,13 @@ static void write_ivf_frame_header(FILE *outfile,
if(fwrite(header, 1, 12, outfile));
}
+static void write_ivf_frame_size(FILE *outfile, size_t size)
+{
+ char header[4];
+ mem_put_le32(header, size);
+ fwrite(header, 1, 4, outfile);
+}
+
typedef off_t EbmlLoc;
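
write_ivf_frame_size() supports the partitioned IVF output added later in this patch: the 12-byte frame header is written when a frame's first partition arrives, and its 4-byte size field is patched once all partitions have been appended. A sketch of that pattern, assuming outfile, pkt and an accumulated fsize as in get_cx_data() below:

    off_t header_pos = ftello(outfile);
    off_t end_pos;

    write_ivf_frame_header(outfile, pkt);     /* size field not yet final */
    /* ... append the remaining partitions, accumulating fsize ... */

    end_pos = ftello(outfile);
    fseeko(outfile, header_pos, SEEK_SET);
    write_ivf_frame_size(outfile, fsize);     /* back-patch the size field */
    fseeko(outfile, end_pos, SEEK_SET);
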
@@ -762,10 +804,7 @@ write_webm_block(EbmlGlobal *glob,
if(new_cue_list)
glob->cue_list = new_cue_list;
else
- {
- fprintf(stderr, "\nFailed to realloc cue list.\n");
- exit(EXIT_FAILURE);
- }
+ fatal("Failed to realloc cue list.");
cue = &glob->cue_list[glob->cues];
cue->time = glob->cluster_timecode;
@@ -913,7 +952,6 @@ static double vp8_mse2psnr(double Samples, double Peak, double Mse)
#include "args.h"
-
static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
"Debug mode (makes output deterministic)");
static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
@@ -948,6 +986,8 @@ static const arg_def_t framerate = ARG_DEF(NULL, "fps", 1,
"Stream frame rate (rate/scale)");
static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0,
"Output IVF (default is WebM)");
+static const arg_def_t out_part = ARG_DEF("P", "output-partitions", 0,
+ "Makes encoder output partitions. Requires IVF output!");
static const arg_def_t q_hist_n = ARG_DEF(NULL, "q-hist", 1,
"Show quantizer histogram (n-buckets)");
static const arg_def_t rate_hist_n = ARG_DEF(NULL, "rate-hist", 1,
@@ -957,7 +997,7 @@ static const arg_def_t *main_args[] =
&debugmode,
&outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &deadline,
&best_dl, &good_dl, &rt_dl,
- &verbosearg, &psnrarg, &use_ivf, &q_hist_n, &rate_hist_n,
+ &verbosearg, &psnrarg, &use_ivf, &out_part, &q_hist_n, &rate_hist_n,
NULL
};
@@ -1445,58 +1485,73 @@ static void show_rate_histogram(struct rate_hist *hist,
#define NELEMENTS(x) (sizeof(x)/sizeof(x[0]))
#define ARG_CTRL_CNT_MAX NELEMENTS(vp8_arg_ctrl_map)
-int main(int argc, const char **argv_)
+
+/* Configuration elements common to all streams */
+struct global_config
{
- vpx_codec_ctx_t encoder;
- const char *in_fn = NULL, *out_fn = NULL, *stats_fn = NULL;
- int i;
- FILE *infile, *outfile;
- vpx_codec_enc_cfg_t cfg;
- vpx_codec_err_t res;
- int pass, one_pass_only = 0;
- stats_io_t stats;
- vpx_image_t raw;
- const struct codec_item *codec = codecs;
- int frame_avail, got_data;
+ const struct codec_item *codec;
+ int passes;
+ int pass;
+ int usage;
+ int deadline;
+ int use_i420;
+ int verbose;
+ int limit;
+ int show_psnr;
+ int have_framerate;
+ struct vpx_rational framerate;
+ int out_part;
+ int debug;
+ int show_q_hist_buckets;
+ int show_rate_hist_buckets;
+};
- struct arg arg;
- char **argv, **argi, **argj;
- int arg_usage = 0, arg_passes = 1, arg_deadline = 0;
- int arg_ctrls[ARG_CTRL_CNT_MAX][2], arg_ctrl_cnt = 0;
- int arg_limit = 0;
- static const arg_def_t **ctrl_args = no_args;
- static const int *ctrl_args_map = NULL;
- int verbose = 0, show_psnr = 0;
- int arg_use_i420 = 1;
- unsigned long cx_time = 0;
- unsigned int file_type, fourcc;
- y4m_input y4m;
- struct vpx_rational arg_framerate = {30, 1};
- int arg_have_framerate = 0;
- int write_webm = 1;
- EbmlGlobal ebml = {0};
- uint32_t hash = 0;
- uint64_t psnr_sse_total = 0;
- uint64_t psnr_samples_total = 0;
- double psnr_totals[4] = {0, 0, 0, 0};
- int psnr_count = 0;
- stereo_format_t stereo_fmt = STEREO_FORMAT_MONO;
- int counts[64]={0};
- int show_q_hist_buckets=0;
- int show_rate_hist_buckets=0;
- struct rate_hist rate_hist={0};
- exec_name = argv_[0];
- ebml.last_pts_ms = -1;
+/* Per-stream configuration */
+struct stream_config
+{
+ struct vpx_codec_enc_cfg cfg;
+ const char *out_fn;
+ const char *stats_fn;
+ stereo_format_t stereo_fmt;
+ int arg_ctrls[ARG_CTRL_CNT_MAX][2];
+ int arg_ctrl_cnt;
+ int write_webm;
+};
- if (argc < 3)
- usage_exit();
+struct stream_state
+{
+ int index;
+ struct stream_state *next;
+ struct stream_config config;
+ FILE *file;
+ struct rate_hist rate_hist;
+ EbmlGlobal ebml;
+ uint32_t hash;
+ uint64_t psnr_sse_total;
+ uint64_t psnr_samples_total;
+ double psnr_totals[4];
+ int psnr_count;
+ int counts[64];
+ vpx_codec_ctx_t encoder;
+ unsigned int frames_out;
+ uint64_t cx_time;
+ size_t nbytes;
+ stats_io_t stats;
+};
- /* First parse the codec and usage values, because we want to apply other
- * parameters on top of the default configuration provided by the codec.
- */
- argv = argv_dup(argc - 1, argv_ + 1);
+
+static void parse_global_config(struct global_config *global, char **argv)
+{
+ char **argi, **argj;
+ struct arg arg;
+
+ /* Initialize default parameters */
+ memset(global, 0, sizeof(*global));
+ global->codec = codecs;
+ global->passes = 1;
+ global->use_i420 = 1;
for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step)
{
@@ -1511,7 +1566,7 @@ int main(int argc, const char **argv_)
k = j;
if (k >= 0)
- codec = codecs + k;
+ global->codec = codecs + k;
else
die("Error: Unrecognized argument (%s) to --codec\n",
arg.val);
@@ -1519,621 +1574,925 @@ int main(int argc, const char **argv_)
}
else if (arg_match(&arg, &passes, argi))
{
- arg_passes = arg_parse_uint(&arg);
+ global->passes = arg_parse_uint(&arg);
- if (arg_passes < 1 || arg_passes > 2)
- die("Error: Invalid number of passes (%d)\n", arg_passes);
+ if (global->passes < 1 || global->passes > 2)
+ die("Error: Invalid number of passes (%d)\n", global->passes);
}
else if (arg_match(&arg, &pass_arg, argi))
{
- one_pass_only = arg_parse_uint(&arg);
+ global->pass = arg_parse_uint(&arg);
- if (one_pass_only < 1 || one_pass_only > 2)
- die("Error: Invalid pass selected (%d)\n", one_pass_only);
+ if (global->pass < 1 || global->pass > 2)
+ die("Error: Invalid pass selected (%d)\n",
+ global->pass);
}
- else if (arg_match(&arg, &fpf_name, argi))
- stats_fn = arg.val;
else if (arg_match(&arg, &usage, argi))
- arg_usage = arg_parse_uint(&arg);
+ global->usage = arg_parse_uint(&arg);
else if (arg_match(&arg, &deadline, argi))
- arg_deadline = arg_parse_uint(&arg);
+ global->deadline = arg_parse_uint(&arg);
else if (arg_match(&arg, &best_dl, argi))
- arg_deadline = VPX_DL_BEST_QUALITY;
+ global->deadline = VPX_DL_BEST_QUALITY;
else if (arg_match(&arg, &good_dl, argi))
- arg_deadline = VPX_DL_GOOD_QUALITY;
+ global->deadline = VPX_DL_GOOD_QUALITY;
else if (arg_match(&arg, &rt_dl, argi))
- arg_deadline = VPX_DL_REALTIME;
+ global->deadline = VPX_DL_REALTIME;
else if (arg_match(&arg, &use_yv12, argi))
- {
- arg_use_i420 = 0;
- }
+ global->use_i420 = 0;
else if (arg_match(&arg, &use_i420, argi))
- {
- arg_use_i420 = 1;
- }
+ global->use_i420 = 1;
else if (arg_match(&arg, &verbosearg, argi))
- verbose = 1;
+ global->verbose = 1;
else if (arg_match(&arg, &limit, argi))
- arg_limit = arg_parse_uint(&arg);
+ global->limit = arg_parse_uint(&arg);
else if (arg_match(&arg, &psnrarg, argi))
- show_psnr = 1;
+ global->show_psnr = 1;
else if (arg_match(&arg, &framerate, argi))
{
- arg_framerate = arg_parse_rational(&arg);
- arg_have_framerate = 1;
+ global->framerate = arg_parse_rational(&arg);
+ global->have_framerate = 1;
}
- else if (arg_match(&arg, &use_ivf, argi))
- write_webm = 0;
- else if (arg_match(&arg, &outputfile, argi))
- out_fn = arg.val;
+ else if (arg_match(&arg,&out_part, argi))
+ global->out_part = 1;
else if (arg_match(&arg, &debugmode, argi))
- ebml.debug = 1;
+ global->debug = 1;
else if (arg_match(&arg, &q_hist_n, argi))
- show_q_hist_buckets = arg_parse_uint(&arg);
+ global->show_q_hist_buckets = arg_parse_uint(&arg);
else if (arg_match(&arg, &rate_hist_n, argi))
- show_rate_hist_buckets = arg_parse_uint(&arg);
+ global->show_rate_hist_buckets = arg_parse_uint(&arg);
else
argj++;
}
- /* Ensure that --passes and --pass are consistent. If --pass is set and --passes=2,
- * ensure --fpf was set.
- */
- if (one_pass_only)
+ /* Validate global config */
+
+ if (global->pass)
{
/* DWIM: Assume the user meant passes=2 if pass=2 is specified */
- if (one_pass_only > arg_passes)
+ if (global->pass > global->passes)
{
- fprintf(stderr, "Warning: Assuming --pass=%d implies --passes=%d\n",
- one_pass_only, one_pass_only);
- arg_passes = one_pass_only;
+ warn("Assuming --pass=%d implies --passes=%d\n",
+ global->pass, global->pass);
+ global->passes = global->pass;
}
+ }
+}
+
+
+void open_input_file(struct input_state *input)
+{
+ unsigned int fourcc;
- if (arg_passes == 2 && !stats_fn)
- die("Must specify --fpf when --pass=%d and --passes=2\n", one_pass_only);
+ /* Parse certain options from the input file, if possible */
+ input->file = strcmp(input->fn, "-") ? fopen(input->fn, "rb")
+ : set_binary_mode(stdin);
+
+ if (!input->file)
+ fatal("Failed to open input file");
+
+ /* For RAW input sources, these bytes will be applied on the first frame
+ * in read_frame().
+ */
+ input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
+ input->detect.position = 0;
+
+ if (input->detect.buf_read == 4
+ && file_is_y4m(input->file, &input->y4m, input->detect.buf))
+ {
+ if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4) >= 0)
+ {
+ input->file_type = FILE_TYPE_Y4M;
+ input->w = input->y4m.pic_w;
+ input->h = input->y4m.pic_h;
+ input->framerate.num = input->y4m.fps_n;
+ input->framerate.den = input->y4m.fps_d;
+ input->use_i420 = 0;
+ }
+ else
+ fatal("Unsupported Y4M stream.");
+ }
+ else if (input->detect.buf_read == 4 && file_is_ivf(input, &fourcc))
+ {
+ input->file_type = FILE_TYPE_IVF;
+ switch (fourcc)
+ {
+ case 0x32315659:
+ input->use_i420 = 0;
+ break;
+ case 0x30323449:
+ input->use_i420 = 1;
+ break;
+ default:
+ fatal("Unsupported fourcc (%08x) in IVF", fourcc);
+ }
+ }
+ else
+ {
+ input->file_type = FILE_TYPE_RAW;
}
+}
+
+
+static void close_input_file(struct input_state *input)
+{
+ fclose(input->file);
+ if (input->file_type == FILE_TYPE_Y4M)
+ y4m_input_close(&input->y4m);
+}
- /* Populate encoder configuration */
- res = vpx_codec_enc_config_default(codec->iface, &cfg, arg_usage);
+static struct stream_state *new_stream(struct global_config *global,
+ struct stream_state *prev)
+{
+ struct stream_state *stream;
- if (res)
+ stream = calloc(1, sizeof(*stream));
+ if(!stream)
+ fatal("Failed to allocate new stream.");
+ if(prev)
{
- fprintf(stderr, "Failed to get config: %s\n",
- vpx_codec_err_to_string(res));
- return EXIT_FAILURE;
+ memcpy(stream, prev, sizeof(*stream));
+ stream->index++;
+ prev->next = stream;
}
+ else
+ {
+ vpx_codec_err_t res;
- /* Change the default timebase to a high enough value so that the encoder
- * will always create strictly increasing timestamps.
- */
- cfg.g_timebase.den = 1000;
+ /* Populate encoder configuration */
+ res = vpx_codec_enc_config_default(global->codec->iface,
+ &stream->config.cfg,
+ global->usage);
+ if (res)
+ fatal("Failed to get config: %s\n", vpx_codec_err_to_string(res));
- /* Never use the library's default resolution, require it be parsed
- * from the file or set on the command line.
- */
- cfg.g_w = 0;
- cfg.g_h = 0;
+ /* Change the default timebase to a high enough value so that the
+ * encoder will always create strictly increasing timestamps.
+ */
+ stream->config.cfg.g_timebase.den = 1000;
+
+ /* Never use the library's default resolution, require it be parsed
+ * from the file or set on the command line.
+ */
+ stream->config.cfg.g_w = 0;
+ stream->config.cfg.g_h = 0;
+
+ /* Initialize remaining stream parameters */
+ stream->config.stereo_fmt = STEREO_FORMAT_MONO;
+ stream->config.write_webm = 1;
+ stream->ebml.last_pts_ms = -1;
+
+ /* Allows removal of the application version from the EBML tags */
+ stream->ebml.debug = global->debug;
+ }
+
+ /* Output files must be specified for each stream */
+ stream->config.out_fn = NULL;
+
+ stream->next = NULL;
+ return stream;
+}
+
+
+static int parse_stream_params(struct global_config *global,
+ struct stream_state *stream,
+ char **argv)
+{
+ char **argi, **argj;
+ struct arg arg;
+ static const arg_def_t **ctrl_args = no_args;
+ static const int *ctrl_args_map = NULL;
+ struct stream_config *config = &stream->config;
+ int eos_mark_found = 0;
+
+ /* Handle codec specific options */
+ if (global->codec->iface == &vpx_codec_vp8_cx_algo)
+ {
+ ctrl_args = vp8_args;
+ ctrl_args_map = vp8_arg_ctrl_map;
+ }
- /* Now parse the remainder of the parameters. */
for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step)
{
arg.argv_step = 1;
+ /* Once we've found an end-of-stream marker (--) we want to continue
+ * shifting arguments but not consuming them.
+ */
+ if (eos_mark_found)
+ {
+ argj++;
+ continue;
+ }
+ else if (!strcmp(*argj, "--"))
+ {
+ eos_mark_found = 1;
+ continue;
+ }
+
if (0);
+ else if (arg_match(&arg, &outputfile, argi))
+ config->out_fn = arg.val;
+ else if (arg_match(&arg, &fpf_name, argi))
+ config->stats_fn = arg.val;
+ else if (arg_match(&arg, &use_ivf, argi))
+ config->write_webm = 0;
else if (arg_match(&arg, &threads, argi))
- cfg.g_threads = arg_parse_uint(&arg);
+ config->cfg.g_threads = arg_parse_uint(&arg);
else if (arg_match(&arg, &profile, argi))
- cfg.g_profile = arg_parse_uint(&arg);
+ config->cfg.g_profile = arg_parse_uint(&arg);
else if (arg_match(&arg, &width, argi))
- cfg.g_w = arg_parse_uint(&arg);
+ config->cfg.g_w = arg_parse_uint(&arg);
else if (arg_match(&arg, &height, argi))
- cfg.g_h = arg_parse_uint(&arg);
+ config->cfg.g_h = arg_parse_uint(&arg);
else if (arg_match(&arg, &stereo_mode, argi))
- stereo_fmt = arg_parse_enum_or_int(&arg);
+ config->stereo_fmt = arg_parse_enum_or_int(&arg);
else if (arg_match(&arg, &timebase, argi))
- cfg.g_timebase = arg_parse_rational(&arg);
+ config->cfg.g_timebase = arg_parse_rational(&arg);
else if (arg_match(&arg, &error_resilient, argi))
- cfg.g_error_resilient = arg_parse_uint(&arg);
+ config->cfg.g_error_resilient = arg_parse_uint(&arg);
else if (arg_match(&arg, &lag_in_frames, argi))
- cfg.g_lag_in_frames = arg_parse_uint(&arg);
+ config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
else if (arg_match(&arg, &dropframe_thresh, argi))
- cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
+ config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
else if (arg_match(&arg, &resize_allowed, argi))
- cfg.rc_resize_allowed = arg_parse_uint(&arg);
+ config->cfg.rc_resize_allowed = arg_parse_uint(&arg);
else if (arg_match(&arg, &resize_up_thresh, argi))
- cfg.rc_resize_up_thresh = arg_parse_uint(&arg);
+ config->cfg.rc_resize_up_thresh = arg_parse_uint(&arg);
else if (arg_match(&arg, &resize_down_thresh, argi))
- cfg.rc_resize_down_thresh = arg_parse_uint(&arg);
+ config->cfg.rc_resize_down_thresh = arg_parse_uint(&arg);
else if (arg_match(&arg, &end_usage, argi))
- cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
+ config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
else if (arg_match(&arg, &target_bitrate, argi))
- cfg.rc_target_bitrate = arg_parse_uint(&arg);
+ config->cfg.rc_target_bitrate = arg_parse_uint(&arg);
else if (arg_match(&arg, &min_quantizer, argi))
- cfg.rc_min_quantizer = arg_parse_uint(&arg);
+ config->cfg.rc_min_quantizer = arg_parse_uint(&arg);
else if (arg_match(&arg, &max_quantizer, argi))
- cfg.rc_max_quantizer = arg_parse_uint(&arg);
+ config->cfg.rc_max_quantizer = arg_parse_uint(&arg);
else if (arg_match(&arg, &undershoot_pct, argi))
- cfg.rc_undershoot_pct = arg_parse_uint(&arg);
+ config->cfg.rc_undershoot_pct = arg_parse_uint(&arg);
else if (arg_match(&arg, &overshoot_pct, argi))
- cfg.rc_overshoot_pct = arg_parse_uint(&arg);
+ config->cfg.rc_overshoot_pct = arg_parse_uint(&arg);
else if (arg_match(&arg, &buf_sz, argi))
- cfg.rc_buf_sz = arg_parse_uint(&arg);
+ config->cfg.rc_buf_sz = arg_parse_uint(&arg);
else if (arg_match(&arg, &buf_initial_sz, argi))
- cfg.rc_buf_initial_sz = arg_parse_uint(&arg);
+ config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg);
else if (arg_match(&arg, &buf_optimal_sz, argi))
- cfg.rc_buf_optimal_sz = arg_parse_uint(&arg);
+ config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg);
else if (arg_match(&arg, &bias_pct, argi))
{
- cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
+ config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
- if (arg_passes < 2)
- fprintf(stderr,
- "Warning: option %s ignored in one-pass mode.\n",
- arg.name);
+ if (global->passes < 2)
+ warn("option %s ignored in one-pass mode.\n", arg.name);
}
else if (arg_match(&arg, &minsection_pct, argi))
{
- cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg);
+ config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg);
- if (arg_passes < 2)
- fprintf(stderr,
- "Warning: option %s ignored in one-pass mode.\n",
- arg.name);
+ if (global->passes < 2)
+ warn("option %s ignored in one-pass mode.\n", arg.name);
}
else if (arg_match(&arg, &maxsection_pct, argi))
{
- cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
+ config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
- if (arg_passes < 2)
- fprintf(stderr,
- "Warning: option %s ignored in one-pass mode.\n",
- arg.name);
+ if (global->passes < 2)
+ warn("option %s ignored in one-pass mode.\n", arg.name);
}
else if (arg_match(&arg, &kf_min_dist, argi))
- cfg.kf_min_dist = arg_parse_uint(&arg);
+ config->cfg.kf_min_dist = arg_parse_uint(&arg);
else if (arg_match(&arg, &kf_max_dist, argi))
- cfg.kf_max_dist = arg_parse_uint(&arg);
+ config->cfg.kf_max_dist = arg_parse_uint(&arg);
else if (arg_match(&arg, &kf_disabled, argi))
- cfg.kf_mode = VPX_KF_DISABLED;
+ config->cfg.kf_mode = VPX_KF_DISABLED;
else
- argj++;
+ {
+ int i, match = 0;
+
+ for (i = 0; ctrl_args[i]; i++)
+ {
+ if (arg_match(&arg, ctrl_args[i], argi))
+ {
+ int j;
+ match = 1;
+
+ /* Point either to the next free element or the first
+ * instance of this control.
+ */
+ for(j=0; j<config->arg_ctrl_cnt; j++)
+ if(config->arg_ctrls[j][0] == ctrl_args_map[i])
+ break;
+
+ /* Update/insert */
+ assert(j < ARG_CTRL_CNT_MAX);
+ if (j < ARG_CTRL_CNT_MAX)
+ {
+ config->arg_ctrls[j][0] = ctrl_args_map[i];
+ config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
+ if(j == config->arg_ctrl_cnt)
+ config->arg_ctrl_cnt++;
+ }
+
+ }
+ }
+
+ if (!match)
+ argj++;
+ }
}
- /* Handle codec specific options */
-#if CONFIG_VP8_ENCODER
+ return eos_mark_found;
+}
+
- if (codec->iface == &vpx_codec_vp8_cx_algo)
+#define FOREACH_STREAM(func)\
+do\
+{\
+ struct stream_state *stream;\
+\
+ for(stream = streams; stream; stream = stream->next)\
+ func;\
+}while(0)
+
+
+static void validate_stream_config(struct stream_state *stream)
+{
+ struct stream_state *streami;
+
+ if(!stream->config.cfg.g_w || !stream->config.cfg.g_h)
+ fatal("Stream %d: Specify stream dimensions with --width (-w) "
+ " and --height (-h)", stream->index);
+
+ for(streami = stream; streami; streami = streami->next)
{
- ctrl_args = vp8_args;
- ctrl_args_map = vp8_arg_ctrl_map;
+ /* All streams require output files */
+ if(!streami->config.out_fn)
+ fatal("Stream %d: Output file is required (specify with -o)",
+ streami->index);
+
+ /* Check for two streams outputting to the same file */
+ if(streami != stream)
+ {
+ const char *a = stream->config.out_fn;
+ const char *b = streami->config.out_fn;
+ if(!strcmp(a,b) && strcmp(a, "/dev/null") && strcmp(a, ":nul"))
+ fatal("Stream %d: duplicate output file (from stream %d)",
+ streami->index, stream->index);
+ }
+
+ /* Check for two streams sharing a stats file. */
+ if(streami != stream)
+ {
+ const char *a = stream->config.stats_fn;
+ const char *b = streami->config.stats_fn;
+ if(a && b && !strcmp(a,b))
+ fatal("Stream %d: duplicate stats file (from stream %d)",
+ streami->index, stream->index);
+ }
}
+}
-#endif
- for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step)
+static void set_stream_dimensions(struct stream_state *stream,
+ unsigned int w,
+ unsigned int h)
+{
+ if ((stream->config.cfg.g_w && stream->config.cfg.g_w != w)
+ ||(stream->config.cfg.g_h && stream->config.cfg.g_h != h))
+ fatal("Stream %d: Resizing not yet supported", stream->index);
+ stream->config.cfg.g_w = w;
+ stream->config.cfg.g_h = h;
+}
+
+
+static void show_stream_config(struct stream_state *stream,
+ struct global_config *global,
+ struct input_state *input)
+{
+
+#define SHOW(field) \
+ fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field)
+
+ if(stream->index == 0)
{
- int match = 0;
+ fprintf(stderr, "Codec: %s\n",
+ vpx_codec_iface_name(global->codec->iface));
+ fprintf(stderr, "Source file: %s Format: %s\n", input->fn,
+ input->use_i420 ? "I420" : "YV12");
+ }
+ if(stream->next || stream->index)
+ fprintf(stderr, "\nStream Index: %d\n", stream->index);
+ fprintf(stderr, "Destination file: %s\n", stream->config.out_fn);
+ fprintf(stderr, "Encoder parameters:\n");
+
+ SHOW(g_usage);
+ SHOW(g_threads);
+ SHOW(g_profile);
+ SHOW(g_w);
+ SHOW(g_h);
+ SHOW(g_timebase.num);
+ SHOW(g_timebase.den);
+ SHOW(g_error_resilient);
+ SHOW(g_pass);
+ SHOW(g_lag_in_frames);
+ SHOW(rc_dropframe_thresh);
+ SHOW(rc_resize_allowed);
+ SHOW(rc_resize_up_thresh);
+ SHOW(rc_resize_down_thresh);
+ SHOW(rc_end_usage);
+ SHOW(rc_target_bitrate);
+ SHOW(rc_min_quantizer);
+ SHOW(rc_max_quantizer);
+ SHOW(rc_undershoot_pct);
+ SHOW(rc_overshoot_pct);
+ SHOW(rc_buf_sz);
+ SHOW(rc_buf_initial_sz);
+ SHOW(rc_buf_optimal_sz);
+ SHOW(rc_2pass_vbr_bias_pct);
+ SHOW(rc_2pass_vbr_minsection_pct);
+ SHOW(rc_2pass_vbr_maxsection_pct);
+ SHOW(kf_mode);
+ SHOW(kf_min_dist);
+ SHOW(kf_max_dist);
+}
- arg.argv_step = 1;
- for (i = 0; ctrl_args[i]; i++)
- {
- if (arg_match(&arg, ctrl_args[i], argi))
- {
- int j;
- match = 1;
-
- /* Point either to the next free element or the first
- * instance of this control.
- */
- for(j=0; j<arg_ctrl_cnt; j++)
- if(arg_ctrls[j][0] == ctrl_args_map[i])
- break;
-
- /* Update/insert */
- assert(j < ARG_CTRL_CNT_MAX);
- if (j < ARG_CTRL_CNT_MAX)
- {
- arg_ctrls[j][0] = ctrl_args_map[i];
- arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
- if(j == arg_ctrl_cnt)
- arg_ctrl_cnt++;
- }
+static void open_output_file(struct stream_state *stream,
+ struct global_config *global)
+{
+ const char *fn = stream->config.out_fn;
- }
- }
+ stream->file = strcmp(fn, "-") ? fopen(fn, "wb") : set_binary_mode(stdout);
- if (!match)
- argj++;
+ if (!stream->file)
+ fatal("Failed to open output file");
+
+ if(stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR))
+ fatal("WebM output to pipes not supported.");
+
+ if(stream->config.write_webm)
+ {
+ stream->ebml.stream = stream->file;
+ write_webm_file_header(&stream->ebml, &stream->config.cfg,
+ &global->framerate,
+ stream->config.stereo_fmt);
}
+ else
+ write_ivf_file_header(stream->file, &stream->config.cfg,
+ global->codec->fourcc, 0);
+}
- /* Check for unrecognized options */
- for (argi = argv; *argi; argi++)
- if (argi[0][0] == '-' && argi[0][1])
- die("Error: Unrecognized option %s\n", *argi);
- /* Handle non-option arguments */
- in_fn = argv[0];
+static void close_output_file(struct stream_state *stream,
+ unsigned int fourcc)
+{
+ if(stream->config.write_webm)
+ {
+ write_webm_file_footer(&stream->ebml, stream->hash);
+ free(stream->ebml.cue_list);
+ stream->ebml.cue_list = NULL;
+ }
+ else
+ {
+ if (!fseek(stream->file, 0, SEEK_SET))
+ write_ivf_file_header(stream->file, &stream->config.cfg,
+ fourcc,
+ stream->frames_out);
+ }
- if (!in_fn)
- usage_exit();
+ fclose(stream->file);
+}
+
+
+static void setup_pass(struct stream_state *stream,
+ struct global_config *global,
+ int pass)
+{
+ if (stream->config.stats_fn)
+ {
+ if (!stats_open_file(&stream->stats, stream->config.stats_fn,
+ pass))
+ fatal("Failed to open statistics store");
+ }
+ else
+ {
+ if (!stats_open_mem(&stream->stats, pass))
+ fatal("Failed to open statistics store");
+ }
+
+ stream->config.cfg.g_pass = global->passes == 2
+ ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS
+ : VPX_RC_ONE_PASS;
+ if (pass)
+ stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
+
+ stream->cx_time = 0;
+ stream->nbytes = 0;
+}
+
+
+static void initialize_encoder(struct stream_state *stream,
+ struct global_config *global)
+{
+ int i;
+ int flags = 0;
- if(!out_fn)
- die("Error: Output file is required (specify with -o)\n");
+ flags |= global->show_psnr ? VPX_CODEC_USE_PSNR : 0;
+ flags |= global->out_part ? VPX_CODEC_USE_OUTPUT_PARTITION : 0;
- memset(&stats, 0, sizeof(stats));
+ /* Construct Encoder Context */
+ vpx_codec_enc_init(&stream->encoder, global->codec->iface,
+ &stream->config.cfg, flags);
+ ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder");
- for (pass = one_pass_only ? one_pass_only - 1 : 0; pass < arg_passes; pass++)
+ /* Note that we bypass the vpx_codec_control wrapper macro because
+ * we're being clever to store the control IDs in an array. Real
+ * applications will want to make use of the enumerations directly
+ */
+ for (i = 0; i < stream->config.arg_ctrl_cnt; i++)
{
- int frames_in = 0, frames_out = 0;
- int64_t nbytes = 0;
- struct detect_buffer detect;
+ int ctrl = stream->config.arg_ctrls[i][0];
+ int value = stream->config.arg_ctrls[i][1];
+ if (vpx_codec_control_(&stream->encoder, ctrl, value))
+ fprintf(stderr, "Error: Tried to set control %d = %d\n",
+ ctrl, value);
- /* Parse certain options from the input file, if possible */
- infile = strcmp(in_fn, "-") ? fopen(in_fn, "rb")
- : set_binary_mode(stdin);
+ ctx_exit_on_error(&stream->encoder, "Failed to control codec");
+ }
+}
- if (!infile)
- {
- fprintf(stderr, "Failed to open input file\n");
- return EXIT_FAILURE;
- }
- /* For RAW input sources, these bytes will applied on the first frame
- * in read_frame().
- */
- detect.buf_read = fread(detect.buf, 1, 4, infile);
- detect.position = 0;
+static void encode_frame(struct stream_state *stream,
+ struct global_config *global,
+ struct vpx_image *img,
+ unsigned int frames_in)
+{
+ vpx_codec_pts_t frame_start, next_frame_start;
+ struct vpx_codec_enc_cfg *cfg = &stream->config.cfg;
+ struct vpx_usec_timer timer;
+
+ frame_start = (cfg->g_timebase.den * (int64_t)(frames_in - 1)
+ * global->framerate.den)
+ / cfg->g_timebase.num / global->framerate.num;
+ next_frame_start = (cfg->g_timebase.den * (int64_t)(frames_in)
+ * global->framerate.den)
+ / cfg->g_timebase.num / global->framerate.num;
+ vpx_usec_timer_start(&timer);
+ vpx_codec_encode(&stream->encoder, img, frame_start,
+ next_frame_start - frame_start,
+ 0, global->deadline);
+ vpx_usec_timer_mark(&timer);
+ stream->cx_time += vpx_usec_timer_elapsed(&timer);
+ ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame",
+ stream->index);
+}
+
+
+static void update_quantizer_histogram(struct stream_state *stream)
+{
+ if(stream->config.cfg.g_pass != VPX_RC_FIRST_PASS)
+ {
+ int q;
+
+ vpx_codec_control(&stream->encoder, VP8E_GET_LAST_QUANTIZER_64, &q);
+ ctx_exit_on_error(&stream->encoder, "Failed to read quantizer");
+ stream->counts[q]++;
+ }
+}
+
- if (detect.buf_read == 4 && file_is_y4m(infile, &y4m, detect.buf))
+static void get_cx_data(struct stream_state *stream,
+ struct global_config *global,
+ int *got_data)
+{
+ const vpx_codec_cx_pkt_t *pkt;
+ const struct vpx_codec_enc_cfg *cfg = &stream->config.cfg;
+ vpx_codec_iter_t iter = NULL;
+
+ while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter)))
+ {
+ static size_t fsize = 0;
+ static off_t ivf_header_pos = 0;
+
+ *got_data = 1;
+
+ switch (pkt->kind)
{
- if (y4m_input_open(&y4m, infile, detect.buf, 4) >= 0)
+ case VPX_CODEC_CX_FRAME_PKT:
+ if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT))
{
- file_type = FILE_TYPE_Y4M;
- cfg.g_w = y4m.pic_w;
- cfg.g_h = y4m.pic_h;
-
- /* Use the frame rate from the file only if none was specified
- * on the command-line.
- */
- if (!arg_have_framerate)
- {
- arg_framerate.num = y4m.fps_n;
- arg_framerate.den = y4m.fps_d;
- }
+ stream->frames_out++;
+ }
+ fprintf(stderr, " %6luF",
+ (unsigned long)pkt->data.frame.sz);
+
+ update_rate_histogram(&stream->rate_hist, cfg, pkt);
+ if(stream->config.write_webm)
+ {
+ /* Update the hash */
+ if(!stream->ebml.debug)
+ stream->hash = murmur(pkt->data.frame.buf,
+ pkt->data.frame.sz, stream->hash);
- arg_use_i420 = 0;
+ write_webm_block(&stream->ebml, cfg, pkt);
}
else
{
- fprintf(stderr, "Unsupported Y4M stream.\n");
- return EXIT_FAILURE;
+ if (pkt->data.frame.partition_id <= 0)
+ {
+ ivf_header_pos = ftello(stream->file);
+ fsize = pkt->data.frame.sz;
+
+ write_ivf_frame_header(stream->file, pkt);
+ }
+ else
+ {
+ fsize += pkt->data.frame.sz;
+
+ if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT))
+ {
+ off_t currpos = ftello(stream->file);
+ fseeko(stream->file, ivf_header_pos, SEEK_SET);
+ write_ivf_frame_size(stream->file, fsize);
+ fseeko(stream->file, currpos, SEEK_SET);
+ }
+ }
+
+ fwrite(pkt->data.frame.buf, 1,
+ pkt->data.frame.sz, stream->file);
}
- }
- else if (detect.buf_read == 4 &&
- file_is_ivf(infile, &fourcc, &cfg.g_w, &cfg.g_h, &detect))
- {
- file_type = FILE_TYPE_IVF;
- switch (fourcc)
+ stream->nbytes += pkt->data.raw.sz;
+ break;
+ case VPX_CODEC_STATS_PKT:
+ stream->frames_out++;
+ fprintf(stderr, " %6luS",
+ (unsigned long)pkt->data.twopass_stats.sz);
+ stats_write(&stream->stats,
+ pkt->data.twopass_stats.buf,
+ pkt->data.twopass_stats.sz);
+ stream->nbytes += pkt->data.raw.sz;
+ break;
+ case VPX_CODEC_PSNR_PKT:
+
+ if (global->show_psnr)
{
- case 0x32315659:
- arg_use_i420 = 0;
- break;
- case 0x30323449:
- arg_use_i420 = 1;
- break;
- default:
- fprintf(stderr, "Unsupported fourcc (%08x) in IVF\n", fourcc);
- return EXIT_FAILURE;
+ int i;
+
+ stream->psnr_sse_total += pkt->data.psnr.sse[0];
+ stream->psnr_samples_total += pkt->data.psnr.samples[0];
+ for (i = 0; i < 4; i++)
+ {
+ fprintf(stderr, "%.3lf ", pkt->data.psnr.psnr[i]);
+ stream->psnr_totals[i] += pkt->data.psnr.psnr[i];
+ }
+ stream->psnr_count++;
}
- }
- else
- {
- file_type = FILE_TYPE_RAW;
- }
- if(!cfg.g_w || !cfg.g_h)
- {
- fprintf(stderr, "Specify stream dimensions with --width (-w) "
- " and --height (-h).\n");
- return EXIT_FAILURE;
+ break;
+ default:
+ break;
}
+ }
+}
-#define SHOW(field) fprintf(stderr, " %-28s = %d\n", #field, cfg.field)
- if (verbose && pass == 0)
- {
- fprintf(stderr, "Codec: %s\n", vpx_codec_iface_name(codec->iface));
- fprintf(stderr, "Source file: %s Format: %s\n", in_fn,
- arg_use_i420 ? "I420" : "YV12");
- fprintf(stderr, "Destination file: %s\n", out_fn);
- fprintf(stderr, "Encoder parameters:\n");
-
- SHOW(g_usage);
- SHOW(g_threads);
- SHOW(g_profile);
- SHOW(g_w);
- SHOW(g_h);
- SHOW(g_timebase.num);
- SHOW(g_timebase.den);
- SHOW(g_error_resilient);
- SHOW(g_pass);
- SHOW(g_lag_in_frames);
- SHOW(rc_dropframe_thresh);
- SHOW(rc_resize_allowed);
- SHOW(rc_resize_up_thresh);
- SHOW(rc_resize_down_thresh);
- SHOW(rc_end_usage);
- SHOW(rc_target_bitrate);
- SHOW(rc_min_quantizer);
- SHOW(rc_max_quantizer);
- SHOW(rc_undershoot_pct);
- SHOW(rc_overshoot_pct);
- SHOW(rc_buf_sz);
- SHOW(rc_buf_initial_sz);
- SHOW(rc_buf_optimal_sz);
- SHOW(rc_2pass_vbr_bias_pct);
- SHOW(rc_2pass_vbr_minsection_pct);
- SHOW(rc_2pass_vbr_maxsection_pct);
- SHOW(kf_mode);
- SHOW(kf_min_dist);
- SHOW(kf_max_dist);
- }
+static void show_psnr(struct stream_state *stream)
+{
+ int i;
+ double ovpsnr;
- if(pass == (one_pass_only ? one_pass_only - 1 : 0)) {
- if (file_type == FILE_TYPE_Y4M)
- /*The Y4M reader does its own allocation.
- Just initialize this here to avoid problems if we never read any
- frames.*/
- memset(&raw, 0, sizeof(raw));
- else
- vpx_img_alloc(&raw, arg_use_i420 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_YV12,
- cfg.g_w, cfg.g_h, 1);
+ if (!stream->psnr_count)
+ return;
- init_rate_histogram(&rate_hist, &cfg, &arg_framerate);
- }
+ fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
+ ovpsnr = vp8_mse2psnr(stream->psnr_samples_total, 255.0,
+ stream->psnr_sse_total);
+ fprintf(stderr, " %.3lf", ovpsnr);
- outfile = strcmp(out_fn, "-") ? fopen(out_fn, "wb")
- : set_binary_mode(stdout);
+ for (i = 0; i < 4; i++)
+ {
+ fprintf(stderr, " %.3lf", stream->psnr_totals[i]/stream->psnr_count);
+ }
+ fprintf(stderr, "\n");
+}
- if (!outfile)
- {
- fprintf(stderr, "Failed to open output file\n");
- return EXIT_FAILURE;
- }
- if(write_webm && fseek(outfile, 0, SEEK_CUR))
- {
- fprintf(stderr, "WebM output to pipes not supported.\n");
- return EXIT_FAILURE;
- }
+float usec_to_fps(uint64_t usec, unsigned int frames)
+{
+ return usec > 0 ? (float)frames * 1000000.0 / (float)usec : 0;
+}
- if (stats_fn)
- {
- if (!stats_open_file(&stats, stats_fn, pass))
- {
- fprintf(stderr, "Failed to open statistics store\n");
- return EXIT_FAILURE;
- }
- }
- else
- {
- if (!stats_open_mem(&stats, pass))
- {
- fprintf(stderr, "Failed to open statistics store\n");
- return EXIT_FAILURE;
- }
- }
- cfg.g_pass = arg_passes == 2
- ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS
- : VPX_RC_ONE_PASS;
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
+int main(int argc, const char **argv_)
+{
+ int pass;
+ vpx_image_t raw;
+ int frame_avail, got_data;
- if (pass)
- {
- cfg.rc_twopass_stats_in = stats_get(&stats);
- }
+ struct input_state input = {0};
+ struct global_config global;
+ struct stream_state *streams = NULL;
+ char **argv, **argi;
+ unsigned long cx_time = 0;
+ int stream_cnt = 0;
-#endif
+ exec_name = argv_[0];
- if(write_webm)
- {
- ebml.stream = outfile;
- write_webm_file_header(&ebml, &cfg, &arg_framerate, stereo_fmt);
- }
- else
- write_ivf_file_header(outfile, &cfg, codec->fourcc, 0);
+ if (argc < 3)
+ usage_exit();
+ /* Setup default input stream settings */
+ input.framerate.num = 30;
+ input.framerate.den = 1;
+ input.use_i420 = 1;
- /* Construct Encoder Context */
- vpx_codec_enc_init(&encoder, codec->iface, &cfg,
- show_psnr ? VPX_CODEC_USE_PSNR : 0);
- ctx_exit_on_error(&encoder, "Failed to initialize encoder");
+ /* First parse the global configuration values, because we want to apply
+ * other parameters on top of the default configuration provided by the
+ * codec.
+ */
+ argv = argv_dup(argc - 1, argv_ + 1);
+ parse_global_config(&global, argv);
- /* Note that we bypass the vpx_codec_control wrapper macro because
- * we're being clever to store the control IDs in an array. Real
- * applications will want to make use of the enumerations directly
+ {
+ /* Now parse each stream's parameters. Using a local scope here
+ * due to the use of 'stream' as loop variable in FOREACH_STREAM
+ * loops
*/
- for (i = 0; i < arg_ctrl_cnt; i++)
+ struct stream_state *stream = NULL;
+
+ do
{
- if (vpx_codec_control_(&encoder, arg_ctrls[i][0], arg_ctrls[i][1]))
- fprintf(stderr, "Error: Tried to set control %d = %d\n",
- arg_ctrls[i][0], arg_ctrls[i][1]);
+ stream = new_stream(&global, stream);
+ stream_cnt++;
+ if(!streams)
+ streams = stream;
+ } while(parse_stream_params(&global, stream, argv));
+ }
+
+ /* Check for unrecognized options */
+ for (argi = argv; *argi; argi++)
+ if (argi[0][0] == '-' && argi[0][1])
+ die("Error: Unrecognized option %s\n", *argi);
+
+ /* Handle non-option arguments */
+ input.fn = argv[0];
+
+ if (!input.fn)
+ usage_exit();
+
+ for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++)
+ {
+ int frames_in = 0;
+
+ open_input_file(&input);
+
+ /* If the input file doesn't specify its w/h (raw files), try to get
+ * the data from the first stream's configuration.
+ */
+ if(!input.w || !input.h)
+ FOREACH_STREAM({
+ if(stream->config.cfg.g_w && stream->config.cfg.g_h)
+ {
+ input.w = stream->config.cfg.g_w;
+ input.h = stream->config.cfg.g_h;
+ break;
+ }
+ });
- ctx_exit_on_error(&encoder, "Failed to control codec");
+ /* Update stream configurations from the input file's parameters */
+ FOREACH_STREAM(set_stream_dimensions(stream, input.w, input.h));
+ FOREACH_STREAM(validate_stream_config(stream));
+
+ /* Ensure that --passes and --pass are consistent. If --pass is set and
+ * --passes=2, ensure --fpf was set.
+ */
+ if (global.pass && global.passes == 2)
+ FOREACH_STREAM({
+ if(!stream->config.stats_fn)
+ die("Stream %d: Must specify --fpf when --pass=%d"
+ " and --passes=2\n", stream->index, global.pass);
+ });
+
+
+ /* Use the frame rate from the file only if none was specified
+ * on the command-line.
+ */
+ if (!global.have_framerate)
+ global.framerate = input.framerate;
+
+ /* Show configuration */
+ if (global.verbose && pass == 0)
+ FOREACH_STREAM(show_stream_config(stream, &global, &input));
+
+ if(pass == (global.pass ? global.pass - 1 : 0)) {
+ if (input.file_type == FILE_TYPE_Y4M)
+ /*The Y4M reader does its own allocation.
+ Just initialize this here to avoid problems if we never read any
+ frames.*/
+ memset(&raw, 0, sizeof(raw));
+ else
+ vpx_img_alloc(&raw,
+ input.use_i420 ? VPX_IMG_FMT_I420
+ : VPX_IMG_FMT_YV12,
+ input.w, input.h, 1);
+
+ FOREACH_STREAM(init_rate_histogram(&stream->rate_hist,
+ &stream->config.cfg,
+ &global.framerate));
}
+ FOREACH_STREAM(open_output_file(stream, &global));
+ FOREACH_STREAM(setup_pass(stream, &global, pass));
+ FOREACH_STREAM(initialize_encoder(stream, &global));
+
frame_avail = 1;
got_data = 0;
while (frame_avail || got_data)
{
- vpx_codec_iter_t iter = NULL;
- const vpx_codec_cx_pkt_t *pkt;
struct vpx_usec_timer timer;
- int64_t frame_start, next_frame_start;
- if (!arg_limit || frames_in < arg_limit)
+ if (!global.limit || frames_in < global.limit)
{
- frame_avail = read_frame(infile, &raw, file_type, &y4m,
- &detect);
+ frame_avail = read_frame(&input, &raw);
if (frame_avail)
frames_in++;
- fprintf(stderr,
- "\rPass %d/%d frame %4d/%-4d %7"PRId64"B \033[K",
- pass + 1, arg_passes, frames_in, frames_out, nbytes);
+ if(stream_cnt == 1)
+ fprintf(stderr,
+ "\rPass %d/%d frame %4d/%-4d %7"PRId64"B \033[K",
+ pass + 1, global.passes, frames_in,
+ streams->frames_out, (int64_t)streams->nbytes);
+ else
+ fprintf(stderr,
+ "\rPass %d/%d frame %4d %7lu %s (%.2f fps)\033[K",
+ pass + 1, global.passes, frames_in,
+ cx_time > 9999999 ? cx_time / 1000 : cx_time,
+ cx_time > 9999999 ? "ms" : "us",
+ usec_to_fps(cx_time, frames_in));
+
}
else
frame_avail = 0;
vpx_usec_timer_start(&timer);
-
- frame_start = (cfg.g_timebase.den * (int64_t)(frames_in - 1)
- * arg_framerate.den) / cfg.g_timebase.num / arg_framerate.num;
- next_frame_start = (cfg.g_timebase.den * (int64_t)(frames_in)
- * arg_framerate.den)
- / cfg.g_timebase.num / arg_framerate.num;
- vpx_codec_encode(&encoder, frame_avail ? &raw : NULL, frame_start,
- next_frame_start - frame_start,
- 0, arg_deadline);
+ FOREACH_STREAM(encode_frame(stream, &global,
+ frame_avail ? &raw : NULL,
+ frames_in));
vpx_usec_timer_mark(&timer);
cx_time += vpx_usec_timer_elapsed(&timer);
- ctx_exit_on_error(&encoder, "Failed to encode frame");
- if(cfg.g_pass != VPX_RC_FIRST_PASS)
- {
- int q;
-
- vpx_codec_control(&encoder, VP8E_GET_LAST_QUANTIZER_64, &q);
- ctx_exit_on_error(&encoder, "Failed to read quantizer");
- counts[q]++;
- }
+ FOREACH_STREAM(update_quantizer_histogram(stream));
got_data = 0;
-
- while ((pkt = vpx_codec_get_cx_data(&encoder, &iter)))
- {
- got_data = 1;
-
- switch (pkt->kind)
- {
- case VPX_CODEC_CX_FRAME_PKT:
- frames_out++;
- fprintf(stderr, " %6luF",
- (unsigned long)pkt->data.frame.sz);
-
- update_rate_histogram(&rate_hist, &cfg, pkt);
- if(write_webm)
- {
- /* Update the hash */
- if(!ebml.debug)
- hash = murmur(pkt->data.frame.buf,
- pkt->data.frame.sz, hash);
-
- write_webm_block(&ebml, &cfg, pkt);
- }
- else
- {
- write_ivf_frame_header(outfile, pkt);
- if(fwrite(pkt->data.frame.buf, 1,
- pkt->data.frame.sz, outfile));
- }
- nbytes += pkt->data.raw.sz;
- break;
- case VPX_CODEC_STATS_PKT:
- frames_out++;
- fprintf(stderr, " %6luS",
- (unsigned long)pkt->data.twopass_stats.sz);
- stats_write(&stats,
- pkt->data.twopass_stats.buf,
- pkt->data.twopass_stats.sz);
- nbytes += pkt->data.raw.sz;
- break;
- case VPX_CODEC_PSNR_PKT:
-
- if (show_psnr)
- {
- int i;
-
- psnr_sse_total += pkt->data.psnr.sse[0];
- psnr_samples_total += pkt->data.psnr.samples[0];
- for (i = 0; i < 4; i++)
- {
- fprintf(stderr, "%.3lf ", pkt->data.psnr.psnr[i]);
- psnr_totals[i] += pkt->data.psnr.psnr[i];
- }
- psnr_count++;
- }
-
- break;
- default:
- break;
- }
- }
+ FOREACH_STREAM(get_cx_data(stream, &global, &got_data));
fflush(stdout);
}
- fprintf(stderr,
- "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7lub/f %7"PRId64"b/s"
- " %7lu %s (%.2f fps)\033[K", pass + 1,
- arg_passes, frames_in, frames_out, nbytes,
- frames_in ? (unsigned long)(nbytes * 8 / frames_in) : 0,
- frames_in ? nbytes * 8 *(int64_t)arg_framerate.num / arg_framerate.den / frames_in : 0,
- cx_time > 9999999 ? cx_time / 1000 : cx_time,
- cx_time > 9999999 ? "ms" : "us",
- cx_time > 0 ? (float)frames_in * 1000000.0 / (float)cx_time : 0);
-
- if ( (show_psnr) && (psnr_count>0) )
- {
- int i;
- double ovpsnr = vp8_mse2psnr(psnr_samples_total, 255.0,
- psnr_sse_total);
+ if(stream_cnt > 1)
+ fprintf(stderr, "\n");
- fprintf(stderr, "\nPSNR (Overall/Avg/Y/U/V)");
+ FOREACH_STREAM(fprintf(
+ stderr,
+ "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7lub/f %7"PRId64"b/s"
+ " %7"PRId64" %s (%.2f fps)\033[K\n", pass + 1,
+ global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes,
+ frames_in ? (unsigned long)(stream->nbytes * 8 / frames_in) : 0,
+ frames_in ? (int64_t)stream->nbytes * 8
+ * (int64_t)global.framerate.num / global.framerate.den
+ / frames_in
+ : 0,
+ stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time,
+ stream->cx_time > 9999999 ? "ms" : "us",
+ usec_to_fps(stream->cx_time, frames_in));
+ );
- fprintf(stderr, " %.3lf", ovpsnr);
- for (i = 0; i < 4; i++)
- {
- fprintf(stderr, " %.3lf", psnr_totals[i]/psnr_count);
- }
- }
+ if (global.show_psnr)
+ FOREACH_STREAM(show_psnr(stream));
- vpx_codec_destroy(&encoder);
+ FOREACH_STREAM(vpx_codec_destroy(&stream->encoder));
- fclose(infile);
- if (file_type == FILE_TYPE_Y4M)
- y4m_input_close(&y4m);
+ close_input_file(&input);
- if(write_webm)
- {
- write_webm_file_footer(&ebml, hash);
- free(ebml.cue_list);
- ebml.cue_list = NULL;
- }
- else
- {
- if (!fseek(outfile, 0, SEEK_SET))
- write_ivf_file_header(outfile, &cfg, codec->fourcc, frames_out);
- }
+ FOREACH_STREAM(close_output_file(stream, global.codec->fourcc));
- fclose(outfile);
- stats_close(&stats, arg_passes-1);
- fprintf(stderr, "\n");
+ FOREACH_STREAM(stats_close(&stream->stats, global.passes-1));
- if (one_pass_only)
+ if (global.pass)
break;
}
- if (show_q_hist_buckets)
- show_q_histogram(counts, show_q_hist_buckets);
+ if (global.show_q_hist_buckets)
+ FOREACH_STREAM(show_q_histogram(stream->counts,
+ global.show_q_hist_buckets));
- if (show_rate_hist_buckets)
- show_rate_histogram(&rate_hist, &cfg, show_rate_hist_buckets);
- destroy_rate_histogram(&rate_hist);
+ if (global.show_rate_hist_buckets)
+ FOREACH_STREAM(show_rate_histogram(&stream->rate_hist,
+ &stream->config.cfg,
+ global.show_rate_hist_buckets));
+ FOREACH_STREAM(destroy_rate_histogram(&stream->rate_hist));
vpx_img_free(&raw);
free(argv);