132 files changed, 1492 insertions, 1193 deletions
diff --git a/CHANGELOG b/CHANGELOG
index dcb9f738a..ef64a96f9 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,32 @@
+2012-12-21 v1.2.0
+  This release acts as a checkpoint for a large amount of internal refactoring
+  and testing. It also contains a number of small bugfixes, so all users are
+  encouraged to upgrade.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+  - Enhancements:
+      VP8 optimizations for MIPS dspr2
+      vpxenc: add -quiet option
+
+  - Speed:
+      Encoder and decoder speed is consistent with the Eider release.
+
+  - Quality:
+      In general, quality is consistent with the Eider release.
+
+      Minor tweaks to ARNR filtering
+      Minor improvements to real time encoding with multiple temporal layers
+
+  - Bug Fixes:
+      Fixes multithreaded encoder race condition in loopfilter
+      Fixes multi-resolution threaded encoding
+      Fix potential encoder dead-lock after picture resize
+
+
 2012-05-09 v1.1.0 "Eider"
   This introduces a number of enhancements, mostly focused on real-time
   encoding. In addition, it fixes a decoder bug (first introduced in
diff --git a/build/make/Android.mk b/build/make/Android.mk
index afd27597c..db0cebff5 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -27,7 +27,7 @@
 # Android.mk file in the libvpx directory:
 # LOCAL_PATH := $(call my-dir)
 # include $(CLEAR_VARS)
-# include libvpx/build/make/Android.mk
+# include jni/libvpx/build/make/Android.mk
 #
 # There are currently two TARGET_ARCH_ABI targets for ARM.
 # armeabi and armeabi-v7a.  armeabi-v7a is selected by creating an
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index ba70242dc..95be467ab 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -61,26 +61,26 @@ while (<STDIN>)
     s/:SHR:/ >> /g;
 
     # Convert ELSE to .else
-    s/ELSE/.else/g;
+    s/\bELSE\b/.else/g;
 
     # Convert ENDIF to .endif
-    s/ENDIF/.endif/g;
+    s/\bENDIF\b/.endif/g;
 
     # Convert ELSEIF to .elseif
-    s/ELSEIF/.elseif/g;
+    s/\bELSEIF\b/.elseif/g;
 
     # Convert LTORG to .ltorg
-    s/LTORG/.ltorg/g;
+    s/\bLTORG\b/.ltorg/g;
 
     # Convert endfunc to nothing.
-    s/endfunc//ig;
+    s/\bendfunc\b//ig;
 
     # Convert FUNCTION to nothing.
-    s/FUNCTION//g;
-    s/function//g;
+    s/\bFUNCTION\b//g;
+    s/\bfunction\b//g;
 
-    s/ENTRY//g;
-    s/MSARMASM/0/g;
+    s/\bENTRY\b//g;
+    s/\bMSARMASM\b/0/g;
     s/^\s+end\s+$//g;
 
     # Convert IF :DEF:to .if
@@ -149,11 +149,15 @@ while (<STDIN>)
     s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
 
     # ALIGN directive
-    s/ALIGN/.balign/g;
+    s/\bALIGN\b/.balign/g;
 
     # ARM code
     s/\sARM/.arm/g;
 
+    # push/pop
+    s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g;
+    s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g;
+
     # NEON code
     s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g;
     s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g;
@@ -189,7 +193,7 @@ while (<STDIN>)
     s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/;
 
     # Begin macro definition
-    if (/MACRO/) {
+    if (/\bMACRO\b/) {
         $_ = <STDIN>;
         s/^/.macro/;
         s/\$//g;                # remove formal param reference
@@ -198,7 +202,7 @@ while (<STDIN>)
 
     # For macros, use \ to reference formal params
     s/\$/\\/g;                  # End macro definition
-    s/MEND/.endm/;              # No need to tell it where to stop assembling
+    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
     print;
     print "$comment_sub$comment\n" if defined $comment;
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 72627377c..318f0f760 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -277,6 +277,7 @@ clean_temp_files() {
 # Toolchain Check Functions
 #
 check_cmd() {
+    enabled external_build && return
     log "$@"
     "$@" >>${logfile} 2>&1
 }
@@ -767,6 +768,7 @@ process_common_toolchain() {
             ;;
         armv5te)
             soft_enable edsp
+            disable fast_unaligned
             ;;
         esac
 
@@ -1000,7 +1002,11 @@ EOF
         soft_enable sse2
         soft_enable sse3
         soft_enable ssse3
-        soft_enable sse4_1
+        if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
+        else
+            soft_enable sse4_1
+        fi
 
         case  ${tgt_os} in
             win*)
@@ -1175,9 +1181,6 @@ EOF
         ;;
     esac
 
-    # for sysconf(3) and friends.
-    check_header unistd.h
-
     # glibc needs these
     if enabled linux; then
         add_cflags -D_LARGEFILE_SOURCE
diff --git a/configure b/configure
index 5ed688e2f..57a614596 100755
--- a/configure
+++ b/configure
@@ -303,6 +303,7 @@ CONFIG_LIST="
     ${EXPERIMENT_LIST}
 "
 CMDLINE_SELECT="
+    external_build
     extra_warnings
     werror
     install_docs
@@ -502,7 +503,7 @@ process_detect() {
             fi
         fi
     fi
-    if [ -z "$CC" ]; then
+    if [ -z "$CC" ] || enabled external_build; then
         echo "Bypassing toolchain for environment detection."
         enable external_build
         check_header() {
@@ -511,6 +512,7 @@ process_detect() {
             shift
             var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
             disable $var
+            # Headers common to all environments
             case $header in
                 stdio.h)
                     true;
@@ -522,6 +524,25 @@ process_detect() {
                     done
                     ${result:-true}
             esac && enable $var
+
+            # Specialize windows and POSIX environments.
+            case $toolchain in
+                *-win*-*)
+                    case $header-$toolchain in
+                        stdint*-gcc) true;;
+                        *) false;;
+                    esac && enable $var
+                    ;;
+                *)
+                    case $header in
+                        stdint.h) true;;
+                        pthread.h) true;;
+                        sys/mman.h) true;;
+                        unistd.h) true;;
+                        *) false;;
+                    esac && enable $var
+            esac
+            enabled $var
         }
         check_ld() {
             true
@@ -535,6 +556,7 @@ EOF
     check_header stdint.h
     check_header pthread.h
     check_header sys/mman.h
+    check_header unistd.h # for sysconf(3) and friends.
 
     check_header vpx/vpx_integer.h -I${source_path} && enable vpx_ports
 }
@@ -643,6 +665,10 @@ process_toolchain() {
         *-android-*)
             # GTestLog must be modified to use Android logging utilities.
         ;;
+        *-darwin-*)
+            # iOS/ARM builds do not work with gtest. This does not match
+            # x86 targets.
+        ;;
         *)
             check_cxx "$@" <<EOF && soft_enable unit_tests
 int z;
diff --git a/examples/decode_with_partial_drops.txt b/examples/decode_with_partial_drops.txt
new file mode 100644
index 000000000..7b0d3d2ca
--- /dev/null
+++ b/examples/decode_with_partial_drops.txt
@@ -0,0 +1,238 @@
+@TEMPLATE decoder_tmpl.c
+Decode With Partial Drops Example
+=========================
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
+This is an example utility which drops a series of frames (or parts of frames),
+as specified on the command line. This is useful for observing the error
+recovery features of the codec.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
+#include <time.h>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
+struct parsed_header
+{
+    char key_frame;
+    int version;
+    char show_frame;
+    int first_part_size;
+};
+
+int next_packet(struct parsed_header* hdr, int pos, int length, int mtu)
+{
+    int size = 0;
+    int remaining = length - pos;
+    /* Uncompressed part is 3 bytes for P frames and 10 bytes for I frames */
+    int uncomp_part_size = (hdr->key_frame ? 10 : 3);
+    /* number of bytes yet to send from header and the first partition */
+    int remainFirst = uncomp_part_size + hdr->first_part_size - pos;
+    if (remainFirst > 0)
+    {
+        if (remainFirst <= mtu)
+        {
+            size = remainFirst;
+        }
+        else
+        {
+            size = mtu;
+        }
+
+        return size;
+    }
+
+    /* second partition; just slot it up according to MTU */
+    if (remaining <= mtu)
+    {
+        size = remaining;
+        return size;
+    }
+    return mtu;
+}
+
+void throw_packets(unsigned char* frame, int* size, int loss_rate,
+                   int* thrown, int* kept)
+{
+    unsigned char loss_frame[256*1024];
+    int pkg_size = 1;
+    int pos = 0;
+    int loss_pos = 0;
+    struct parsed_header hdr;
+    unsigned int tmp;
+    int mtu = 1500;
+
+    if (*size < 3)
+    {
+        return;
+    }
+    putc('|', stdout);
+    /* parse uncompressed 3 bytes */
+    tmp = (frame[2] << 16) | (frame[1] << 8) | frame[0];
+    hdr.key_frame = !(tmp & 0x1); /* inverse logic */
+    hdr.version = (tmp >> 1) & 0x7;
+    hdr.show_frame = (tmp >> 4) & 0x1;
+    hdr.first_part_size = (tmp >> 5) & 0x7FFFF;
+
+    /* don't drop key frames */
+    if (hdr.key_frame)
+    {
+        int i;
+        *kept = *size/mtu + ((*size % mtu > 0) ? 1 : 0); /* approximate */
+        for (i=0; i < *kept; i++)
+            putc('.', stdout);
+        return;
+    }
+
+    while ((pkg_size = next_packet(&hdr, pos, *size, mtu)) > 0)
+    {
+        int loss_event = ((rand() + 1.0)/(RAND_MAX + 1.0) < loss_rate/100.0);
+        if (*thrown == 0 && !loss_event)
+        {
+            memcpy(loss_frame + loss_pos, frame + pos, pkg_size);
+            loss_pos += pkg_size;
+            (*kept)++;
+            putc('.', stdout);
+        }
+        else
+        {
+            (*thrown)++;
+            putc('X', stdout);
+        }
+        pos += pkg_size;
+    }
+    memcpy(frame, loss_frame, loss_pos);
+    memset(frame + loss_pos, 0, *size - loss_pos);
+    *size = loss_pos;
+}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INIT
+/* Initialize codec */
+flags = VPX_CODEC_USE_ERROR_CONCEALMENT;
+res = vpx_codec_dec_init(&codec, interface, &dec_cfg, flags);
+if(res)
+    die_codec(&codec, "Failed to initialize decoder");
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INIT
+
+Usage
+-----
+This example adds a single argument to the `simple_decoder` example,
+which specifies the range or pattern of frames to drop. The parameter is
+parsed as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
+if(argc < 4 || argc > 6)
+    die("Usage: %s <infile> <outfile> [-t <num threads>] <N-M|N/M|L,S>\n",
+        argv[0]);
+{
+    char *nptr;
+    int arg_num = 3; /* index of the pattern argument */
+    if (argc == 6 && strncmp(argv[arg_num++], "-t", 2) == 0)
+        dec_cfg.threads = strtol(argv[arg_num++], NULL, 0);
+    n = strtol(argv[arg_num], &nptr, 0);
+    mode = (*nptr == '\0' || *nptr == ',') ? 2 : (*nptr == '-') ? 1 : 0;
+    /* No separator means no second number; nptr+1 would be past the NUL. */
+    m = (*nptr != '\0') ? strtol(nptr+1, NULL, 0) : 0;
+    /* Reject unknown separators, and N/M with M <= 0 (M is a modulus). */
+    if((!n && !m) || (mode == 0 && (*nptr != '/' || m <= 0)))
+        die("Couldn't parse pattern %s\n", argv[arg_num]);
+}
+seed = (m > 0) ? m : (unsigned int)time(NULL);
+srand(seed);thrown_frame = 0;
+printf("Seed: %u\n", seed);
+printf("Threads: %d\n", dec_cfg.threads);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
+
+
+Dropping A Range Of Frames
+--------------------------
+To drop a range of frames, specify the starting frame and the ending
+frame to drop, separated by a dash. The following command will drop
+frames 5 through 10 (base 1).
+
+  $ ./decode_with_partial_drops in.ivf out.i420 5-10
+
+
+Dropping A Pattern Of Frames
+----------------------------
+To drop a pattern of frames, specify the number of frames to drop and
+the number of frames after which to repeat the pattern, separated by
+a forward-slash. The following command will drop 3 of 7 frames.
+Specifically, it will decode 4 frames, then drop 3 frames, and then
+repeat.
+
+  $ ./decode_with_partial_drops in.ivf out.i420 3/7
+
+Dropping Random Parts Of Frames
+-------------------------------
+A third argument tuple is available to split the frame into 1500-byte pieces
+and randomly drop pieces rather than frames. The frame will be split at
+partition boundaries where possible. The following example will seed the RNG
+with the seed 123 and drop approximately 5% of the pieces. Pieces which
+depend on an already dropped piece will also be dropped.
+
+  $ ./decode_with_partial_drops in.ivf out.i420 5,123
+
+
+Extra Variables
+---------------
+This example maintains the pattern passed on the command line in the
+`n`, `m`, and `mode` variables:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
+int              n, m, mode;
+unsigned int     seed;
+int              thrown=0, kept=0;
+int              thrown_frame=0, kept_frame=0;
+vpx_codec_dec_cfg_t  dec_cfg = {0};
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
+
+
+Making The Drop Decision
+------------------------
+The example decides whether to drop the frame based on the current
+frame number, immediately before decoding the frame.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
+/* Decide whether to throw parts of the frame or the whole frame
+   depending on the drop mode */
+thrown_frame = 0;
+kept_frame = 0;
+switch (mode)
+{
+case 0:
+    if (m - (frame_cnt-1)%m <= n)
+    {
+        frame_sz = 0;
+    }
+    break;
+case 1:
+    if (frame_cnt >= n && frame_cnt <= m)
+    {
+        frame_sz = 0;
+    }
+    break;
+case 2:
+    throw_packets(frame, &frame_sz, n, &thrown_frame, &kept_frame);
+    break;
+default: break;
+}
+if (mode < 2)
+{
+    if (frame_sz == 0)
+    {
+        putc('X', stdout);
+        thrown_frame++;
+    }
+    else
+    {
+        putc('.', stdout);
+        kept_frame++;
+    }
+}
+thrown += thrown_frame;
+kept += kept_frame;
+fflush(stdout);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
diff --git a/libs.mk b/libs.mk
index e995d0aef..1f0ade34d 100644
--- a/libs.mk
+++ b/libs.mk
@@ -61,8 +61,16 @@ endef
 CODEC_SRCS-yes += CHANGELOG
 CODEC_SRCS-yes += libs.mk
 
+# If this is a universal (fat) binary, then all the subarchitectures have
+# already been built and our job is to stitch them together. The
+# BUILD_LIBVPX variable indicates whether we should be building
+# (compiling, linking) the library. The LIPO_LIBVPX variable indicates
+# that we're stitching.
+$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
+
 include $(SRC_PATH_BARE)/vpx/vpx_codec.mk
 CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS))
+CODEC_DOC_SRCS += $(addprefix vpx/,$(call enabled,API_DOC_SRCS))
 
 include $(SRC_PATH_BARE)/vpx_mem/vpx_mem.mk
 CODEC_SRCS-yes += $(addprefix vpx_mem/,$(call enabled,MEM_SRCS))
@@ -70,6 +78,9 @@ CODEC_SRCS-yes += $(addprefix vpx_mem/,$(call enabled,MEM_SRCS))
 include $(SRC_PATH_BARE)/vpx_scale/vpx_scale.mk
 CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
 
+include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
+CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
+
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
   VP8_PREFIX=vp8/
   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
@@ -79,11 +90,8 @@ ifeq ($(CONFIG_VP8_ENCODER),yes)
   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk
   CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
   CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h
-  CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
   CODEC_DOC_SECTIONS += vp8 vp8_encoder
 endif
 
@@ -91,10 +99,8 @@ ifeq ($(CONFIG_VP8_DECODER),yes)
   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk
   CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
   CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
   CODEC_DOC_SECTIONS += vp8 vp8_decoder
 endif
 
@@ -155,30 +161,13 @@ INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Release/%)
 INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Debug/%)
 endif
 
-# If this is a universal (fat) binary, then all the subarchitectures have
-# already been built and our job is to stitch them together. The
-# BUILD_LIBVPX variable indicates whether we should be building
-# (compiling, linking) the library. The LIPO_LIBVPX variable indicates
-# that we're stitching.
-$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
-
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.sh
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx/vpx_integer.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/asm_offsets.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_timer.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emmintrin_compat.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
 CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
-ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emms.asm
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_abi_support.asm
 CODEC_SRCS-$(BUILD_LIBVPX) += third_party/x86inc/x86inc.asm
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_cpuid.c
-endif
-CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm_cpudetect.c
-CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm.h
 CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
@@ -202,8 +191,7 @@ INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libvpx.a
 INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a
 endif
 
-CODEC_SRCS=$(filter-out %_offsets.c,\
-           $(filter-out %_test.cc,$(call enabled,CODEC_SRCS)))
+CODEC_SRCS=$(call enabled,CODEC_SRCS)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(CODEC_SRCS)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)
 
@@ -306,6 +294,7 @@ CLEAN-OBJS += libvpx.syms
 define libvpx_symlink_template
 $(1): $(2)
 	@echo "    [LN]     $(2) $$@"
+	$(qexec)mkdir -p $$(dir $$@)
 	$(qexec)ln -sf $(2) $$@
 endef
 
@@ -314,7 +303,7 @@ $(eval $(call libvpx_symlink_template,\
     $(BUILD_PFX)$(LIBVPX_SO)))
 $(eval $(call libvpx_symlink_template,\
     $(addprefix $(DIST_DIR)/,$(LIBVPX_SO_SYMLINKS)),\
-    $(DIST_DIR)/$(LIBSUBDIR)/$(LIBVPX_SO)))
+    $(LIBVPX_SO)))
 
 
 INSTALL-LIBS-$(BUILD_LIBVPX_SO) += $(LIBVPX_SO_SYMLINKS)
@@ -375,10 +364,6 @@ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h
 
-CODEC_DOC_SRCS += vpx/vpx_codec.h \
-                  vpx/vpx_decoder.h \
-                  vpx/vpx_encoder.h \
-                  vpx/vpx_image.h
 
 ##
 ## libvpx test directives
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index f2a2031fd..6fbcb643d 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -59,9 +59,13 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
     /* Test the buffer model here before subtracting the frame. Do so because
      * the way the leaky bucket model works in libvpx is to allow the buffer to
      * empty - and then stop showing frames until we've got enough bits to
-     * show one. */
-    ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
-        << pkt->data.frame.pts;
+     * show one. As noted in comment below (issue 495), this does not currently
+     * apply to key frames. For now exclude key frames in condition below. */
+    bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true: false;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
+          << pkt->data.frame.pts;
+    }
 
     const int frame_size_in_bits = pkt->data.frame.sz * 8;
 
@@ -125,7 +129,12 @@ TEST_P(DatarateTest, BasicBufferModel) {
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 140);
 
-  for (int i = 70; i < 700; i += 200) {
+  // There is an issue for low bitrates in real-time mode, where the
+  // effective_datarate slightly overshoots the target bitrate.
+  // This is the same issue as noted above (#495).
+  // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
+  // when the issue is resolved.
+  for (int i = 100; i < 800; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 3610f025d..84afe7f84 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -9,6 +9,7 @@
  */
 #include "test/decode_test_driver.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
 #include "test/video_source.h"
 
 namespace libvpx_test {
@@ -21,8 +22,9 @@ void Decoder::DecodeFrame(const uint8_t *cxdata, int size) {
     ASSERT_EQ(VPX_CODEC_OK, res_init) << DecodeError();
   }
 
-  const vpx_codec_err_t res_dec = vpx_codec_decode(&decoder_,
-                                                   cxdata, size, NULL, 0);
+  vpx_codec_err_t res_dec;
+  REGISTER_STATE_CHECK(res_dec = vpx_codec_decode(&decoder_,
+                                                  cxdata, size, NULL, 0));
   ASSERT_EQ(VPX_CODEC_OK, res_dec) << DecodeError();
 }
 
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index ebb3959ed..56339cae0 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -12,6 +12,7 @@
 #if CONFIG_VP8_DECODER
 #include "test/decode_test_driver.h"
 #endif
+#include "test/register_state_check.h"
 #include "test/video_source.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -58,9 +59,10 @@ void Encoder::EncodeFrameInternal(const VideoSource &video,
   }
 
   // Encode the frame
-  res = vpx_codec_encode(&encoder_,
-                         video.img(), video.pts(), video.duration(),
-                         frame_flags, deadline_);
+  REGISTER_STATE_CHECK(
+      res = vpx_codec_encode(&encoder_,
+                             video.img(), video.pts(), video.duration(),
+                             frame_flags, deadline_));
   ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
 }
 
diff --git a/test/idctllm_test.cc b/test/idctllm_test.cc
index 3071a2aea..d6fdffea5 100644
--- a/test/idctllm_test.cc
+++ b/test/idctllm_test.cc
@@ -13,6 +13,7 @@ extern "C" {
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 }
+#include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
@@ -54,7 +55,7 @@ TEST_P(IDCTTest, TestAllZeros)
 {
     int i;
 
-    UUT(input, output, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
     for(i=0; i<256; i++)
         if((i&0xF) < 4 && i<64)
@@ -68,7 +69,7 @@ TEST_P(IDCTTest, TestAllOnes)
     int i;
 
     input[0] = 4;
-    UUT(input, output, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
     for(i=0; i<256; i++)
         if((i&0xF) < 4 && i<64)
@@ -85,7 +86,7 @@ TEST_P(IDCTTest, TestAddOne)
         predict[i] = i;
 
     input[0] = 4;
-    UUT(input, predict, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
 
     for(i=0; i<256; i++)
         if((i&0xF) < 4 && i<64)
@@ -101,7 +102,7 @@ TEST_P(IDCTTest, TestWithData)
     for(i=0; i<16; i++)
         input[i] = i;
 
-    UUT(input, output, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
     for(i=0; i<256; i++)
         if((i&0xF) > 3 || i>63)
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 692b67bc6..149399024 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -11,6 +11,7 @@
 
 #include <string.h>
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
 #include "vpx_config.h"
@@ -246,8 +247,10 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>,
 
   virtual void Predict(MB_PREDICTION_MODE mode) {
     mb_.mode_info_context->mbmi.mode = mode;
-    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[0] - 1, kStride,
-             data_ptr_[0], kStride);
+    REGISTER_STATE_CHECK(pred_fn_(&mb_,
+                                  data_ptr_[0] - kStride,
+                                  data_ptr_[0] - 1, kStride,
+                                  data_ptr_[0], kStride));
   }
 
   intra_pred_y_fn_t pred_fn_;
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 36d2e63a2..412a57442 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
 #include "vpx_config.h"
@@ -74,8 +75,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
   // Initialize pixels in the output to 99.
   (void)vpx_memset(dst_image, 99, output_size);
 
-  GetParam()(src_image_ptr, dst_image_ptr, input_stride,
-             output_stride, block_width, flimits, 16);
+  REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr, input_stride,
+                                  output_stride, block_width, flimits, 16));
 
   static const uint8_t expected_data[block_height] = {
     4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
diff --git a/test/register_state_check.h b/test/register_state_check.h
new file mode 100644
index 000000000..fb3f53b13
--- /dev/null
+++ b/test/register_state_check.h
@@ -0,0 +1,95 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+
+#ifdef _WIN64
+
+#define WIN32_LEAN_AND_MEAN  // Excludes rarely-used APIs from <windows.h>.
+#include <windows.h>
+#include <winnt.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace testing {
+namespace internal {
+
+inline bool operator==(const M128A& lhs, const M128A& rhs) {
+  return (lhs.Low == rhs.Low && lhs.High == rhs.High);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+namespace libvpx_test {
+
+// Compares the state of xmm[6-15] at construction with their state at
+// destruction. These registers should be preserved by the callee on
+// Windows x64.
+// Usage:
+// {
+//   RegisterStateCheck reg_check;
+//   FunctionToVerify();
+// }
+class RegisterStateCheck {
+ public:
+  RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); }
+  ~RegisterStateCheck() { EXPECT_TRUE(Check()); }
+
+ private:
+  static bool StoreRegisters(CONTEXT* const context) {
+    const HANDLE this_thread = GetCurrentThread();
+    EXPECT_TRUE(this_thread != NULL);
+    context->ContextFlags = CONTEXT_FLOATING_POINT;
+    const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
+    EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
+    return context_saved;
+  }
+
+  // Compares the register state. Returns true if the states match.
+  bool Check() const {
+    if (!initialized_) return false;
+    CONTEXT post_context;
+    if (!StoreRegisters(&post_context)) return false;
+
+    const M128A* xmm_pre = &pre_context_.Xmm6;
+    const M128A* xmm_post = &post_context.Xmm6;
+    for (int i = 6; i <= 15; ++i) {
+      EXPECT_EQ(*xmm_pre, *xmm_post) << "xmm" << i << " has been modified!";
+      ++xmm_pre;
+      ++xmm_post;
+    }
+    return !testing::Test::HasNonfatalFailure();
+  }
+
+  bool initialized_;
+  CONTEXT pre_context_;
+};
+
+#define REGISTER_STATE_CHECK(statement) do { \
+  libvpx_test::RegisterStateCheck reg_check; \
+  statement;                               \
+} while (false)
+
+}  // namespace libvpx_test
+
+#else  // !_WIN64
+
+namespace libvpx_test {
+
+class RegisterStateCheck {};
+#define REGISTER_STATE_CHECK(statement) statement
+
+}  // namespace libvpx_test
+
+#endif  // _WIN64
+
+#endif  // LIBVPX_TEST_REGISTER_STATE_CHECK_H_
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 8cd528632..72741a901 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -21,6 +21,7 @@ extern "C" {
 }
 
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -65,9 +66,11 @@ class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {
 
   sad_m_by_n_fn_t sad_fn_;
   virtual unsigned int SAD(unsigned int max_sad) {
-    return sad_fn_(source_data_, source_stride_,
-                   reference_data_, reference_stride_,
-                   max_sad);
+    unsigned int ret;
+    REGISTER_STATE_CHECK(ret = sad_fn_(source_data_, source_stride_,
+                                       reference_data_, reference_stride_,
+                                       max_sad));
+    return ret;
   }
 
   // Sum of Absolute Differences. Given two blocks, calculate the absolute
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
index 22d5a8473..2d4581dc0 100644
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
@@ -136,8 +137,8 @@ TEST_P(SixtapPredictTest, TestWithPresetData) {
 
   uint8_t *src = const_cast<uint8_t*>(test_data);
 
-  sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
-                  2, 2, dst_, kDstStride);
+  REGISTER_STATE_CHECK(sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
+                                       2, 2, dst_, kDstStride));
 
   for (int i = 0; i < height_; ++i)
     for (int j = 0; j < width_; ++j)
@@ -162,8 +163,9 @@ TEST_P(SixtapPredictTest, TestWithRandomData) {
                                 xoffset, yoffset, dst_c_, kDstStride);
 
       // Run test.
-      sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
-                      xoffset, yoffset, dst_, kDstStride);
+      REGISTER_STATE_CHECK(
+          sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                          xoffset, yoffset, dst_, kDstStride));
 
       for (int i = 0; i < height_; ++i)
         for (int j = 0; j < width_; ++j)
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index f1c50d398..e7d107392 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -10,6 +10,7 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 extern "C" {
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
@@ -77,7 +78,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
       predictor += kDiffPredStride;
     }
 
-    GetParam()(&be, &bd, kDiffPredStride);
+    REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));
 
     base_src = *be.base_src;
     src_diff = be.src_diff;
diff --git a/test/test.mk b/test/test.mk
index 919cf0438..28d387264 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -1,3 +1,4 @@
+LIBVPX_TEST_SRCS-yes += register_state_check.h
 LIBVPX_TEST_SRCS-yes += test.mk
 LIBVPX_TEST_SRCS-yes += acm_random.h
 
@@ -59,16 +60,18 @@ ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
 LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
+
+# IDCT test currently depends on FDCT function
+LIBVPX_TEST_SRCS-yes                   += idct8x8_test.cc
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 #LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_TX32X32),yesyes)
 LIBVPX_TEST_SRCS-yes += dct32x32_test.cc
 endif
-LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
-LIBVPX_TEST_SRCS-yes += variance_test.cc
 endif # VP9
 
 
diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc
index 52a4fb9d5..5610c2612 100644
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -9,9 +9,10 @@
  */
 #include <string>
 #include "vpx_config.h"
-#if ARCH_X86 || ARCH_X86_64
 extern "C" {
+#if ARCH_X86 || ARCH_X86_64
 #include "vpx_ports/x86.h"
+#endif
 #if CONFIG_VP8
 extern void vp8_rtcd();
 #endif
@@ -19,7 +20,6 @@ extern void vp8_rtcd();
 extern void vp9_rtcd();
 #endif
 }
-#endif
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 static void append_gtest_filter(const char *str) {
@@ -47,12 +47,15 @@ int main(int argc, char **argv) {
     append_gtest_filter(":-SSE4_1/*");
 #endif
 
+#if !CONFIG_SHARED
+  /* Shared library builds don't support whitebox tests that exercise internal symbols. */
 #if CONFIG_VP8
   vp8_rtcd();
 #endif
 #if CONFIG_VP9
   vp9_rtcd();
 #endif
+#endif
 
   return RUN_ALL_TESTS();
 }
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
index 2b1ee851b..8681b7a6a 100644
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -567,46 +567,28 @@ void vp8_loop_filter_partial_frame
     int mb_cols = post->y_width >> 4;
     int mb_rows = post->y_height >> 4;
 
-    int linestocopy, i;
+    int linestocopy;
 
     loop_filter_info_n *lfi_n = &cm->lf_info;
     loop_filter_info lfi;
 
     int filter_level;
-    int alt_flt_enabled = mbd->segmentation_enabled;
     FRAME_TYPE frame_type = cm->frame_type;
 
     const MODE_INFO *mode_info_context;
 
-    int lvl_seg[MAX_MB_SEGMENTS];
+#if 0
+    if(default_filt_lvl == 0) /* no filter applied */
+        return;
+#endif
+
+    /* Initialize the loop filter for this frame. */
+    vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
 
     /* number of MB rows to use in partial filtering */
     linestocopy = mb_rows / PARTIAL_FRAME_FRACTION;
     linestocopy = linestocopy ? linestocopy << 4 : 16;     /* 16 lines per MB */
 
-    /* Note the baseline filter values for each segment */
-    /* See vp8_loop_filter_frame_init. Rather than call that for each change
-     * to default_filt_lvl, copy the relevant calculation here.
-     */
-    if (alt_flt_enabled)
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-        {    /* Abs value */
-            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-            {
-                lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-            }
-            /* Delta Value */
-            else
-            {
-                lvl_seg[i] = default_filt_lvl
-                        + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                lvl_seg[i] = (lvl_seg[i] > 0) ?
-                        ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0;
-            }
-        }
-    }
-
     /* Set up the buffer pointers; partial image starts at ~middle of frame */
     y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride;
     mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
@@ -620,10 +602,12 @@ void vp8_loop_filter_partial_frame
                            mode_info_context->mbmi.mode != SPLITMV &&
                            mode_info_context->mbmi.mb_skip_coeff);
 
-            if (alt_flt_enabled)
-                filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
-            else
-                filter_level = default_filt_lvl;
+            const int mode_index =
+                lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
 
             if (filter_level)
             {
diff --git a/vp8/common/x86/loopfilter_block_sse2.asm b/vp8/common/x86/loopfilter_block_sse2.asm
index 1c445effc..3d45c617b 100644
--- a/vp8/common/x86/loopfilter_block_sse2.asm
+++ b/vp8/common/x86/loopfilter_block_sse2.asm
@@ -150,6 +150,7 @@ sym(vp8_loop_filter_bh_y_sse2):
 
     push    rbp
     mov     rbp, rsp
+    SAVE_XMM 11
     push    r12
     push    r13
     mov     thresh, arg(4)
@@ -258,6 +259,7 @@ LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
 %ifidn __OUTPUT_FORMAT__,x64
     pop    r13
     pop    r12
+    RESTORE_XMM
     pop    rbp
 %endif
 
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index fe774506e..1434bcd93 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -890,6 +890,7 @@ sym(vp8_intra_pred_y_tm_%1):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
     push        rsi
     push        rdi
     GET_GOT     rbx
@@ -957,6 +958,7 @@ vp8_intra_pred_y_tm_%1_loop:
     RESTORE_GOT
     pop         rdi
     pop         rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 13bcaf6c3..c06f24556 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -352,6 +352,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index a714856b0..b18cb5065 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -29,6 +29,13 @@
 #include "error_concealment.h"
 #endif
 
+/* Allocate n zeroed elements for p; macro args are expanded more than once. */
+#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn) do {                      \
+  CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n)));  \
+  memset((p), 0, (n) * sizeof(*(p)));                              \
+} while (0)
+
 extern void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
 
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
@@ -668,11 +675,10 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
         pbi->b_multithreaded_rd = 1;
         pbi->decoding_thread_count = core_count - 1;
 
-        CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
-        CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
-        CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
-        vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
-        CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
+        CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
+        CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count);
+        CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
+        CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
 
         for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
         {
@@ -796,32 +802,32 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
         uv_width = width >>1;
 
         /* Allocate an int for each mb row. */
-        CHECK_MEM_ERROR(pbi->mt_current_mb_col, vpx_malloc(sizeof(int) * pc->mb_rows));
+        CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
 
         /* Allocate memory for above_row buffers. */
-        CHECK_MEM_ERROR(pbi->mt_yabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1))));
 
-        CHECK_MEM_ERROR(pbi->mt_uabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
 
-        CHECK_MEM_ERROR(pbi->mt_vabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
 
         /* Allocate memory for left_col buffers. */
-        CHECK_MEM_ERROR(pbi->mt_yleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1));
 
-        CHECK_MEM_ERROR(pbi->mt_uleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
 
-        CHECK_MEM_ERROR(pbi->mt_vleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
     }
 }
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 27991433b..e666b6c7e 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -118,7 +118,7 @@ static void update_mbintra_mode_probs(VP8_COMP *cpi)
 
         update_mode(
             w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,
-            Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
+            Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->mb.ymode_count
         );
     }
     {
@@ -127,7 +127,7 @@ static void update_mbintra_mode_probs(VP8_COMP *cpi)
 
         update_mode(
             w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
-            Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->uv_mode_count
+            Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->mb.uv_mode_count
         );
     }
 }
@@ -493,7 +493,7 @@ static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACRO
 }
 void vp8_convert_rfct_to_prob(VP8_COMP *const cpi)
 {
-    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int *const rfct = cpi->mb.count_mb_ref_frame_usage;
     const int rf_intra = rfct[INTRA_FRAME];
     const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
 
@@ -539,7 +539,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
     {
         int total_mbs = pc->mb_rows * pc->mb_cols;
 
-        prob_skip_false = (total_mbs - cpi->skip_true_count ) * 256 / total_mbs;
+        prob_skip_false = (total_mbs - cpi->mb.skip_true_count ) * 256 / total_mbs;
 
         if (prob_skip_false <= 1)
             prob_skip_false = 1;
@@ -730,7 +730,7 @@ static void write_kfmodes(VP8_COMP *cpi)
     {
         int total_mbs = c->mb_rows * c->mb_cols;
 
-        prob_skip_false = (total_mbs - cpi->skip_true_count ) * 256 / total_mbs;
+        prob_skip_false = (total_mbs - cpi->mb.skip_true_count ) * 256 / total_mbs;
 
         if (prob_skip_false <= 1)
             prob_skip_false = 1;
@@ -851,6 +851,7 @@ static int prob_update_savings(const unsigned int *ct,
 
 static int independent_coef_context_savings(VP8_COMP *cpi)
 {
+    MACROBLOCK *const x = & cpi->mb;
     int savings = 0;
     int i = 0;
     do
@@ -867,7 +868,7 @@ static int independent_coef_context_savings(VP8_COMP *cpi)
              */
 
             probs = (const unsigned int (*)[MAX_ENTROPY_TOKENS])
-                                                    cpi->coef_counts[i][j];
+                x->coef_counts[i][j];
 
             /* Reset to default probabilities at key frames */
             if (cpi->common.frame_type == KEY_FRAME)
@@ -926,6 +927,7 @@ static int independent_coef_context_savings(VP8_COMP *cpi)
 
 static int default_coef_context_savings(VP8_COMP *cpi)
 {
+    MACROBLOCK *const x = & cpi->mb;
     int savings = 0;
     int i = 0;
     do
@@ -945,7 +947,7 @@ static int default_coef_context_savings(VP8_COMP *cpi)
                     MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
                     cpi->frame_coef_probs [i][j][k],
                     cpi->frame_branch_ct [i][j][k],
-                    cpi->coef_counts [i][j][k],
+                    x->coef_counts [i][j][k],
                     256, 1
                 );
 
@@ -994,7 +996,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
 {
     int savings = 0;
 
-    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int *const rfct = cpi->mb.count_mb_ref_frame_usage;
     const int rf_intra = rfct[INTRA_FRAME];
     const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
     int new_intra, new_last, new_garf, oldtotal, newtotal;
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 0b0a2346a..a30f88816 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -18,6 +18,9 @@
 #include "vp8/common/entropy.h"
 #include "vpx_ports/mem.h"
 
+#define MAX_MODES 20
+#define MAX_ERROR_BINS 1024
+
 /* motion search site */
 typedef struct
 {
@@ -127,7 +130,26 @@ typedef struct macroblock
     unsigned char need_to_clamp_best_mvs;
 #endif
 
-
+    int skip_true_count;
+    unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+    unsigned int MVcount [2] [MVvals];  /* (row,col) MV cts this frame */
+    int ymode_count [VP8_YMODES];        /* intra MB type cts this frame */
+    int uv_mode_count[VP8_UV_MODES];     /* intra MB type cts this frame */
+    int64_t prediction_error;
+    int64_t intra_error;
+    int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+    int rd_thresh_mult[MAX_MODES];
+    int rd_threshes[MAX_MODES];
+    unsigned int mbs_tested_so_far;
+    unsigned int mode_test_hit_counts[MAX_MODES];
+    int zbin_mode_boost_enabled;
+    int zbin_mode_boost;
+    int last_zbin_mode_boost;
+
+    int last_zbin_over_quant;
+    int zbin_over_quant;
+    int error_bins[MAX_ERROR_BINS];
 
     void (*short_fdct4x4)(short *input, short *output, int pitch);
     void (*short_fdct8x4)(short *input, short *output, int pitch);
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 98526d640..1ee1cb59f 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -140,8 +140,7 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
     int i;
     assert(denoiser);
 
-    /* don't need one for intra start at 1 */
-    for (i = 1; i < MAX_REF_FRAMES; i++)
+    for (i = 0; i < MAX_REF_FRAMES; i++)
     {
         denoiser->yv12_running_avg[i].flags = 0;
 
@@ -175,8 +174,7 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
     int i;
     assert(denoiser);
 
-    /* we don't have one for intra ref frame */
-    for (i = 1; i < MAX_REF_FRAMES ; i++)
+    for (i = 0; i < MAX_REF_FRAMES ; i++)
     {
         vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
     }
@@ -291,7 +289,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
     {
         /* Filter. */
         decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
-                                       &denoiser->yv12_running_avg[LAST_FRAME],
+                                       &denoiser->yv12_running_avg[INTRA_FRAME],
                                        x,
                                        motion_magnitude2,
                                        recon_yoffset, recon_uvoffset);
@@ -303,7 +301,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
          */
         vp8_copy_mem16x16(
                 x->thismb, 16,
-                denoiser->yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset,
-                denoiser->yv12_running_avg[LAST_FRAME].y_stride);
+                denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
+                denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
     }
 }
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 2a86b88fd..d1b647be9 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -33,7 +33,7 @@
 #endif
 #include "encodeframe.h"
 
-extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;
 extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
                                      int prob_intra,
                                      int prob_last,
@@ -45,7 +45,6 @@ extern void vp8_auto_select_speed(VP8_COMP *cpi);
 extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
                                       MACROBLOCK *x,
                                       MB_ROW_COMP *mbr_ei,
-                                      int mb_row,
                                       int count);
 static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );
 
@@ -530,7 +529,8 @@ void encode_mb_row(VP8_COMP *cpi,
              * segmentation map
              */
             if ((cpi->current_layer == 0) &&
-                (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled))
+                (cpi->cyclic_refresh_mode_enabled &&
+                 xd->segmentation_enabled))
             {
                 cpi->segmentation_map[map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
 
@@ -642,10 +642,6 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi)
 
     xd->left_context = &cm->left_context;
 
-    vp8_zero(cpi->count_mb_ref_frame_usage)
-    vp8_zero(cpi->ymode_count)
-    vp8_zero(cpi->uv_mode_count)
-
     x->mvc = cm->fc.mvc;
 
     vpx_memset(cm->above_context, 0,
@@ -674,6 +670,43 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi)
     xd->fullpixel_mask = 0xffffffff;
     if(cm->full_pixel)
         xd->fullpixel_mask = 0xfffffff8;
+
+    vp8_zero(x->coef_counts);
+    vp8_zero(x->ymode_count);
+    vp8_zero(x->uv_mode_count)
+    x->prediction_error = 0;
+    x->intra_error = 0;
+    vp8_zero(x->count_mb_ref_frame_usage);
+}
+
+/* Add one encoding thread's coefficient token counts into the frame totals.
+ *
+ * x        - destination; its coef_counts are incremented in place.
+ * x_thread - per-thread source; its coef_counts are only read.
+ *
+ * coef_counts is declared with MAX_ENTROPY_TOKENS entries in its last
+ * dimension (see block.h), so the innermost loop must run over all of
+ * them. Stopping at ENTROPY_NODES would leave the final token's count
+ * out of the sum.
+ */
+static void sum_coef_counts(MACROBLOCK *x, MACROBLOCK *x_thread)
+{
+    int i, j, k, t;
+
+    for (i = 0; i < BLOCK_TYPES; i++)
+    {
+        for (j = 0; j < COEF_BANDS; j++)
+        {
+            for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+            {
+                for (t = 0; t < MAX_ENTROPY_TOKENS; t++)
+                {
+                    x->coef_counts [i][j][k][t] +=
+                        x_thread->coef_counts [i][j][k][t];
+                }
+            }
+        }
+    }
 }
 
 void vp8_encode_frame(VP8_COMP *cpi)
@@ -717,9 +750,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
         xd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
     }
 
-    cpi->prediction_error = 0;
-    cpi->intra_error = 0;
-    cpi->skip_true_count = 0;
+    cpi->mb.skip_true_count = 0;
     cpi->tok_count = 0;
 
 #if 0
@@ -730,13 +761,11 @@ void vp8_encode_frame(VP8_COMP *cpi)
 
     xd->mode_info_context = cm->mi;
 
-    vp8_zero(cpi->MVcount);
-
-    vp8_zero(cpi->coef_counts);
+    vp8_zero(cpi->mb.MVcount);
 
     vp8cx_frame_init_quantizer(cpi);
 
-    vp8_initialize_rd_consts(cpi,
+    vp8_initialize_rd_consts(cpi, x,
                              vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
 
     vp8cx_initialize_me_consts(cpi, cm->base_qindex);
@@ -775,7 +804,8 @@ void vp8_encode_frame(VP8_COMP *cpi)
         {
             int i;
 
-            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);
+            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei,
+                                      cpi->encoding_thread_count);
 
             for (i = 0; i < cm->mb_rows; i++)
                 cpi->mt_current_mb_col[i] = -1;
@@ -837,13 +867,49 @@ void vp8_encode_frame(VP8_COMP *cpi)
 
             for (i = 0; i < cpi->encoding_thread_count; i++)
             {
+                int mode_count;
+                int c_idx;
                 totalrate += cpi->mb_row_ei[i].totalrate;
+
+                cpi->mb.skip_true_count += cpi->mb_row_ei[i].mb.skip_true_count;
+
+                for(mode_count = 0; mode_count < VP8_YMODES; mode_count++)
+                    cpi->mb.ymode_count[mode_count] +=
+                        cpi->mb_row_ei[i].mb.ymode_count[mode_count];
+
+                for(mode_count = 0; mode_count < VP8_UV_MODES; mode_count++)
+                    cpi->mb.uv_mode_count[mode_count] +=
+                        cpi->mb_row_ei[i].mb.uv_mode_count[mode_count];
+
+                for(c_idx = 0; c_idx < MVvals; c_idx++)
+                {
+                    cpi->mb.MVcount[0][c_idx] +=
+                        cpi->mb_row_ei[i].mb.MVcount[0][c_idx];
+                    cpi->mb.MVcount[1][c_idx] +=
+                        cpi->mb_row_ei[i].mb.MVcount[1][c_idx];
+                }
+
+                cpi->mb.prediction_error +=
+                    cpi->mb_row_ei[i].mb.prediction_error;
+                cpi->mb.intra_error += cpi->mb_row_ei[i].mb.intra_error;
+
+                for(c_idx = 0; c_idx < MAX_REF_FRAMES; c_idx++)
+                    cpi->mb.count_mb_ref_frame_usage[c_idx] +=
+                        cpi->mb_row_ei[i].mb.count_mb_ref_frame_usage[c_idx];
+
+                for(c_idx = 0; c_idx < MAX_ERROR_BINS; c_idx++)
+                    cpi->mb.error_bins[c_idx] +=
+                        cpi->mb_row_ei[i].mb.error_bins[c_idx];
+
+                /* add up counts for each thread */
+                sum_coef_counts(x, &cpi->mb_row_ei[i].mb);
             }
 
         }
         else
 #endif
         {
+
             /* for each macroblock row in image */
             for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
             {
@@ -929,13 +995,14 @@ void vp8_encode_frame(VP8_COMP *cpi)
     {
         int tot_modes;
 
-        tot_modes = cpi->count_mb_ref_frame_usage[INTRA_FRAME]
-                    + cpi->count_mb_ref_frame_usage[LAST_FRAME]
-                    + cpi->count_mb_ref_frame_usage[GOLDEN_FRAME]
-                    + cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+        tot_modes = cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME]
+                    + cpi->mb.count_mb_ref_frame_usage[LAST_FRAME]
+                    + cpi->mb.count_mb_ref_frame_usage[GOLDEN_FRAME]
+                    + cpi->mb.count_mb_ref_frame_usage[ALTREF_FRAME];
 
         if (tot_modes)
-            cpi->this_frame_percent_intra = cpi->count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes;
+            cpi->this_frame_percent_intra =
+                cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes;
 
     }
 
@@ -1065,8 +1132,8 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
 
 #endif
 
-    ++cpi->ymode_count[m];
-    ++cpi->uv_mode_count[uvm];
+    ++x->ymode_count[m];
+    ++x->uv_mode_count[uvm];
 
 }
 
@@ -1093,15 +1160,16 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
 #endif
 }
 
-int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+                                  TOKENEXTRA **t)
 {
     MACROBLOCKD *xd = &x->e_mbd;
     int rate;
 
     if (cpi->sf.RD && cpi->compressor_speed != 2)
-        vp8_rd_pick_intra_mode(cpi, x, &rate);
+        vp8_rd_pick_intra_mode(x, &rate);
     else
-        vp8_pick_intra_mode(cpi, x, &rate);
+        vp8_pick_intra_mode(x, &rate);
 
     if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
     {
@@ -1118,7 +1186,7 @@ int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 
     sum_intra_stats(cpi, x);
 
-    vp8_tokenize_mb(cpi, &x->e_mbd, t);
+    vp8_tokenize_mb(cpi, x, t);
 
     if (xd->mode_info_context->mbmi.mode != B_PRED)
         vp8_inverse_transform_mby(xd);
@@ -1165,17 +1233,17 @@ int vp8cx_encode_inter_macroblock
 
     if (cpi->sf.RD)
     {
-        int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
+        int zbin_mode_boost_enabled = x->zbin_mode_boost_enabled;
 
         /* Are we using the fast quantizer for the mode selection? */
         if(cpi->sf.use_fastquant_for_pick)
         {
-            cpi->mb.quantize_b      = vp8_fast_quantize_b;
-            cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
+            x->quantize_b      = vp8_fast_quantize_b;
+            x->quantize_b_pair = vp8_fast_quantize_b_pair;
 
             /* the fast quantizer does not use zbin_extra, so
              * do not recalculate */
-            cpi->zbin_mode_boost_enabled = 0;
+            x->zbin_mode_boost_enabled = 0;
         }
         vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
                                &distortion, &intra_error);
@@ -1183,12 +1251,12 @@ int vp8cx_encode_inter_macroblock
         /* switch back to the regular quantizer for the encode */
         if (cpi->sf.improved_quant)
         {
-            cpi->mb.quantize_b      = vp8_regular_quantize_b;
-            cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
+            x->quantize_b      = vp8_regular_quantize_b;
+            x->quantize_b_pair = vp8_regular_quantize_b_pair;
         }
 
-        /* restore cpi->zbin_mode_boost_enabled */
+        /* restore x->zbin_mode_boost_enabled */
-        cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+        x->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
 
     }
     else
@@ -1197,8 +1265,8 @@ int vp8cx_encode_inter_macroblock
                             &distortion, &intra_error, mb_row, mb_col);
     }
 
-    cpi->prediction_error += distortion;
-    cpi->intra_error += intra_error;
+    x->prediction_error += distortion;
+    x->intra_error += intra_error;
 
     if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
     {
@@ -1234,22 +1302,22 @@ int vp8cx_encode_inter_macroblock
         /* Experimental code. Special case for gf and arf zeromv modes.
          * Increase zbin size to supress noise
          */
-        cpi->zbin_mode_boost = 0;
-        if (cpi->zbin_mode_boost_enabled)
+        x->zbin_mode_boost = 0;
+        if (x->zbin_mode_boost_enabled)
         {
             if ( xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME )
             {
                 if (xd->mode_info_context->mbmi.mode == ZEROMV)
                 {
                     if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
-                        cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+                        x->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
                     else
-                        cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+                        x->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
                 }
                 else if (xd->mode_info_context->mbmi.mode == SPLITMV)
-                    cpi->zbin_mode_boost = 0;
+                    x->zbin_mode_boost = 0;
                 else
-                    cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+                    x->zbin_mode_boost = MV_ZBIN_BOOST;
             }
         }
 
@@ -1259,7 +1327,7 @@ int vp8cx_encode_inter_macroblock
             vp8_update_zbin_extra(cpi, x);
     }
 
-    cpi->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
+    x->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
 
     if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
@@ -1304,7 +1372,7 @@ int vp8cx_encode_inter_macroblock
 
     if (!x->skip)
     {
-        vp8_tokenize_mb(cpi, xd, t);
+        vp8_tokenize_mb(cpi, x, t);
 
         if (xd->mode_info_context->mbmi.mode != B_PRED)
             vp8_inverse_transform_mby(xd);
@@ -1321,12 +1389,12 @@ int vp8cx_encode_inter_macroblock
 
         if (cpi->common.mb_no_coeff_skip)
         {
-            cpi->skip_true_count ++;
+            x->skip_true_count ++;
             vp8_fix_contexts(xd);
         }
         else
         {
-            vp8_stuff_mb(cpi, xd, t);
+            vp8_stuff_mb(cpi, x, t);
         }
     }
 
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index 7d8c84dd3..0c43d0692 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -363,10 +363,12 @@ void vp8_write_mvprobs(VP8_COMP *cpi)
     active_section = 4;
 #endif
     write_component_probs(
-        w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], cpi->MVcount[0], 0, &flags[0]
+        w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0],
+        cpi->mb.MVcount[0], 0, &flags[0]
     );
     write_component_probs(
-        w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1], cpi->MVcount[1], 1, &flags[1]
+        w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1],
+        cpi->mb.MVcount[1], 1, &flags[1]
     );
 
     if (flags[0] || flags[1])
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 4c6e5d870..d4b17cef1 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -17,12 +17,6 @@
 
 #if CONFIG_MULTITHREAD
 
-extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                         TOKENEXTRA **t,
-                                         int recon_yoffset, int recon_uvoffset,
-                                         int mb_row, int mb_col);
-extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                         TOKENEXTRA **t);
 extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip);
 
 extern void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
@@ -220,7 +214,9 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                          * vp8cx_encode_inter_macroblock()) back into the
                          * global segmentation map
                          */
-                        if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+                        if ((cpi->current_layer == 0) &&
+                            (cpi->cyclic_refresh_mode_enabled &&
+                             xd->segmentation_enabled))
                         {
                             const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
                             cpi->segmentation_map[map_index + mb_col] = mbmi->segment_id;
@@ -422,13 +418,23 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
             zd->block[i].dequant = zd->dequant_uv;
         zd->block[24].dequant = zd->dequant_y2;
 #endif
+
+
+        vpx_memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes));
+        vpx_memcpy(z->rd_thresh_mult, x->rd_thresh_mult,
+                   sizeof(x->rd_thresh_mult));
+
+        z->zbin_over_quant = x->zbin_over_quant;
+        z->zbin_mode_boost_enabled = x->zbin_mode_boost_enabled;
+        z->zbin_mode_boost = x->zbin_mode_boost;
+
+        vpx_memset(z->error_bins, 0, sizeof(z->error_bins));
     }
 }
 
 void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
                                MACROBLOCK *x,
                                MB_ROW_COMP *mbr_ei,
-                               int mb_row,
                                int count
                               )
 {
@@ -436,7 +442,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
     VP8_COMMON *const cm = & cpi->common;
     MACROBLOCKD *const xd = & x->e_mbd;
     int i;
-    (void) mb_row;
 
     for (i = 0; i < count; i++)
     {
@@ -477,6 +482,15 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
         mbd->fullpixel_mask = 0xffffffff;
         if(cm->full_pixel)
             mbd->fullpixel_mask = 0xfffffff8;
+
+        vp8_zero(mb->coef_counts);
+        vp8_zero(x->ymode_count);
+        mb->skip_true_count = 0;
+        vp8_zero(mb->MVcount);
+        mb->prediction_error = 0;
+        mb->intra_error = 0;
+        vp8_zero(mb->count_mb_ref_frame_usage);
+        mb->mbs_tested_so_far = 0;
     }
 }
 
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 370e82293..68095ca68 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -570,7 +570,7 @@ void vp8_first_pass(VP8_COMP *cpi)
     /* Initialise the MV cost table to the defaults */
     {
         int flag[2] = {1, 1};
-        vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+        vp8_initialize_rd_consts(cpi, x, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
         vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
         vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
     }
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index aa09a1e3e..6858d411d 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -239,7 +239,7 @@ static void save_layer_context(VP8_COMP *cpi)
     lc->rate_correction_factor           = cpi->rate_correction_factor;
     lc->key_frame_rate_correction_factor = cpi->key_frame_rate_correction_factor;
     lc->gf_rate_correction_factor        = cpi->gf_rate_correction_factor;
-    lc->zbin_over_quant                  = cpi->zbin_over_quant;
+    lc->zbin_over_quant                  = cpi->mb.zbin_over_quant;
     lc->inter_frame_target               = cpi->inter_frame_target;
     lc->total_byte_count                 = cpi->total_byte_count;
     lc->filter_level                     = cpi->common.filter_level;
@@ -247,8 +247,8 @@ static void save_layer_context(VP8_COMP *cpi)
     lc->last_frame_percent_intra         = cpi->last_frame_percent_intra;
 
     memcpy (lc->count_mb_ref_frame_usage,
-            cpi->count_mb_ref_frame_usage,
-            sizeof(cpi->count_mb_ref_frame_usage));
+            cpi->mb.count_mb_ref_frame_usage,
+            sizeof(cpi->mb.count_mb_ref_frame_usage));
 }
 
 static void restore_layer_context(VP8_COMP *cpi, const int layer)
@@ -277,16 +277,16 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer)
     cpi->rate_correction_factor           = lc->rate_correction_factor;
     cpi->key_frame_rate_correction_factor = lc->key_frame_rate_correction_factor;
     cpi->gf_rate_correction_factor        = lc->gf_rate_correction_factor;
-    cpi->zbin_over_quant                  = lc->zbin_over_quant;
+    cpi->mb.zbin_over_quant                  = lc->zbin_over_quant;
     cpi->inter_frame_target               = lc->inter_frame_target;
     cpi->total_byte_count                 = lc->total_byte_count;
     cpi->common.filter_level              = lc->filter_level;
 
     cpi->last_frame_percent_intra         = lc->last_frame_percent_intra;
 
-    memcpy (cpi->count_mb_ref_frame_usage,
+    memcpy (cpi->mb.count_mb_ref_frame_usage,
             lc->count_mb_ref_frame_usage,
-            sizeof(cpi->count_mb_ref_frame_usage));
+            sizeof(cpi->mb.count_mb_ref_frame_usage));
 }
 
 static void setup_features(VP8_COMP *cpi)
@@ -356,8 +356,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi)
     /* Activity mask based per mb zbin adjustments */
     vpx_free(cpi->mb_activity_map);
     cpi->mb_activity_map = 0;
-    vpx_free(cpi->mb_norm_activity_map);
-    cpi->mb_norm_activity_map = 0;
 
     vpx_free(cpi->mb.pip);
     cpi->mb.pip = 0;
@@ -643,11 +641,10 @@ void vp8_set_speed_features(VP8_COMP *cpi)
     for (i = 0; i < MAX_MODES; i ++)
     {
         cpi->mode_check_freq[i] = 0;
-        cpi->mode_test_hit_counts[i] = 0;
         cpi->mode_chosen_counts[i] = 0;
     }
 
-    cpi->mbs_tested_so_far = 0;
+    cpi->mb.mbs_tested_so_far = 0;
 
     /* best quality defaults */
     sf->RD = 1;
@@ -841,7 +838,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 
             for (i = 0; i < min; i++)
             {
-                sum += cpi->error_bins[i];
+                sum += cpi->mb.error_bins[i];
             }
 
             total_skip = sum;
@@ -850,7 +847,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
             /* i starts from 2 to make sure thresh started from 2048 */
             for (; i < 1024; i++)
             {
-                sum += cpi->error_bins[i];
+                sum += cpi->mb.error_bins[i];
 
                 if (10 * sum >= (unsigned int)(cpi->Speed - 6)*(total_mbs - total_skip))
                     break;
@@ -905,7 +902,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
         if (Speed >= 15)
             sf->half_pixel_search = 0;
 
-        vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins));
+        vpx_memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
 
     }; /* switch */
 
@@ -1080,10 +1077,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
     }
 
     /* Data used for real time vc mode to see if gf needs refreshing */
-    cpi->inter_zz_count = 0;
     cpi->zeromv_count = 0;
-    cpi->gf_bad_count = 0;
-    cpi->gf_update_recommended = 0;
 
 
     /* Structures used to monitor GF usage */
@@ -1098,11 +1092,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
                     vpx_calloc(sizeof(*cpi->mb_activity_map),
                     cm->mb_rows * cm->mb_cols));
 
-    vpx_free(cpi->mb_norm_activity_map);
-    CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
-                    vpx_calloc(sizeof(*cpi->mb_norm_activity_map),
-                    cm->mb_rows * cm->mb_cols));
-
     /* allocate memory for storing last frame's MVs for MV prediction. */
     vpx_free(cpi->lfmv);
     CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
@@ -1932,7 +1921,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     /* Set starting values of RD threshold multipliers (128 = *1) */
     for (i = 0; i < MAX_MODES; i++)
     {
-        cpi->rd_thresh_mult[i] = 128;
+        cpi->mb.rd_thresh_mult[i] = 128;
     }
 
 #ifdef ENTROPY_STATS
@@ -2010,7 +1999,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     cpi->refining_search_sad = vp8_refining_search_sad;
 
     /* make sure frame 1 is okay */
-    cpi->error_bins[0] = cpi->common.MBs;
+    cpi->mb.error_bins[0] = cpi->common.MBs;
 
     /* vp8cx_init_quantizer() is first called here. Add check in
      * vp8cx_frame_init_quantizer() so that vp8cx_init_quantizer is only
@@ -2783,10 +2772,14 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
 
         if (cpi->common.frames_since_golden > 1)
         {
-            cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
-            cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
-            cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
-            cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+            cpi->recent_ref_frame_usage[INTRA_FRAME] +=
+                cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME];
+            cpi->recent_ref_frame_usage[LAST_FRAME] +=
+                cpi->mb.count_mb_ref_frame_usage[LAST_FRAME];
+            cpi->recent_ref_frame_usage[GOLDEN_FRAME] +=
+                cpi->mb.count_mb_ref_frame_usage[GOLDEN_FRAME];
+            cpi->recent_ref_frame_usage[ALTREF_FRAME] +=
+                cpi->mb.count_mb_ref_frame_usage[ALTREF_FRAME];
         }
     }
 }
@@ -2798,7 +2791,7 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
 {
     VP8_COMMON *cm = &cpi->common;
 
-    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int *const rfct = cpi->mb.count_mb_ref_frame_usage;
     const int rf_intra = rfct[INTRA_FRAME];
     const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
 
@@ -2865,38 +2858,17 @@ static int decide_key_frame(VP8_COMP *cpi)
 
     if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0))
     {
-        double change = 1.0 * abs((int)(cpi->intra_error - cpi->last_intra_error)) / (1 + cpi->last_intra_error);
-        double change2 = 1.0 * abs((int)(cpi->prediction_error - cpi->last_prediction_error)) / (1 + cpi->last_prediction_error);
+        double change = 1.0 * abs((int)(cpi->mb.intra_error -
+            cpi->last_intra_error)) / (1 + cpi->last_intra_error);
+        double change2 = 1.0 * abs((int)(cpi->mb.prediction_error -
+            cpi->last_prediction_error)) / (1 + cpi->last_prediction_error);
         double minerror = cm->MBs * 256;
 
-#if 0
+        cpi->last_intra_error = cpi->mb.intra_error;
+        cpi->last_prediction_error = cpi->mb.prediction_error;
 
-        if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15
-            && cpi->prediction_error > minerror
-            && (change > .25 || change2 > .25))
-        {
-            FILE *f = fopen("intra_inter.stt", "a");
-
-            if (cpi->prediction_error <= 0)
-                cpi->prediction_error = 1;
-
-            fprintf(f, "%d %d %d %d %14.4f\n",
-                    cm->current_video_frame,
-                    (int) cpi->prediction_error,
-                    (int) cpi->intra_error,
-                    (int)((10 * cpi->intra_error) / cpi->prediction_error),
-                    change);
-
-            fclose(f);
-        }
-
-#endif
-
-        cpi->last_intra_error = cpi->intra_error;
-        cpi->last_prediction_error = cpi->prediction_error;
-
-        if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15
-            && cpi->prediction_error > minerror
+        if (10 * cpi->mb.intra_error / (1 + cpi->mb.prediction_error) < 15
+            && cpi->mb.prediction_error > minerror
             && (change > .25 || change2 > .25))
         {
             /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/
@@ -3160,6 +3132,57 @@ static void update_reference_frames(VP8_COMP *cpi)
         cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
 #endif
     }
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity)
+    {
+        /* we shouldn't have to keep multiple copies as we know in advance which
+         * buffer we should start - for now to get something up and running
+         * I've chosen to copy the buffers
+         */
+        if (cm->frame_type == KEY_FRAME)
+        {
+            int i;
+            vp8_yv12_copy_frame(
+                    cpi->Source,
+                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+            vp8_yv12_extend_frame_borders(
+                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+            for (i = 2; i < MAX_REF_FRAMES - 1; i++)
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[LAST_FRAME],
+                        &cpi->denoiser.yv12_running_avg[i]);
+        }
+        else /* For non key frames */
+        {
+            vp8_yv12_extend_frame_borders(
+                    &cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
+
+            if (cm->refresh_alt_ref_frame || cm->copy_buffer_to_arf)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME],
+                        &cpi->denoiser.yv12_running_avg[ALTREF_FRAME]);
+            }
+            if (cm->refresh_golden_frame || cm->copy_buffer_to_gf)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME],
+                        &cpi->denoiser.yv12_running_avg[GOLDEN_FRAME]);
+            }
+            if(cm->refresh_last_frame)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME],
+                        &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+            }
+        }
+
+    }
+#endif
+
 }
 
 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
@@ -3203,51 +3226,6 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
     }
 
     vp8_yv12_extend_frame_borders(cm->frame_to_show);
-#if CONFIG_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity)
-    {
-
-
-        /* we shouldn't have to keep multiple copies as we know in advance which
-         * buffer we should start - for now to get something up and running
-         * I've chosen to copy the buffers
-         */
-        if (cm->frame_type == KEY_FRAME)
-        {
-            int i;
-            vp8_yv12_copy_frame(
-                    cpi->Source,
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            vp8_yv12_extend_frame_borders(
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            for (i = 2; i < MAX_REF_FRAMES - 1; i++)
-                vp8_yv12_copy_frame(
-                        cpi->Source,
-                        &cpi->denoiser.yv12_running_avg[i]);
-        }
-        else /* For non key frames */
-        {
-            vp8_yv12_extend_frame_borders(
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            if (cm->refresh_alt_ref_frame || cm->copy_buffer_to_arf)
-            {
-                vp8_yv12_copy_frame(
-                        &cpi->denoiser.yv12_running_avg[LAST_FRAME],
-                        &cpi->denoiser.yv12_running_avg[ALTREF_FRAME]);
-            }
-            if (cm->refresh_golden_frame || cm->copy_buffer_to_gf)
-            {
-                vp8_yv12_copy_frame(
-                        &cpi->denoiser.yv12_running_avg[LAST_FRAME],
-                        &cpi->denoiser.yv12_running_avg[GOLDEN_FRAME]);
-            }
-        }
-
-    }
-#endif
 
 }
 
@@ -3331,19 +3309,19 @@ static void encode_frame_to_data_rate
     cm->copy_buffer_to_arf = 0;
 
     /* Clear zbin over-quant value and mode boost values. */
-    cpi->zbin_over_quant = 0;
-    cpi->zbin_mode_boost = 0;
+    cpi->mb.zbin_over_quant = 0;
+    cpi->mb.zbin_mode_boost = 0;
 
     /* Enable or disable mode based tweaking of the zbin
      * For 2 Pass Only used where GF/ARF prediction quality
      * is above a threshold
      */
-    cpi->zbin_mode_boost_enabled = 1;
+    cpi->mb.zbin_mode_boost_enabled = 1;
     if (cpi->pass == 2)
     {
         if ( cpi->gfu_boost <= 400 )
         {
-            cpi->zbin_mode_boost_enabled = 0;
+            cpi->mb.zbin_mode_boost_enabled = 0;
         }
     }
 
@@ -3410,7 +3388,7 @@ static void encode_frame_to_data_rate
         /* Reset the RD threshold multipliers to default of * 1 (128) */
         for (i = 0; i < MAX_MODES; i++)
         {
-            cpi->rd_thresh_mult[i] = 128;
+            cpi->mb.rd_thresh_mult[i] = 128;
         }
     }
 
@@ -4099,8 +4077,9 @@ static void encode_frame_to_data_rate
                 q_low = (Q < q_high) ? (Q + 1) : q_high;
 
                 /* If we are using over quant do the same for zbin_oq_low */
-                if (cpi->zbin_over_quant > 0)
-                    zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+                if (cpi->mb.zbin_over_quant > 0)
+                    zbin_oq_low = (cpi->mb.zbin_over_quant < zbin_oq_high) ?
+                        (cpi->mb.zbin_over_quant + 1) : zbin_oq_high;
 
                 if (undershoot_seen)
                 {
@@ -4116,11 +4095,13 @@ static void encode_frame_to_data_rate
                      * is max)
                      */
                     if (Q < MAXQ)
-                        cpi->zbin_over_quant = 0;
+                        cpi->mb.zbin_over_quant = 0;
                     else
                     {
-                        zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
-                        cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+                        zbin_oq_low = (cpi->mb.zbin_over_quant < zbin_oq_high) ?
+                            (cpi->mb.zbin_over_quant + 1) : zbin_oq_high;
+                        cpi->mb.zbin_over_quant =
+                            (zbin_oq_high + zbin_oq_low) / 2;
                     }
                 }
                 else
@@ -4133,7 +4114,9 @@ static void encode_frame_to_data_rate
 
                     Q = vp8_regulate_q(cpi, cpi->this_frame_target);
 
-                    while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10))
+                    while (((Q < q_low) ||
+                        (cpi->mb.zbin_over_quant < zbin_oq_low)) &&
+                        (Retries < 10))
                     {
                         vp8_update_rate_correction_factors(cpi, 0);
                         Q = vp8_regulate_q(cpi, cpi->this_frame_target);
@@ -4146,12 +4129,13 @@ static void encode_frame_to_data_rate
             /* Frame is too small */
             else
             {
-                if (cpi->zbin_over_quant == 0)
+                if (cpi->mb.zbin_over_quant == 0)
                     /* Lower q_high if not using over quant */
                     q_high = (Q > q_low) ? (Q - 1) : q_low;
                 else
                     /* else lower zbin_oq_high */
-                    zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+                    zbin_oq_high = (cpi->mb.zbin_over_quant > zbin_oq_low) ?
+                        (cpi->mb.zbin_over_quant - 1) : zbin_oq_low;
 
                 if (overshoot_seen)
                 {
@@ -4167,9 +4151,10 @@ static void encode_frame_to_data_rate
                      * is max)
                      */
                     if (Q < MAXQ)
-                        cpi->zbin_over_quant = 0;
+                        cpi->mb.zbin_over_quant = 0;
                     else
-                        cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+                        cpi->mb.zbin_over_quant =
+                            (zbin_oq_high + zbin_oq_low) / 2;
                 }
                 else
                 {
@@ -4192,7 +4177,9 @@ static void encode_frame_to_data_rate
                         q_low = Q;
                     }
 
-                    while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10))
+                    while (((Q > q_high) ||
+                        (cpi->mb.zbin_over_quant > zbin_oq_high)) &&
+                        (Retries < 10))
                     {
                         vp8_update_rate_correction_factors(cpi, 0);
                         Q = vp8_regulate_q(cpi, cpi->this_frame_target);
@@ -4210,7 +4197,9 @@ static void encode_frame_to_data_rate
                 Q = q_low;
 
             /* Clamp cpi->zbin_over_quant */
-            cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? zbin_oq_high : cpi->zbin_over_quant;
+            cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low) ?
+                zbin_oq_low : (cpi->mb.zbin_over_quant > zbin_oq_high) ?
+                    zbin_oq_high : cpi->mb.zbin_over_quant;
 
             Loop = Q != last_q;
         }
@@ -4292,7 +4281,6 @@ static void encode_frame_to_data_rate
         /* Point to beginning of MODE_INFO arrays. */
         MODE_INFO *tmp = cm->mi;
 
-        cpi->inter_zz_count = 0;
         cpi->zeromv_count = 0;
 
         if(cm->frame_type != KEY_FRAME)
@@ -4301,8 +4289,6 @@ static void encode_frame_to_data_rate
             {
                 for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
                 {
-                    if(tmp->mbmi.mode == ZEROMV && tmp->mbmi.ref_frame == LAST_FRAME)
-                        cpi->inter_zz_count++;
                     if(tmp->mbmi.mode == ZEROMV)
                         cpi->zeromv_count++;
                     tmp++;
@@ -4732,67 +4718,6 @@ static void encode_frame_to_data_rate
 
 
 }
-
-
-static void check_gf_quality(VP8_COMP *cpi)
-{
-    VP8_COMMON *cm = &cpi->common;
-    int gf_active_pct = (100 * cpi->gf_active_count) / (cm->mb_rows * cm->mb_cols);
-    int gf_ref_usage_pct = (cpi->count_mb_ref_frame_usage[GOLDEN_FRAME] * 100) / (cm->mb_rows * cm->mb_cols);
-    int last_ref_zz_useage = (cpi->inter_zz_count * 100) / (cm->mb_rows * cm->mb_cols);
-
-    /* Gf refresh is not currently being signalled */
-    if (cpi->gf_update_recommended == 0)
-    {
-        if (cpi->common.frames_since_golden > 7)
-        {
-            /* Low use of gf */
-            if ((gf_active_pct < 10) || ((gf_active_pct + gf_ref_usage_pct) < 15))
-            {
-                /* ...but last frame zero zero usage is reasonbable so a
-                 * new gf might be appropriate
-                 */
-                if (last_ref_zz_useage >= 25)
-                {
-                    cpi->gf_bad_count ++;
-
-                    /* Check that the condition is stable */
-                    if (cpi->gf_bad_count >= 8)
-                    {
-                        cpi->gf_update_recommended = 1;
-                        cpi->gf_bad_count = 0;
-                    }
-                }
-                else
-                    /* Restart count as the background is not stable enough */
-                    cpi->gf_bad_count = 0;
-            }
-            else
-                /* Gf useage has picked up so reset count */
-                cpi->gf_bad_count = 0;
-        }
-    }
-    /* If the signal is set but has not been read should we cancel it. */
-    else if (last_ref_zz_useage < 15)
-    {
-        cpi->gf_update_recommended = 0;
-        cpi->gf_bad_count = 0;
-    }
-
-#if 0
-    {
-        FILE *f = fopen("gfneeded.stt", "a");
-        fprintf(f, "%10d %10d %10d %10d %10ld \n",
-                cm->current_video_frame,
-                cpi->common.frames_since_golden,
-                gf_active_pct, gf_ref_usage_pct,
-                cpi->gf_update_recommended);
-        fclose(f);
-    }
-
-#endif
-}
-
 #if !(CONFIG_REALTIME_ONLY)
 static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned char * dest_end, unsigned int *frame_flags)
 {
@@ -5096,8 +5021,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
 
     if (cpi->compressor_speed == 2)
     {
-        if (cpi->oxcf.number_of_layers == 1)
-            check_gf_quality(cpi);
         vpx_usec_timer_start(&tsctimer);
         vpx_usec_timer_start(&ticktimer);
     }
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 3f1fad60b..fb8ad357c 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -43,7 +43,7 @@
 #define AF_THRESH   25
 #define AF_THRESH2  100
 #define ARF_DECAY_THRESH 12
-#define MAX_MODES 20
+
 
 #define MIN_THRESHMULT  32
 #define MAX_THRESHMULT  512
@@ -349,13 +349,9 @@ typedef struct VP8_COMP
     int ambient_err;
 
     unsigned int mode_check_freq[MAX_MODES];
-    unsigned int mode_test_hit_counts[MAX_MODES];
     unsigned int mode_chosen_counts[MAX_MODES];
-    unsigned int mbs_tested_so_far;
 
-    int rd_thresh_mult[MAX_MODES];
     int rd_baseline_thresh[MAX_MODES];
-    int rd_threshes[MAX_MODES];
 
     int RDMULT;
     int RDDIV ;
@@ -363,9 +359,7 @@ typedef struct VP8_COMP
     CODING_CONTEXT coding_context;
 
     /* Rate targetting variables */
-    int64_t prediction_error;
     int64_t last_prediction_error;
-    int64_t intra_error;
     int64_t last_intra_error;
 
     int this_frame_target;
@@ -418,12 +412,6 @@ typedef struct VP8_COMP
     int ni_frames;
     int avg_frame_qindex;
 
-    int zbin_over_quant;
-    int zbin_mode_boost;
-    int zbin_mode_boost_enabled;
-    int last_zbin_over_quant;
-    int last_zbin_mode_boost;
-
     int64_t total_byte_count;
 
     int buffered_mode;
@@ -452,13 +440,6 @@ typedef struct VP8_COMP
     int drop_frames_allowed; /* Are we permitted to drop frames? */
     int drop_frame;          /* Drop this frame? */
 
-    int ymode_count [VP8_YMODES];        /* intra MB type cts this frame */
-    int uv_mode_count[VP8_UV_MODES];     /* intra MB type cts this frame */
-
-    unsigned int MVcount [2] [MVvals];  /* (row,col) MV cts this frame */
-
-    unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-
     vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
     char update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
 
@@ -486,7 +467,6 @@ typedef struct VP8_COMP
     int Speed;
     int compressor_speed;
 
-    int interquantizer;
     int auto_gold;
     int auto_adjust_gold_quantizer;
     int auto_worst_q;
@@ -502,25 +482,16 @@ typedef struct VP8_COMP
     int last_skip_probs_q[3];
     int recent_ref_frame_usage[MAX_REF_FRAMES];
 
-    int count_mb_ref_frame_usage[MAX_REF_FRAMES];
     int this_frame_percent_intra;
     int last_frame_percent_intra;
 
     int ref_frame_flags;
 
     SPEED_FEATURES sf;
-    int error_bins[1024];
 
-    /* Data used for real time conferencing mode to help determine if it
-     * would be good to update the gf
-     */
-    int inter_zz_count;
     /* Count ZEROMV on all reference frames. */
     int zeromv_count;
     int lf_zeromv_pct;
-    int gf_bad_count;
-    int gf_update_recommended;
-    int skip_true_count;
 
     unsigned char *segmentation_map;
     signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
@@ -659,7 +630,6 @@ typedef struct VP8_COMP
     /* Per MB activity measurement */
     unsigned int activity_avg;
     unsigned int * mb_activity_map;
-    int * mb_norm_activity_map;
 
     /* Record of which MBs still refer to last golden frame either
      * directly or through 0,0
@@ -723,13 +693,10 @@ typedef struct VP8_COMP
     } rd_costs;
 } VP8_COMP;
 
-void control_data_rate(VP8_COMP *cpi);
-
-void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char *dest_end, unsigned long *size);
-
-int rd_cost_intra_mb(MACROBLOCKD *x);
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
+                        unsigned char *dest_end, unsigned long *size);
 
-void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);
+void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **);
 
 void vp8_set_speed_features(VP8_COMP *cpi);
 
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index c4fa691a4..673de2b33 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -389,15 +389,16 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb)
 
 }
 
-static void update_mvcount(VP8_COMP *cpi, MACROBLOCKD *xd, int_mv *best_ref_mv)
+static void update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
 {
+    MACROBLOCKD *xd = &x->e_mbd;
     /* Split MV modes currently not supported when RD is nopt enabled,
      * therefore, only need to modify MVcount in NEWMV mode. */
     if (xd->mode_info_context->mbmi.mode == NEWMV)
     {
-        cpi->MVcount[0][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.row -
+        x->MVcount[0][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.row -
                                       best_ref_mv->as_mv.row) >> 1)]++;
-        cpi->MVcount[1][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.col -
+        x->MVcount[1][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.col -
                                       best_ref_mv->as_mv.col) >> 1)]++;
     }
 }
@@ -679,7 +680,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
 
     /* Count of the number of MBs tested so far this frame */
-    cpi->mbs_tested_so_far++;
+    x->mbs_tested_so_far++;
 
     *returnintra = INT_MAX;
     x->skip = 0;
@@ -700,7 +701,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         int this_rd = INT_MAX;
         int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
 
-        if (best_rd <= cpi->rd_threshes[mode_index])
+        if (best_rd <= x->rd_threshes[mode_index])
             continue;
 
         if (this_ref_frame < 0)
@@ -745,22 +746,22 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         /* Check to see if the testing frequency for this mode is at its max
          * If so then prevent it from being tested and increase the threshold
          * for its testing */
-        if (cpi->mode_test_hit_counts[mode_index] &&
+        if (x->mode_test_hit_counts[mode_index] &&
                                          (cpi->mode_check_freq[mode_index] > 1))
         {
-            if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] *
-                                         cpi->mode_test_hit_counts[mode_index]))
+            if (x->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] *
+                                         x->mode_test_hit_counts[mode_index]))
             {
                 /* Increase the threshold for coding this mode to make it less
                  * likely to be chosen */
-                cpi->rd_thresh_mult[mode_index] += 4;
+                x->rd_thresh_mult[mode_index] += 4;
 
-                if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-                    cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+                if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                    x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
 
-                cpi->rd_threshes[mode_index] =
+                x->rd_threshes[mode_index] =
                                  (cpi->rd_baseline_thresh[mode_index] >> 7) *
-                                 cpi->rd_thresh_mult[mode_index];
+                                 x->rd_thresh_mult[mode_index];
                 continue;
             }
         }
@@ -768,7 +769,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         /* We have now reached the point where we are going to test the current
          * mode so increment the counter for the number of times it has been
          * tested */
-        cpi->mode_test_hit_counts[mode_index] ++;
+        x->mode_test_hit_counts[mode_index] ++;
 
         rate2 = 0;
         distortion2 = 0;
@@ -1108,12 +1109,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             /* Testing this mode gave rise to an improvement in best error
              * score. Lower threshold a bit for next time
              */
-            cpi->rd_thresh_mult[mode_index] =
-                     (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-                     cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-            cpi->rd_threshes[mode_index] =
+            x->rd_thresh_mult[mode_index] =
+                     (x->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+                     x->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+            x->rd_threshes[mode_index] =
                                    (cpi->rd_baseline_thresh[mode_index] >> 7) *
-                                   cpi->rd_thresh_mult[mode_index];
+                                   x->rd_thresh_mult[mode_index];
         }
 
         /* If the mode did not help improve the best error case then raise the
@@ -1121,14 +1122,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
          */
         else
         {
-            cpi->rd_thresh_mult[mode_index] += 4;
+            x->rd_thresh_mult[mode_index] += 4;
 
-            if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-                cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+            if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
 
-            cpi->rd_threshes[mode_index] =
+            x->rd_threshes[mode_index] =
                          (cpi->rd_baseline_thresh[mode_index] >> 7) *
-                         cpi->rd_thresh_mult[mode_index];
+                         x->rd_thresh_mult[mode_index];
         }
 
         if (x->skip)
@@ -1138,16 +1139,16 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     /* Reduce the activation RD thresholds for the best choice mode */
     if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
     {
-        int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3);
+        int best_adjustment = (x->rd_thresh_mult[best_mode_index] >> 3);
 
-        cpi->rd_thresh_mult[best_mode_index] =
-                        (cpi->rd_thresh_mult[best_mode_index]
+        x->rd_thresh_mult[best_mode_index] =
+                        (x->rd_thresh_mult[best_mode_index]
                         >= (MIN_THRESHMULT + best_adjustment)) ?
-                        cpi->rd_thresh_mult[best_mode_index] - best_adjustment :
+                        x->rd_thresh_mult[best_mode_index] - best_adjustment :
                         MIN_THRESHMULT;
-        cpi->rd_threshes[best_mode_index] =
+        x->rd_threshes[best_mode_index] =
                         (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
-                        cpi->rd_thresh_mult[best_mode_index];
+                        x->rd_thresh_mult[best_mode_index];
     }
 
 
@@ -1159,7 +1160,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             this_rdbin = 1023;
         }
 
-        cpi->error_bins[this_rdbin] ++;
+        x->error_bins[this_rdbin] ++;
     }
 
 #if CONFIG_TEMPORAL_DENOISING
@@ -1240,11 +1241,11 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
       != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
         best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
 
-    update_mvcount(cpi, &x->e_mbd, &best_ref_mv);
+    update_mvcount(cpi, x, &best_ref_mv);
 }
 
 
-void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
+void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
 {
     int error4x4, error16x16 = INT_MAX;
     int rate, best_rate = 0, distortion, best_sse;
diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
index 6fbd88795..35011cab3 100644
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h
@@ -18,7 +18,7 @@ extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                                 int recon_uvoffset, int *returnrate,
                                 int *returndistortion, int *returnintra,
                                 int mb_row, int mb_col);
-extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
+extern void vp8_pick_intra_mode(MACROBLOCK *x, int *rate);
 
 extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
                                       const vp8_variance_fn_ptr_t *vfp,
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 88fea11bb..33c8ef055 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -587,20 +587,20 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
 
 #define ZBIN_EXTRA_Y \
     (( cpi->common.Y1dequant[QIndex][1] *  \
-    ( cpi->zbin_over_quant +  \
-      cpi->zbin_mode_boost +  \
+    ( x->zbin_over_quant +  \
+      x->zbin_mode_boost +  \
       x->act_zbin_adj ) ) >> 7)
 
 #define ZBIN_EXTRA_UV \
     (( cpi->common.UVdequant[QIndex][1] *  \
-    ( cpi->zbin_over_quant +  \
-      cpi->zbin_mode_boost +  \
+    ( x->zbin_over_quant +  \
+      x->zbin_mode_boost +  \
       x->act_zbin_adj ) ) >> 7)
 
 #define ZBIN_EXTRA_Y2 \
     (( cpi->common.Y2dequant[QIndex][1] *  \
-    ( (cpi->zbin_over_quant / 2) +  \
-       cpi->zbin_mode_boost +  \
+    ( (x->zbin_over_quant / 2) +  \
+       x->zbin_mode_boost +  \
        x->act_zbin_adj ) ) >> 7)
 
 void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
@@ -702,15 +702,15 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
         /* save this macroblock QIndex for vp8_update_zbin_extra() */
         x->q_index = QIndex;
 
-        cpi->last_zbin_over_quant = cpi->zbin_over_quant;
-        cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
+        x->last_zbin_over_quant = x->zbin_over_quant;
+        x->last_zbin_mode_boost = x->zbin_mode_boost;
         x->last_act_zbin_adj = x->act_zbin_adj;
 
 
 
     }
-    else if(cpi->last_zbin_over_quant != cpi->zbin_over_quant
-            || cpi->last_zbin_mode_boost != cpi->zbin_mode_boost
+    else if(x->last_zbin_over_quant != x->zbin_over_quant
+            || x->last_zbin_mode_boost != x->zbin_mode_boost
             || x->last_act_zbin_adj != x->act_zbin_adj)
     {
         /* Y */
@@ -729,8 +729,8 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
         zbin_extra = ZBIN_EXTRA_Y2;
         x->block[24].zbin_extra = (short)zbin_extra;
 
-        cpi->last_zbin_over_quant = cpi->zbin_over_quant;
-        cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
+        x->last_zbin_over_quant = x->zbin_over_quant;
+        x->last_zbin_mode_boost = x->zbin_mode_boost;
         x->last_act_zbin_adj = x->act_zbin_adj;
     }
 }
@@ -764,7 +764,7 @@ void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x)
 void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
 {
     /* Clear Zbin mode boost for default case */
-    cpi->zbin_mode_boost = 0;
+    cpi->mb.zbin_mode_boost = 0;
 
     /* MB level quantizer setup */
     vp8cx_mb_init_quantizer(cpi, &cpi->mb, 0);
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 1432c143b..a399a3877 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -242,8 +242,8 @@ void vp8_save_coding_context(VP8_COMP *cpi)
     vp8_copy(cc->ymode_prob,   cpi->common.fc.ymode_prob);
     vp8_copy(cc->uv_mode_prob,  cpi->common.fc.uv_mode_prob);
 
-    vp8_copy(cc->ymode_count, cpi->ymode_count);
-    vp8_copy(cc->uv_mode_count, cpi->uv_mode_count);
+    vp8_copy(cc->ymode_count, cpi->mb.ymode_count);
+    vp8_copy(cc->uv_mode_count, cpi->mb.uv_mode_count);
 
 
     /* Stats */
@@ -280,8 +280,8 @@ void vp8_restore_coding_context(VP8_COMP *cpi)
     vp8_copy(cpi->common.fc.ymode_prob,   cc->ymode_prob);
     vp8_copy(cpi->common.fc.uv_mode_prob,  cc->uv_mode_prob);
 
-    vp8_copy(cpi->ymode_count, cc->ymode_count);
-    vp8_copy(cpi->uv_mode_count, cc->uv_mode_count);
+    vp8_copy(cpi->mb.ymode_count, cc->ymode_count);
+    vp8_copy(cpi->mb.uv_mode_count, cc->uv_mode_count);
 
     /* Stats */
 #ifdef MODE_STATS
@@ -1109,7 +1109,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
     }
     else
     {
-        if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+        if (cpi->oxcf.number_of_layers == 1 &&
+           (cpi->common.refresh_alt_ref_frame ||
+            cpi->common.refresh_golden_frame))
             rate_correction_factor = cpi->gf_rate_correction_factor;
         else
             rate_correction_factor = cpi->rate_correction_factor;
@@ -1122,9 +1124,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
     projected_size_based_on_q = (int)(((.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
 
-    /* Make some allowance for cpi->zbin_over_quant */
+    /* Make some allowance for cpi->mb.zbin_over_quant */
-    if (cpi->zbin_over_quant > 0)
+    if (cpi->mb.zbin_over_quant > 0)
     {
-        int Z = cpi->zbin_over_quant;
+        int Z = cpi->mb.zbin_over_quant;
         double Factor = 0.99;
         double factor_adjustment = 0.01 / 256.0;
 
@@ -1186,7 +1188,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
         cpi->key_frame_rate_correction_factor = rate_correction_factor;
     else
     {
-        if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+        if (cpi->oxcf.number_of_layers == 1 &&
+           (cpi->common.refresh_alt_ref_frame ||
+            cpi->common.refresh_golden_frame))
             cpi->gf_rate_correction_factor = rate_correction_factor;
         else
             cpi->rate_correction_factor = rate_correction_factor;
@@ -1199,7 +1203,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
     int Q = cpi->active_worst_quality;
 
     /* Reset Zbin OQ value */
-    cpi->zbin_over_quant = 0;
+    cpi->mb.zbin_over_quant = 0;
 
     if (cpi->oxcf.fixed_q >= 0)
     {
@@ -1209,11 +1213,13 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
         {
             Q = cpi->oxcf.key_q;
         }
-        else if (cpi->common.refresh_alt_ref_frame)
+        else if (cpi->oxcf.number_of_layers == 1 &&
+            cpi->common.refresh_alt_ref_frame)
         {
             Q = cpi->oxcf.alt_q;
         }
-        else if (cpi->common.refresh_golden_frame)
+        else if (cpi->oxcf.number_of_layers == 1  &&
+            cpi->common.refresh_golden_frame)
         {
             Q = cpi->oxcf.gold_q;
         }
@@ -1232,7 +1238,9 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
             correction_factor = cpi->key_frame_rate_correction_factor;
         else
         {
-            if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+            if (cpi->oxcf.number_of_layers == 1 &&
+               (cpi->common.refresh_alt_ref_frame ||
+                cpi->common.refresh_golden_frame))
                 correction_factor = cpi->gf_rate_correction_factor;
             else
                 correction_factor = cpi->rate_correction_factor;
@@ -1281,7 +1289,10 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
 
             if (cpi->common.frame_type == KEY_FRAME)
                 zbin_oqmax = 0;
-            else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+            else if (cpi->oxcf.number_of_layers == 1 &&
+                (cpi->common.refresh_alt_ref_frame ||
+                (cpi->common.refresh_golden_frame &&
+                 !cpi->source_alt_ref_active)))
                 zbin_oqmax = 16;
             else
                 zbin_oqmax = ZBIN_OQ_MAX;
@@ -1307,12 +1318,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
              * normal maximum by expanding the zero bin and hence
              * decreasing the number of low magnitude non zero coefficients.
              */
-            while (cpi->zbin_over_quant < zbin_oqmax)
+            while (cpi->mb.zbin_over_quant < zbin_oqmax)
             {
-                cpi->zbin_over_quant ++;
+                cpi->mb.zbin_over_quant ++;
 
-                if (cpi->zbin_over_quant > zbin_oqmax)
-                    cpi->zbin_over_quant = zbin_oqmax;
+                if (cpi->mb.zbin_over_quant > zbin_oqmax)
+                    cpi->mb.zbin_over_quant = zbin_oqmax;
 
                 /* Adjust bits_per_mb_at_this_q estimate */
                 bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 946d22a8f..f0ec7b6e2 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -223,7 +223,7 @@ void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex)
     cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
 }
 
-void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
+void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue)
 {
     int q;
     int i;
@@ -238,15 +238,15 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
     cpi->RDMULT = (int)(rdconst * (capped_q * capped_q));
 
     /* Extend rate multiplier along side quantizer zbin increases */
-    if (cpi->zbin_over_quant  > 0)
+    if (cpi->mb.zbin_over_quant  > 0)
     {
         double oq_factor;
         double modq;
 
         /* Experimental code using the same basic equation as used for Q above
-         * The units of cpi->zbin_over_quant are 1/128 of Q bin size
+         * The units of cpi->mb.zbin_over_quant are 1/128 of Q bin size
          */
-        oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
+        oq_factor = 1.0 + ((double)0.0015625 * cpi->mb.zbin_over_quant);
         modq = (int)((double)capped_q * oq_factor);
         cpi->RDMULT = (int)(rdconst * (modq * modq));
     }
@@ -265,6 +265,11 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
 
     vp8_set_speed_features(cpi);
 
+    for (i = 0; i < MAX_MODES; i++)
+    {
+        x->mode_test_hit_counts[i] = 0;
+    }
+
     q = (int)pow(Qvalue, 1.25);
 
     if (q < 8)
@@ -279,14 +284,14 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
         {
             if (cpi->sf.thresh_mult[i] < INT_MAX)
             {
-                cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
+                x->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
             }
             else
             {
-                cpi->rd_threshes[i] = INT_MAX;
+                x->rd_threshes[i] = INT_MAX;
             }
 
-            cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+            cpi->rd_baseline_thresh[i] = x->rd_threshes[i];
         }
     }
     else
@@ -297,14 +302,14 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
         {
             if (cpi->sf.thresh_mult[i] < (INT_MAX / q))
             {
-                cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+                x->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
             }
             else
             {
-                cpi->rd_threshes[i] = INT_MAX;
+                x->rd_threshes[i] = INT_MAX;
             }
 
-            cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+            cpi->rd_baseline_thresh[i] = x->rd_threshes[i];
         }
     }
 
@@ -625,7 +630,6 @@ static void copy_predictor(unsigned char *dst, const unsigned char *predictor)
     d[12] = p[12];
 }
 static int rd_pick_intra4x4block(
-    VP8_COMP *cpi,
     MACROBLOCK *x,
     BLOCK *be,
     BLOCKD *b,
@@ -701,7 +705,7 @@ static int rd_pick_intra4x4block(
     return best_rd;
 }
 
-static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
+static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate,
                                      int *rate_y, int *Distortion, int best_rd)
 {
     MACROBLOCKD *const xd = &mb->e_mbd;
@@ -741,7 +745,7 @@ static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
         }
 
         total_rd += rd_pick_intra4x4block(
-            cpi, mb, mb->block + i, xd->block + i, &best_mode, bmode_costs,
+            mb, mb->block + i, xd->block + i, &best_mode, bmode_costs,
             ta + vp8_block2above[i],
             tl + vp8_block2left[i], &r, &ry, &d);
 
@@ -766,8 +770,7 @@ static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
 }
 
 
-static int rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
-                                      MACROBLOCK *x,
+static int rd_pick_intra16x16mby_mode(MACROBLOCK *x,
                                       int *Rate,
                                       int *rate_y,
                                       int *Distortion)
@@ -869,7 +872,8 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
     return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static void rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion)
+static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
+                                    int *rate_tokenonly, int *distortion)
 {
     MB_PREDICTION_MODE mode;
     MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
@@ -1739,18 +1743,18 @@ static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
         {
             if (x->partition_info->bmi[i].mode == NEW4X4)
             {
-                cpi->MVcount[0][mv_max+((x->partition_info->bmi[i].mv.as_mv.row
+                x->MVcount[0][mv_max+((x->partition_info->bmi[i].mv.as_mv.row
                                           - best_ref_mv->as_mv.row) >> 1)]++;
-                cpi->MVcount[1][mv_max+((x->partition_info->bmi[i].mv.as_mv.col
+                x->MVcount[1][mv_max+((x->partition_info->bmi[i].mv.as_mv.col
                                           - best_ref_mv->as_mv.col) >> 1)]++;
             }
         }
     }
     else if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
     {
-        cpi->MVcount[0][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row
+        x->MVcount[0][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row
                                           - best_ref_mv->as_mv.row) >> 1)]++;
-        cpi->MVcount[1][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col
+        x->MVcount[1][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col
                                           - best_ref_mv->as_mv.col) >> 1)]++;
     }
 }
@@ -2011,7 +2015,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 
     *returnintra = INT_MAX;
     /* Count of the number of MBs tested so far this frame */
-    cpi->mbs_tested_so_far++;
+    x->mbs_tested_so_far++;
 
     x->skip = 0;
 
@@ -2023,7 +2027,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
 
         /* Test best rd so far against threshold for trying this mode. */
-        if (best_mode.rd <= cpi->rd_threshes[mode_index])
+        if (best_mode.rd <= x->rd_threshes[mode_index])
             continue;
 
         if (this_ref_frame < 0)
@@ -2069,19 +2073,21 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
          * max If so then prevent it from being tested and increase the
          * threshold for its testing
          */
-        if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+        if (x->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
         {
-            if (cpi->mbs_tested_so_far  <= cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])
+            if (x->mbs_tested_so_far  <= cpi->mode_check_freq[mode_index] * x->mode_test_hit_counts[mode_index])
             {
                 /* Increase the threshold for coding this mode to make it
                  * less likely to be chosen
                  */
-                cpi->rd_thresh_mult[mode_index] += 4;
+                x->rd_thresh_mult[mode_index] += 4;
 
-                if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-                    cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+                if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                    x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
 
-                cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+                x->rd_threshes[mode_index] =
+                    (cpi->rd_baseline_thresh[mode_index] >> 7) *
+                    x->rd_thresh_mult[mode_index];
 
                 continue;
             }
@@ -2091,28 +2097,28 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
          * current mode so increment the counter for the number of times
          * it has been tested
          */
-        cpi->mode_test_hit_counts[mode_index] ++;
+        x->mode_test_hit_counts[mode_index] ++;
 
         /* Experimental code. Special case for gf and arf zeromv modes.
-         * Increase zbin size to supress noise
+         * Increase zbin size to suppress noise
          */
-        if (cpi->zbin_mode_boost_enabled)
+        if (x->zbin_mode_boost_enabled)
         {
             if ( this_ref_frame == INTRA_FRAME )
-                cpi->zbin_mode_boost = 0;
+                x->zbin_mode_boost = 0;
             else
             {
                 if (vp8_mode_order[mode_index] == ZEROMV)
                 {
                     if (this_ref_frame != LAST_FRAME)
-                        cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+                        x->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
                     else
-                        cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+                        x->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
                 }
                 else if (vp8_mode_order[mode_index] == SPLITMV)
-                    cpi->zbin_mode_boost = 0;
+                    x->zbin_mode_boost = 0;
                 else
-                    cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+                    x->zbin_mode_boost = MV_ZBIN_BOOST;
             }
 
             vp8_update_zbin_extra(cpi, x);
@@ -2120,7 +2126,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 
         if(!uv_intra_done && this_ref_frame == INTRA_FRAME)
         {
-            rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
+            rd_pick_intra_mbuv_mode(x, &uv_intra_rate,
                                     &uv_intra_rate_tokenonly,
                                     &uv_intra_distortion);
             uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
@@ -2146,7 +2152,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
              * coding the BPRED mode: x->mbmode_cost[x->e_mbd.frame_type][BPRED]
              */
             int distortion;
-            tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
+            tmp_rd = rd_pick_intra4x4mby_modes(x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
             rd.rate2 += rate;
             rd.distortion2 += distortion;
 
@@ -2171,8 +2177,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             int this_rd_thresh;
             int distortion;
 
-            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ? cpi->rd_threshes[THR_NEW1] : cpi->rd_threshes[THR_NEW3];
-            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ? cpi->rd_threshes[THR_NEW2] : this_rd_thresh;
+            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ?
+                x->rd_threshes[THR_NEW1] : x->rd_threshes[THR_NEW3];
+            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ?
+                x->rd_threshes[THR_NEW2] : this_rd_thresh;
 
             tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
                                                      best_mode.yrd, mdcounts,
@@ -2465,8 +2473,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             /* Testing this mode gave rise to an improvement in best error
              * score. Lower threshold a bit for next time
              */
-            cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-            cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+            x->rd_thresh_mult[mode_index] =
+                (x->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+                    x->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
         }
 
         /* If the mode did not help improve the best error case then raise
@@ -2474,13 +2483,14 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
          */
         else
         {
-            cpi->rd_thresh_mult[mode_index] += 4;
-
-            if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-                cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+            x->rd_thresh_mult[mode_index] += 4;
 
-            cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+            if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
         }
+        x->rd_threshes[mode_index] =
+            (cpi->rd_baseline_thresh[mode_index] >> 7) *
+                x->rd_thresh_mult[mode_index];
 
         if (x->skip)
             break;
@@ -2490,10 +2500,16 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     /* Reduce the activation RD thresholds for the best choice mode */
     if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
     {
-        int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-        cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-        cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+        int best_adjustment = (x->rd_thresh_mult[best_mode_index] >> 2);
+
+        x->rd_thresh_mult[best_mode_index] =
+            (x->rd_thresh_mult[best_mode_index] >=
+                (MIN_THRESHMULT + best_adjustment)) ?
+                    x->rd_thresh_mult[best_mode_index] - best_adjustment :
+                    MIN_THRESHMULT;
+        x->rd_threshes[best_mode_index] =
+            (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+                x->rd_thresh_mult[best_mode_index];
     }
 
     /* Note how often each mode chosen as best */
@@ -2595,7 +2611,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     rd_update_mvcount(cpi, x, &best_ref_mv);
 }
 
-void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_)
 {
     int error4x4, error16x16;
     int rate4x4, rate16x16 = 0, rateuv;
@@ -2607,15 +2623,13 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
 
     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
-    rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+    rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv);
     rate = rateuv;
 
-    error16x16 = rd_pick_intra16x16mby_mode(cpi, x,
-                                            &rate16x16, &rate16x16_tokenonly,
+    error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly,
                                             &dist16x16);
 
-    error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                         &rate4x4, &rate4x4_tokenonly,
+    error4x4 = rd_pick_intra4x4mby_modes(x, &rate4x4, &rate4x4_tokenonly,
                                          &dist4x4, error16x16);
 
     if (error4x4 < error16x16)
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index bbcb59f67..1e11fa77d 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -65,9 +65,9 @@ static void insertsortsad(int arr[],int idx[], int len)
     }
 }
 
-extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
+extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue);
 extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
-extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
+extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);
 
 
 static void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index ef41fa8f8..3b5268b61 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -23,7 +23,7 @@
 #ifdef ENTROPY_STATS
 _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
 #endif
-void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;
 void vp8_fix_contexts(MACROBLOCKD *x);
 
 #include "dct_value_tokens.h"
@@ -102,11 +102,12 @@ static void fill_value_tokens()
 
 static void tokenize2nd_order_b
 (
-    MACROBLOCKD *x,
+    MACROBLOCK *x,
     TOKENEXTRA **tp,
     VP8_COMP *cpi
 )
 {
+    MACROBLOCKD *xd = &x->e_mbd;
     int pt;             /* near block/prev token context index */
     int c;              /* start at DC */
     TOKENEXTRA *t = *tp;/* store tokens starting here */
@@ -117,11 +118,11 @@ static void tokenize2nd_order_b
     int band, rc, v, token;
     int eob;
 
-    b = x->block + 24;
+    b = xd->block + 24;
     qcoeff_ptr = b->qcoeff;
-    a = (ENTROPY_CONTEXT *)x->above_context + 8;
-    l = (ENTROPY_CONTEXT *)x->left_context + 8;
-    eob = x->eobs[24];
+    a = (ENTROPY_CONTEXT *)xd->above_context + 8;
+    l = (ENTROPY_CONTEXT *)xd->left_context + 8;
+    eob = xd->eobs[24];
     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
 
     if(!eob)
@@ -131,7 +132,7 @@ static void tokenize2nd_order_b
         t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
         t->skip_eob_node = 0;
 
-        ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
+        ++x->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
         t++;
         *tp = t;
         *a = *l = 0;
@@ -145,7 +146,7 @@ static void tokenize2nd_order_b
 
     t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
     t->skip_eob_node = 0;
-    ++cpi->coef_counts       [1] [0] [pt] [token];
+    ++x->coef_counts       [1] [0] [pt] [token];
     pt = vp8_prev_token_class[token];
     t++;
     c = 1;
@@ -164,7 +165,7 @@ static void tokenize2nd_order_b
 
         t->skip_eob_node = ((pt == 0));
 
-        ++cpi->coef_counts       [1] [band] [pt] [token];
+        ++x->coef_counts       [1] [band] [pt] [token];
 
         pt = vp8_prev_token_class[token];
         t++;
@@ -177,7 +178,7 @@ static void tokenize2nd_order_b
 
         t->skip_eob_node = 0;
 
-        ++cpi->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];
+        ++x->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];
 
         t++;
     }
@@ -189,12 +190,13 @@ static void tokenize2nd_order_b
 
 static void tokenize1st_order_b
 (
-    MACROBLOCKD *x,
+    MACROBLOCK *x,
     TOKENEXTRA **tp,
     int type,           /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
     VP8_COMP *cpi
 )
 {
+    MACROBLOCKD *xd = &x->e_mbd;
     unsigned int block;
     const BLOCKD *b;
     int pt;             /* near block/prev token context index */
@@ -207,15 +209,15 @@ static void tokenize1st_order_b
     int band, rc, v;
     int tmp1, tmp2;
 
-    b = x->block;
+    b = xd->block;
     /* Luma */
     for (block = 0; block < 16; block++, b++)
     {
         tmp1 = vp8_block2above[block];
         tmp2 = vp8_block2left[block];
         qcoeff_ptr = b->qcoeff;
-        a = (ENTROPY_CONTEXT *)x->above_context + tmp1;
-        l = (ENTROPY_CONTEXT *)x->left_context + tmp2;
+        a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+        l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
 
         VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
 
@@ -228,7 +230,7 @@ static void tokenize1st_order_b
             t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
             t->skip_eob_node = 0;
 
-            ++cpi->coef_counts       [type] [c] [pt] [DCT_EOB_TOKEN];
+            ++x->coef_counts       [type] [c] [pt] [DCT_EOB_TOKEN];
             t++;
             *tp = t;
             *a = *l = 0;
@@ -243,7 +245,7 @@ static void tokenize1st_order_b
 
         t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
         t->skip_eob_node = 0;
-        ++cpi->coef_counts       [type] [c] [pt] [token];
+        ++x->coef_counts       [type] [c] [pt] [token];
         pt = vp8_prev_token_class[token];
         t++;
         c++;
@@ -261,7 +263,7 @@ static void tokenize1st_order_b
             t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
 
             t->skip_eob_node = (pt == 0);
-            ++cpi->coef_counts       [type] [band] [pt] [token];
+            ++x->coef_counts       [type] [band] [pt] [token];
 
             pt = vp8_prev_token_class[token];
             t++;
@@ -273,7 +275,7 @@ static void tokenize1st_order_b
             t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
 
             t->skip_eob_node = 0;
-            ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+            ++x->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
 
             t++;
         }
@@ -287,8 +289,8 @@ static void tokenize1st_order_b
         tmp1 = vp8_block2above[block];
         tmp2 = vp8_block2left[block];
         qcoeff_ptr = b->qcoeff;
-        a = (ENTROPY_CONTEXT *)x->above_context + tmp1;
-        l = (ENTROPY_CONTEXT *)x->left_context + tmp2;
+        a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+        l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
 
         VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
 
@@ -299,7 +301,7 @@ static void tokenize1st_order_b
             t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
             t->skip_eob_node = 0;
 
-            ++cpi->coef_counts       [2] [0] [pt] [DCT_EOB_TOKEN];
+            ++x->coef_counts       [2] [0] [pt] [DCT_EOB_TOKEN];
             t++;
             *tp = t;
             *a = *l = 0;
@@ -314,7 +316,7 @@ static void tokenize1st_order_b
 
         t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
         t->skip_eob_node = 0;
-        ++cpi->coef_counts       [2] [0] [pt] [token];
+        ++x->coef_counts       [2] [0] [pt] [token];
         pt = vp8_prev_token_class[token];
         t++;
         c = 1;
@@ -333,7 +335,7 @@ static void tokenize1st_order_b
 
             t->skip_eob_node = (pt == 0);
 
-            ++cpi->coef_counts       [2] [band] [pt] [token];
+            ++x->coef_counts       [2] [band] [pt] [token];
 
             pt = vp8_prev_token_class[token];
             t++;
@@ -346,7 +348,7 @@ static void tokenize1st_order_b
 
             t->skip_eob_node = 0;
 
-            ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
+            ++x->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
 
             t++;
         }
@@ -374,16 +376,18 @@ static int mb_is_skippable(MACROBLOCKD *x, int has_y2_block)
 }
 
 
-void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 {
+    MACROBLOCKD *xd = &x->e_mbd;
     int plane_type;
     int has_y2_block;
 
-    has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED
-                    && x->mode_info_context->mbmi.mode != SPLITMV);
+    has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED
+                    && xd->mode_info_context->mbmi.mode != SPLITMV);
 
-    x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x, has_y2_block);
-    if (x->mode_info_context->mbmi.mb_skip_coeff)
+    xd->mode_info_context->mbmi.mb_skip_coeff =
+        mb_is_skippable(xd, has_y2_block);
+    if (xd->mode_info_context->mbmi.mb_skip_coeff)
     {
         if (!cpi->common.mb_no_coeff_skip)
         {
@@ -391,8 +395,8 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
         }
         else
         {
-            vp8_fix_contexts(x);
-            cpi->skip_true_count++;
+            vp8_fix_contexts(xd);
+            x->skip_true_count++;
         }
 
         return;
@@ -488,7 +492,8 @@ static void stuff2nd_order_b
     TOKENEXTRA **tp,
     ENTROPY_CONTEXT *a,
     ENTROPY_CONTEXT *l,
-    VP8_COMP *cpi
+    VP8_COMP *cpi,
+    MACROBLOCK *x
 )
 {
     int pt; /* near block/prev token context index */
@@ -498,13 +503,12 @@ static void stuff2nd_order_b
     t->Token = DCT_EOB_TOKEN;
     t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
     t->skip_eob_node = 0;
-    ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
+    ++x->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
     ++t;
 
     *tp = t;
     pt = 0;
     *a = *l = pt;
-
 }
 
 static void stuff1st_order_b
@@ -513,7 +517,8 @@ static void stuff1st_order_b
     ENTROPY_CONTEXT *a,
     ENTROPY_CONTEXT *l,
     int type,
-    VP8_COMP *cpi
+    VP8_COMP *cpi,
+    MACROBLOCK *x
 )
 {
     int pt; /* near block/prev token context index */
@@ -524,20 +529,21 @@ static void stuff1st_order_b
     t->Token = DCT_EOB_TOKEN;
     t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
     t->skip_eob_node = 0;
-    ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+    ++x->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
     ++t;
     *tp = t;
     pt = 0; /* 0 <-> all coeff data is zero */
     *a = *l = pt;
-
 }
+
 static
 void stuff1st_order_buv
 (
     TOKENEXTRA **tp,
     ENTROPY_CONTEXT *a,
     ENTROPY_CONTEXT *l,
-    VP8_COMP *cpi
+    VP8_COMP *cpi,
+    MACROBLOCK *x
 )
 {
     int pt; /* near block/prev token context index */
@@ -547,38 +553,38 @@ void stuff1st_order_buv
     t->Token = DCT_EOB_TOKEN;
     t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
     t->skip_eob_node = 0;
-    ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
+    ++x->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
     ++t;
     *tp = t;
     pt = 0; /* 0 <-> all coeff data is zero */
     *a = *l = pt;
-
 }
 
-void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 {
-    ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
-    ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
+    MACROBLOCKD *xd = &x->e_mbd;
+    ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
+    ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
     int plane_type;
     int b;
     plane_type = 3;
-    if((x->mode_info_context->mbmi.mode != B_PRED
-                        && x->mode_info_context->mbmi.mode != SPLITMV))
+    if((xd->mode_info_context->mbmi.mode != B_PRED
+                        && xd->mode_info_context->mbmi.mode != SPLITMV))
     {
         stuff2nd_order_b(t,
-                     A + vp8_block2above[24], L + vp8_block2left[24], cpi);
+                     A + vp8_block2above[24], L + vp8_block2left[24], cpi, x);
         plane_type = 0;
     }
 
     for (b = 0; b < 16; b++)
         stuff1st_order_b(t,
                          A + vp8_block2above[b],
-                         L + vp8_block2left[b], plane_type, cpi);
+                         L + vp8_block2left[b], plane_type, cpi, x);
 
     for (b = 16; b < 24; b++)
         stuff1st_order_buv(t,
                            A + vp8_block2above[b],
-                           L + vp8_block2left[b], cpi);
+                           L + vp8_block2left[b], cpi, x);
 
 }
 void vp8_fix_contexts(MACROBLOCKD *x)
diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c
index 87cccaa66..cceb8263f 100644
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -15,6 +15,7 @@
 #include "vp8_rtcd.h"
 
 #include <emmintrin.h>
+#include "vpx_ports/emmintrin_compat.h"
 
 union sum_union {
     __m128i v;
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index f5e25f3e7..0659407ad 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -20,16 +20,9 @@ ifeq ($(ARCH_ARM),yes)
   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
 endif
 
-VP8_CX_SRCS-yes += vp8_cx_iface.c
+VP8_CX_SRCS-yes += vp8cx.mk
 
-# encoder
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += algo/vpx_ref/cpu_id/include
-#INCLUDES += common
-#INCLUDES += encoder
+VP8_CX_SRCS-yes += vp8_cx_iface.c
 
 VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
 VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index b16615d1b..b030ee57e 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -9,7 +9,7 @@
 ##
 
 
-#VP8_CX_SRCS list is modified according to different platforms.
+VP8_CX_SRCS-$(ARCH_ARM)  += vp8cx_arm.mk
 
 #File list for arm
 # encoder
diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk
index b5b90d37d..8be4c7ba5 100644
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -16,6 +16,8 @@ VP8_DX_SRCS-no  += $(VP8_COMMON_SRCS-no)
 VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
 VP8_DX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no)
 
+VP8_DX_SRCS-yes += vp8dx.mk
+
 VP8_DX_SRCS-yes += vp8_dx_iface.c
 
 VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c
index f133281b6..b02f3f083 100644
--- a/vp9/common/generic/vp9_systemdependent.c
+++ b/vp9/common/generic/vp9_systemdependent.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_subpixel.h"
 #include "vp9/common/vp9_loopfilter.h"
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index bd86db8a8..a2306f0d1 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_onyxc_int.h"
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 9ca2b22e6..bfc5b82e1 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -14,7 +14,7 @@
 
 void vpx_log(const char *format, ...);
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_treecoder.h"
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 6cbc25967..4dd540e2a 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -24,7 +24,7 @@
  **************************************************************************/
 #include <assert.h>
 #include <math.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
@@ -33,60 +33,6 @@ static const int cospi8sqrt2minus1 = 20091;
 static const int sinpi8sqrt2      = 35468;
 static const int rounding = 0;
 
-// TODO: these transforms can be further converted into integer forms
-//       for complexity optimization
-static const float idct_4[16] = {
-  0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
-  0.500000000000000,   0.270598050073099,  -0.500000000000000,  -0.653281482438188,
-  0.500000000000000,  -0.270598050073099,  -0.500000000000000,   0.653281482438188,
-  0.500000000000000,  -0.653281482438188,   0.500000000000000,  -0.270598050073099
-};
-
-static const float iadst_4[16] = {
-  0.228013428883779,   0.577350269189626,   0.656538502008139,   0.428525073124360,
-  0.428525073124360,   0.577350269189626,  -0.228013428883779,  -0.656538502008139,
-  0.577350269189626,                   0,  -0.577350269189626,   0.577350269189626,
-  0.656538502008139,  -0.577350269189626,   0.428525073124359,  -0.228013428883779
-};
-
-static const float idct_8[64] = {
-  0.353553390593274,   0.490392640201615,   0.461939766255643,   0.415734806151273,
-  0.353553390593274,   0.277785116509801,   0.191341716182545,   0.097545161008064,
-  0.353553390593274,   0.415734806151273,   0.191341716182545,  -0.097545161008064,
- -0.353553390593274,  -0.490392640201615,  -0.461939766255643,  -0.277785116509801,
-  0.353553390593274,   0.277785116509801,  -0.191341716182545,  -0.490392640201615,
- -0.353553390593274,   0.097545161008064,   0.461939766255643,   0.415734806151273,
-  0.353553390593274,   0.097545161008064,  -0.461939766255643,  -0.277785116509801,
-  0.353553390593274,   0.415734806151273,  -0.191341716182545,  -0.490392640201615,
-  0.353553390593274,  -0.097545161008064,  -0.461939766255643,   0.277785116509801,
-  0.353553390593274,  -0.415734806151273,  -0.191341716182545,   0.490392640201615,
-  0.353553390593274,  -0.277785116509801,  -0.191341716182545,   0.490392640201615,
- -0.353553390593274,  -0.097545161008064,   0.461939766255643,  -0.415734806151273,
-  0.353553390593274,  -0.415734806151273,   0.191341716182545,   0.097545161008064,
- -0.353553390593274,   0.490392640201615,  -0.461939766255643,   0.277785116509801,
-  0.353553390593274,  -0.490392640201615,   0.461939766255643,  -0.415734806151273,
-  0.353553390593274,  -0.277785116509801,   0.191341716182545,  -0.097545161008064
-};
-
-static const float iadst_8[64] = {
-  0.089131608307533,   0.255357107325376,   0.387095214016349,   0.466553967085785,
-  0.483002021635509,   0.434217976756762,   0.326790388032145,   0.175227946595735,
-  0.175227946595735,   0.434217976756762,   0.466553967085785,   0.255357107325376,
- -0.089131608307533,  -0.387095214016348,  -0.483002021635509,  -0.326790388032145,
-  0.255357107325376,   0.483002021635509,   0.175227946595735,  -0.326790388032145,
- -0.466553967085785,  -0.089131608307533,   0.387095214016349,   0.434217976756762,
-  0.326790388032145,   0.387095214016349,  -0.255357107325376,  -0.434217976756762,
-  0.175227946595735,   0.466553967085786,  -0.089131608307534,  -0.483002021635509,
-  0.387095214016349,   0.175227946595735,  -0.483002021635509,   0.089131608307533,
-  0.434217976756762,  -0.326790388032145,  -0.255357107325377,   0.466553967085785,
-  0.434217976756762,  -0.089131608307533,  -0.326790388032145,   0.483002021635509,
- -0.255357107325376,  -0.175227946595735,   0.466553967085785,  -0.387095214016348,
-  0.466553967085785,  -0.326790388032145,   0.089131608307533,   0.175227946595735,
- -0.387095214016348,   0.483002021635509,  -0.434217976756762,   0.255357107325376,
-  0.483002021635509,  -0.466553967085785,   0.434217976756762,  -0.387095214016348,
-  0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
-};
-
 static const int16_t idct_i4[16] = {
   8192,  10703,  8192,   4433,
   8192,   4433, -8192, -10703,
@@ -139,75 +85,7 @@ static const int16_t iadst_i8[64] = {
    5354, -4184,  2871, -1460
 };
 
-static float idct_16[256] = {
-  0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
-  0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
-  0.250000,  0.338330,  0.293969,  0.224292,  0.135299,  0.034654, -0.068975, -0.166664,
- -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
-  0.250000,  0.311806,  0.196424,  0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
- -0.250000, -0.102631,  0.068975,  0.224292,  0.326641,  0.351851,  0.293969,  0.166664,
-  0.250000,  0.273300,  0.068975, -0.166664, -0.326641, -0.338330, -0.196424,  0.034654,
-  0.250000,  0.351851,  0.293969,  0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
-  0.250000,  0.224292, -0.068975, -0.311806, -0.326641, -0.102631,  0.196424,  0.351851,
-  0.250000, -0.034654, -0.293969, -0.338330, -0.135299,  0.166664,  0.346760,  0.273300,
-  0.250000,  0.166664, -0.196424, -0.351851, -0.135299,  0.224292,  0.346760,  0.102631,
- -0.250000, -0.338330, -0.068975,  0.273300,  0.326641,  0.034654, -0.293969, -0.311806,
-  0.250000,  0.102631, -0.293969, -0.273300,  0.135299,  0.351851,  0.068975, -0.311806,
- -0.250000,  0.166664,  0.346760,  0.034654, -0.326641, -0.224292,  0.196424,  0.338330,
-  0.250000,  0.034654, -0.346760, -0.102631,  0.326641,  0.166664, -0.293969, -0.224292,
-  0.250000,  0.273300, -0.196424, -0.311806,  0.135299,  0.338330, -0.068975, -0.351851,
-  0.250000, -0.034654, -0.346760,  0.102631,  0.326641, -0.166664, -0.293969,  0.224292,
-  0.250000, -0.273300, -0.196424,  0.311806,  0.135299, -0.338330, -0.068975,  0.351851,
-  0.250000, -0.102631, -0.293969,  0.273300,  0.135299, -0.351851,  0.068975,  0.311806,
- -0.250000, -0.166664,  0.346760, -0.034654, -0.326641,  0.224292,  0.196424, -0.338330,
-  0.250000, -0.166664, -0.196424,  0.351851, -0.135299, -0.224292,  0.346760, -0.102631,
- -0.250000,  0.338330, -0.068975, -0.273300,  0.326641, -0.034654, -0.293969,  0.311806,
-  0.250000, -0.224292, -0.068975,  0.311806, -0.326641,  0.102631,  0.196424, -0.351851,
-  0.250000,  0.034654, -0.293969,  0.338330, -0.135299, -0.166664,  0.346760, -0.273300,
-  0.250000, -0.273300,  0.068975,  0.166664, -0.326641,  0.338330, -0.196424, -0.034654,
-  0.250000, -0.351851,  0.293969, -0.102631, -0.135299,  0.311806, -0.346760,  0.224292,
-  0.250000, -0.311806,  0.196424, -0.034654, -0.135299,  0.273300, -0.346760,  0.338330,
- -0.250000,  0.102631,  0.068975, -0.224292,  0.326641, -0.351851,  0.293969, -0.166664,
-  0.250000, -0.338330,  0.293969, -0.224292,  0.135299, -0.034654, -0.068975,  0.166664,
- -0.250000,  0.311806, -0.346760,  0.351851, -0.326641,  0.273300, -0.196424,  0.102631,
-  0.250000, -0.351851,  0.346760, -0.338330,  0.326641, -0.311806,  0.293969, -0.273300,
-  0.250000, -0.224292,  0.196424, -0.166664,  0.135299, -0.102631,  0.068975, -0.034654
-};
 
-static float iadst_16[256] = {
-  0.033094,  0.098087,  0.159534,  0.215215,  0.263118,  0.301511,  0.329007,  0.344612,
-  0.347761,  0.338341,  0.316693,  0.283599,  0.240255,  0.188227,  0.129396,  0.065889,
-  0.065889,  0.188227,  0.283599,  0.338341,  0.344612,  0.301511,  0.215215,  0.098087,
- -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
-  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,  0.000000, -0.188227, -0.316693,
- -0.344612, -0.263118, -0.098087,  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,
-  0.129396,  0.316693,  0.329007,  0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
-  0.065889,  0.283599,  0.344612,  0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
-  0.159534,  0.344612,  0.240255, -0.065889, -0.316693, -0.301511, -0.033094,  0.263118,
-  0.338341,  0.129396, -0.188227, -0.347761, -0.215215,  0.098087,  0.329007,  0.283599,
-  0.188227,  0.344612,  0.098087, -0.263118, -0.316693, -0.000000,  0.316693,  0.263118,
- -0.098087, -0.344612, -0.188227,  0.188227,  0.344612,  0.098087, -0.263118, -0.316693,
-  0.215215,  0.316693, -0.065889, -0.347761, -0.098087,  0.301511,  0.240255, -0.188227,
- -0.329007,  0.033094,  0.344612,  0.129396, -0.283599, -0.263118,  0.159534,  0.338341,
-  0.240255,  0.263118, -0.215215, -0.283599,  0.188227,  0.301511, -0.159534, -0.316693,
-  0.129396,  0.329007, -0.098087, -0.338341,  0.065889,  0.344612, -0.033094, -0.347761,
-  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,  0.000000, -0.344612,  0.098087,
-  0.316693, -0.188227, -0.263118,  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,
-  0.283599,  0.098087, -0.347761,  0.129396,  0.263118, -0.301511, -0.065889,  0.344612,
- -0.159534, -0.240255,  0.316693,  0.033094, -0.338341,  0.188227,  0.215215, -0.329007,
-  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000,
- -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,
-  0.316693, -0.098087, -0.188227,  0.344612, -0.263118, -0.000000,  0.263118, -0.344612,
-  0.188227,  0.098087, -0.316693,  0.316693, -0.098087, -0.188227,  0.344612, -0.263118,
-  0.329007, -0.188227, -0.033094,  0.240255, -0.344612,  0.301511, -0.129396, -0.098087,
-  0.283599, -0.347761,  0.263118, -0.065889, -0.159534,  0.316693, -0.338341,  0.215215,
-  0.338341, -0.263118,  0.129396,  0.033094, -0.188227,  0.301511, -0.347761,  0.316693,
- -0.215215,  0.065889,  0.098087, -0.240255,  0.329007, -0.344612,  0.283599, -0.159534,
-  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,  0.000000, -0.098087,  0.188227,
- -0.263118,  0.316693, -0.344612,  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,
-  0.347761, -0.344612,  0.338341, -0.329007,  0.316693, -0.301511,  0.283599, -0.263118,
-  0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
-};
 
 static const int16_t idct_i16[256] = {
    4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
@@ -279,124 +157,6 @@ static const int16_t iadst_i16[256] = {
    3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
 };
 
-void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
-                  TX_TYPE tx_type, int tx_dim) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int i, j, k;
-    float bufa[256], bufb[256];  // buffers are for floating-point test purpose
-                                 // the implementation could be simplified in
-                                 // conjunction with integer transform
-    const int16_t *ip = input;
-    int16_t *op = output;
-    int shortpitch = pitch >> 1;
-
-    float *pfa = &bufa[0];
-    float *pfb = &bufb[0];
-
-    // pointers to vertical and horizontal transforms
-    const float *ptv, *pth;
-
-    assert(tx_type != DCT_DCT);
-    // load and convert residual array into floating-point
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfa[i] = (float)ip[i];
-      }
-      pfa += tx_dim;
-      ip  += tx_dim;
-    }
-
-    // vertical transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case ADST_DCT  :
-        ptv = (tx_dim == 4) ? &iadst_4[0] :
-                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-        break;
-
-      default :
-        ptv = (tx_dim == 4) ? &idct_4[0] :
-                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-        break;
-    }
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfb[i] = 0 ;
-        for(k = 0; k < tx_dim; k++) {
-          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
-        }
-        pfa += 1;
-      }
-
-      pfb += tx_dim;
-      ptv += tx_dim;
-      pfa = &bufa[0];
-    }
-
-    // horizontal transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = (tx_dim == 4) ? &iadst_4[0] :
-                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-        break;
-
-      default :
-        pth = (tx_dim == 4) ? &idct_4[0] :
-                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-        break;
-    }
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfa[i] = 0;
-        for(k = 0; k < tx_dim; k++) {
-          pfa[i] += pfb[k] * pth[k];
-        }
-        pth += tx_dim;
-       }
-
-      pfa += tx_dim;
-      pfb += tx_dim;
-
-      switch(tx_type) {
-        case ADST_ADST :
-        case  DCT_ADST :
-          pth = (tx_dim == 4) ? &iadst_4[0] :
-                                ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-          break;
-
-        default :
-          pth = (tx_dim == 4) ? &idct_4[0] :
-                                ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-          break;
-      }
-    }
-
-    // convert to short integer format and load BLOCKD buffer
-    op  = output;
-    pfa = &bufa[0];
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) :
-                               -(int16_t)( - pfa[i] / 8 + 0.49);
-      }
-
-      op += shortpitch;
-      pfa += tx_dim;
-    }
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
 
 /* Converted the transforms to integer form. */
 #define HORIZONTAL_SHIFT 14  // 16
@@ -404,8 +164,9 @@ void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
 #define VERTICAL_SHIFT 17  // 15
 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
 void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
-                      TX_TYPE tx_type, int tx_dim) {
+                      TX_TYPE tx_type, int tx_dim, uint16_t eobs) {
   int i, j, k;
+  int nz_dim;
   int16_t imbuf[256];
 
   const int16_t *ip = input;
@@ -444,6 +205,19 @@ void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
       break;
   }
 
+  /* Low eob on 8x8/16x16: zero im and bound the inner loops by nz_dim. */
+  nz_dim = tx_dim;
+  if (tx_dim > 4 && eobs < 36) {
+    vpx_memset(im, 0, sizeof(imbuf));
+    if (eobs < 3) {
+      nz_dim = 2;
+    } else if (eobs < 10) {
+      nz_dim = 4;
+    } else {
+      nz_dim = 8;
+    }
+  }
+
   /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps
    * from right to left:
    * 1. horizontal transform: Y= Z*Transposed_M2
@@ -453,10 +227,10 @@ void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
    */
   /* Horizontal transformation */
   for (j = 0; j < tx_dim; j++) {
-    for (i = 0; i < tx_dim; i++) {
+    for (i = 0; i < nz_dim; i++) {
       int temp = 0;
 
-      for (k = 0; k < tx_dim; k++) {
+      for (k = 0; k < nz_dim; k++) {
         temp += ip[k] * pth[k];
       }
 
@@ -476,7 +250,7 @@ void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
     for (j = 0; j < tx_dim; j++) {
       int temp = 0;
 
-      for (k = 0; k < tx_dim; k++) {
+      for (k = 0; k < nz_dim; k++) {
         temp += ptv[k] * im[k];
       }
 
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index bb992a138..60deb5260 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -52,7 +52,7 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
     TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
       vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32,
-                   tx_type, 4);
+                   tx_type, 4, xd->block[i].eob);
     } else {
       vp9_inverse_transform_b_4x4(xd, i, 32);
     }
@@ -91,7 +91,8 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
   for (i = 0; i < 9; i += 8) {
     TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8);
+      vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8,
+                 xd->block[i].eob);
     } else {
       vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
                                   &blockd[i].diff[0], 32);
@@ -100,7 +101,8 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
   for (i = 2; i < 11; i += 8) {
     TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8);
+      vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8,
+                 xd->block[i + 2].eob);
     } else {
       vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0],
                                   &blockd[i].diff[0], 32);
@@ -132,7 +134,7 @@ void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {
   BLOCKD *bd = &xd->block[0];
   TX_TYPE tx_type = get_tx_type_16x16(xd, bd);
   if (tx_type != DCT_DCT) {
-    vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16);
+    vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16, bd->eob);
   } else {
     vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],
                                   &xd->block[0].diff[0], 32);
diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h
index 586a3dc4b..3cfb45fed 100644
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -11,7 +11,7 @@
 #ifndef VP9_COMMON_VP9_INVTRANS_H_
 #define VP9_COMMON_VP9_INVTRANS_H_
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
diff --git a/vp9/common/vp9_mbpitch.c b/vp9/common/vp9_mbpitch.c
index 31162655d..e94144813 100644
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@ -39,7 +39,7 @@ static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
   int block;
 
   uint8_t **y, **u, **v;
-  uint8_t **y2, **u2, **v2;
+  uint8_t **y2 = NULL, **u2 = NULL, **v2 = NULL;
   BLOCKD *blockd = xd->block;
   int stride;
 
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 80a952d84..62c381eb9 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_postproc.h"
 #include "vp9/common/vp9_textblit.h"
@@ -32,7 +32,7 @@
     (0.071*(float)(t & 0xff)) + 128)
 
 /* global constants */
-#if CONFIG_POSTPROC_VISUALIZER
+#if 0 && CONFIG_POSTPROC_VISUALIZER
 static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
   { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
   { RGB_TO_YUV(0x00FF00) },   /* Green */
@@ -672,7 +672,7 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
                         oci->post_proc_buffer.y_stride);
   }
 
-#if CONFIG_POSTPROC_VISUALIZER
+#if 0 && CONFIG_POSTPROC_VISUALIZER
   if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
     char message[512];
     sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
index 7673c3727..00b537565 100644
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index c1d4a29c7..8a1b93df0 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_reconinter.h"
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 3fec98a01..0f58f9862 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -9,7 +9,7 @@
  */
 
 #include <stdio.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vpx_mem/vpx_mem.h"
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
index c41b55bca..d170e43ea 100644
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9_rtcd.h"
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index f02ee0260..95253ef67 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -42,7 +42,7 @@ fi
 # Dequant
 #
 prototype void vp9_dequantize_b "struct blockd *x"
-specialize vp9_dequantize_b mmx
+specialize vp9_dequantize_b
 
 prototype void vp9_dequantize_b_2x2 "struct blockd *x"
 specialize vp9_dequantize_b_2x2
@@ -69,13 +69,13 @@ prototype void vp9_dequant_dc_idct_add "int16_t *input, const int16_t *dq, uint8
 specialize vp9_dequant_dc_idct_add
 
 prototype void vp9_dequant_dc_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dcs"
-specialize vp9_dequant_dc_idct_add_y_block mmx
+specialize vp9_dequant_dc_idct_add_y_block
 
 prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs"
-specialize vp9_dequant_idct_add_y_block mmx
+specialize vp9_dequant_idct_add_y_block
 
 prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
-specialize vp9_dequant_idct_add_uv_block mmx
+specialize vp9_dequant_idct_add_uv_block
 
 #
 # RECON
@@ -218,6 +218,7 @@ vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
 #
 # post proc
 #
+if [ "$CONFIG_POSTPROC" = "yes" ]; then
 prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit"
 specialize vp9_mbpost_proc_down mmx sse2
 vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm
@@ -233,6 +234,7 @@ vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
 prototype void vp9_plane_add_noise "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
 specialize vp9_plane_add_noise mmx sse2
 vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt
+fi
 
 prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
 specialize vp9_blend_mb_inner
@@ -343,10 +345,10 @@ specialize vp9_bilinear_predict_avg4x4
 # dct
 #
 prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4llm_1 mmx
+specialize vp9_short_idct4x4llm_1
 
 prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4llm mmx
+specialize vp9_short_idct4x4llm
 
 prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct8x8
@@ -366,7 +368,7 @@ specialize vp9_short_idct10_16x16
 prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct32x32
 
-prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim"
+prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
 specialize vp9_ihtllm
 
 #
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index d57a42df3..1b9147ef4 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -15,7 +15,7 @@
 #include <math.h>
 #endif
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #if ARCH_X86 || ARCH_X86_64
 void vpx_reset_mmx_state(void);
 #define vp9_clear_system_state() vpx_reset_mmx_state()
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 0d268a264..f09e2d78b 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -9,11 +9,11 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_subpixel.h"
 
-extern const short vp9_six_tap_mmx[16][6 * 8];
+extern const short vp9_six_tap_mmx[8][6 * 8];
 
 extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
                                       unsigned short  *output_ptr,
diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c
index 09f8de384..8e02ac197 100644
--- a/vp9/common/x86/vp9_filter_sse2.c
+++ b/vp9/common/x86/vp9_filter_sse2.c
@@ -11,6 +11,7 @@
 #include <assert.h> // for alignment checks
 #include <emmintrin.h> // SSE2
 #include "vp9/common/vp9_filter.h"
+#include "vpx_ports/emmintrin_compat.h"
 #include "vpx_ports/mem.h" // for DECLARE_ALIGNED
 #include "vp9_rtcd.h"
 
diff --git a/vp9/common/x86/vp9_idctllm_sse2.asm b/vp9/common/x86/vp9_idctllm_sse2.asm
index daa572e01..8f3c6dfc3 100644
--- a/vp9/common/x86/vp9_idctllm_sse2.asm
+++ b/vp9/common/x86/vp9_idctllm_sse2.asm
@@ -21,7 +21,7 @@
 ;   int blk_stride      - 5
 ; )
 
-global sym(vp9_idct_dequant_0_2x_sse2)
+global sym(vp9_idct_dequant_0_2x_sse2) PRIVATE
 sym(vp9_idct_dequant_0_2x_sse2):
     push        rbp
     mov         rbp, rsp
@@ -97,7 +97,7 @@ sym(vp9_idct_dequant_0_2x_sse2):
     pop         rbp
     ret
 
-global sym(vp9_idct_dequant_full_2x_sse2)
+global sym(vp9_idct_dequant_full_2x_sse2) PRIVATE
 sym(vp9_idct_dequant_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
@@ -362,7 +362,7 @@ sym(vp9_idct_dequant_full_2x_sse2):
 ;   int dst_stride      - 4
 ;   short *dc           - 5
 ; )
-global sym(vp9_idct_dequant_dc_0_2x_sse2)
+global sym(vp9_idct_dequant_dc_0_2x_sse2) PRIVATE
 sym(vp9_idct_dequant_dc_0_2x_sse2):
     push        rbp
     mov         rbp, rsp
@@ -438,7 +438,7 @@ sym(vp9_idct_dequant_dc_0_2x_sse2):
     pop         rbp
     ret
 
-global sym(vp9_idct_dequant_dc_full_2x_sse2)
+global sym(vp9_idct_dequant_dc_full_2x_sse2) PRIVATE
 sym(vp9_idct_dequant_dc_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_iwalsh_mmx.asm b/vp9/common/x86/vp9_iwalsh_mmx.asm
index 6b276b95a..1af252168 100644
--- a/vp9/common/x86/vp9_iwalsh_mmx.asm
+++ b/vp9/common/x86/vp9_iwalsh_mmx.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_1_mmx)
+global sym(vp9_short_inv_walsh4x4_1_mmx) PRIVATE
 sym(vp9_short_inv_walsh4x4_1_mmx):
     push        rbp
     mov         rbp, rsp
@@ -48,7 +48,7 @@ sym(vp9_short_inv_walsh4x4_1_mmx):
     ret
 
 ;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_mmx)
+global sym(vp9_short_inv_walsh4x4_mmx) PRIVATE
 sym(vp9_short_inv_walsh4x4_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_iwalsh_sse2.asm b/vp9/common/x86/vp9_iwalsh_sse2.asm
index 143cce87d..84fa2fe2a 100644
--- a/vp9/common/x86/vp9_iwalsh_sse2.asm
+++ b/vp9/common/x86/vp9_iwalsh_sse2.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_sse2)
+global sym(vp9_short_inv_walsh4x4_sse2) PRIVATE
 sym(vp9_short_inv_walsh4x4_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm
index ac3f74eda..ceffdf558 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -21,7 +21,7 @@
 ;    const char *thresh,
 ;    int  count
 ;)
-global sym(vp9_loop_filter_horizontal_edge_mmx)
+global sym(vp9_loop_filter_horizontal_edge_mmx) PRIVATE
 sym(vp9_loop_filter_horizontal_edge_mmx):
     push        rbp
     mov         rbp, rsp
@@ -233,7 +233,7 @@ sym(vp9_loop_filter_horizontal_edge_mmx):
 ;    const char *thresh,
 ;    int count
 ;)
-global sym(vp9_loop_filter_vertical_edge_mmx)
+global sym(vp9_loop_filter_vertical_edge_mmx) PRIVATE
 sym(vp9_loop_filter_vertical_edge_mmx):
     push        rbp
     mov         rbp, rsp
@@ -600,7 +600,7 @@ sym(vp9_loop_filter_vertical_edge_mmx):
 ;    int  src_pixel_step,
 ;    const char *blimit
 ;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx)
+global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
 sym(vp9_loop_filter_simple_horizontal_edge_mmx):
     push        rbp
     mov         rbp, rsp
@@ -716,7 +716,7 @@ sym(vp9_loop_filter_simple_horizontal_edge_mmx):
 ;    int  src_pixel_step,
 ;    const char *blimit
 ;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx)
+global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
 sym(vp9_loop_filter_simple_vertical_edge_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_loopfilter_sse2.asm b/vp9/common/x86/vp9_loopfilter_sse2.asm
index 9c0c4b000..ae4c60f53 100644
--- a/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ b/vp9/common/x86/vp9_loopfilter_sse2.asm
@@ -281,7 +281,7 @@
 ;    const char    *thresh,
 ;    int            count
 ;)
-global sym(vp9_loop_filter_horizontal_edge_sse2)
+global sym(vp9_loop_filter_horizontal_edge_sse2) PRIVATE
 sym(vp9_loop_filter_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -331,7 +331,7 @@ sym(vp9_loop_filter_horizontal_edge_sse2):
 ;    const char    *thresh,
 ;    int            count
 ;)
-global sym(vp9_loop_filter_horizontal_edge_uv_sse2)
+global sym(vp9_loop_filter_horizontal_edge_uv_sse2) PRIVATE
 sym(vp9_loop_filter_horizontal_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -719,7 +719,7 @@ sym(vp9_loop_filter_horizontal_edge_uv_sse2):
 ;    const char    *thresh,
 ;    int            count
 ;)
-global sym(vp9_loop_filter_vertical_edge_sse2)
+global sym(vp9_loop_filter_vertical_edge_sse2) PRIVATE
 sym(vp9_loop_filter_vertical_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -786,7 +786,7 @@ sym(vp9_loop_filter_vertical_edge_sse2):
 ;    const char    *thresh,
 ;    unsigned char *v
 ;)
-global sym(vp9_loop_filter_vertical_edge_uv_sse2)
+global sym(vp9_loop_filter_vertical_edge_uv_sse2) PRIVATE
 sym(vp9_loop_filter_vertical_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -851,7 +851,7 @@ sym(vp9_loop_filter_vertical_edge_uv_sse2):
 ;    int  src_pixel_step,
 ;    const char *blimit,
 ;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2)
+global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE
 sym(vp9_loop_filter_simple_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -960,7 +960,7 @@ sym(vp9_loop_filter_simple_horizontal_edge_sse2):
 ;    int  src_pixel_step,
 ;    const char *blimit,
 ;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2)
+global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE
 sym(vp9_loop_filter_simple_vertical_edge_sse2):
     push        rbp         ; save old base pointer value.
     mov         rbp, rsp    ; set new base pointer value.
diff --git a/vp9/common/x86/vp9_loopfilter_x86.c b/vp9/common/x86/vp9_loopfilter_x86.c
index 54e6b9d74..61b1c77da 100644
--- a/vp9/common/x86/vp9_loopfilter_x86.c
+++ b/vp9/common/x86/vp9_loopfilter_x86.c
@@ -11,6 +11,7 @@
 #include <emmintrin.h>  // SSE2
 #include "vpx_config.h"
 #include "vp9/common/vp9_loopfilter.h"
+#include "vpx_ports/emmintrin_compat.h"
 
 prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
 prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
diff --git a/vp9/common/x86/vp9_mask_sse3.asm b/vp9/common/x86/vp9_mask_sse3.asm
index 0d90cfa86..fe46823d0 100644
--- a/vp9/common/x86/vp9_mask_sse3.asm
+++ b/vp9/common/x86/vp9_mask_sse3.asm
@@ -25,7 +25,7 @@
 ;    int yt,
 ;    int ut,
 ;    int vt)
-global sym(vp8_makemask_sse3)
+global sym(vp8_makemask_sse3) PRIVATE
 sym(vp8_makemask_sse3):
     push        rbp
     mov         rbp, rsp
@@ -181,7 +181,7 @@ NextPairOfRows:
 ;void int vp8_growmaskmb_sse3(
 ;    unsigned char *om,
 ;    unsigned char *nm,
-global sym(vp8_growmaskmb_sse3)
+global sym(vp8_growmaskmb_sse3) PRIVATE
 sym(vp8_growmaskmb_sse3):
     push        rbp
     mov         rbp, rsp
@@ -234,7 +234,7 @@ sym(vp8_growmaskmb_sse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned char *mask)
-global sym(vp8_sad16x16_masked_wmt)
+global sym(vp8_sad16x16_masked_wmt) PRIVATE
 sym(vp8_sad16x16_masked_wmt):
     push        rbp
     mov         rbp, rsp
@@ -288,7 +288,7 @@ NextSadRow:
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned char *mask)
-global sym(vp8_sad16x16_unmasked_wmt)
+global sym(vp8_sad16x16_unmasked_wmt) PRIVATE
 sym(vp8_sad16x16_unmasked_wmt):
     push        rbp
     mov         rbp, rsp
@@ -343,7 +343,7 @@ next_vp8_sad16x16_unmasked_wmt:
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    unsigned char *mask)
-global sym(vp8_masked_predictor_wmt)
+global sym(vp8_masked_predictor_wmt) PRIVATE
 sym(vp8_masked_predictor_wmt):
     push        rbp
     mov         rbp, rsp
@@ -395,7 +395,7 @@ next_vp8_masked_predictor_wmt:
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    unsigned char *mask)
-global sym(vp8_masked_predictor_uv_wmt)
+global sym(vp8_masked_predictor_uv_wmt) PRIVATE
 sym(vp8_masked_predictor_uv_wmt):
     push        rbp
     mov         rbp, rsp
@@ -444,7 +444,7 @@ next_vp8_masked_predictor_uv_wmt:
 ;unsigned int vp8_uv_from_y_mask(
 ;    unsigned char *ymask,
 ;    unsigned char *uvmask)
-global sym(vp8_uv_from_y_mask)
+global sym(vp8_uv_from_y_mask) PRIVATE
 sym(vp8_uv_from_y_mask):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_postproc_mmx.asm b/vp9/common/x86/vp9_postproc_mmx.asm
index fa2152bab..5f06f0ea0 100644
--- a/vp9/common/x86/vp9_postproc_mmx.asm
+++ b/vp9/common/x86/vp9_postproc_mmx.asm
@@ -24,7 +24,7 @@
 ;    int cols,
 ;    int flimit
 ;)
-global sym(vp9_post_proc_down_and_across_mmx)
+global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
 sym(vp9_post_proc_down_and_across_mmx):
     push        rbp
     mov         rbp, rsp
@@ -265,7 +265,7 @@ sym(vp9_post_proc_down_and_across_mmx):
 ;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
 ;                             int pitch, int rows, int cols,int flimit)
 extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx)
+global sym(vp9_mbpost_proc_down_mmx) PRIVATE
 sym(vp9_mbpost_proc_down_mmx):
     push        rbp
     mov         rbp, rsp
@@ -465,7 +465,7 @@ sym(vp9_mbpost_proc_down_mmx):
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int Width, unsigned int Height, int Pitch)
 extern sym(rand)
-global sym(vp9_plane_add_noise_mmx)
+global sym(vp9_plane_add_noise_mmx) PRIVATE
 sym(vp9_plane_add_noise_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm
index 91758e62d..8bbb3794b 100644
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@@ -21,7 +21,7 @@
 ;    int cols,
 ;    int flimit
 ;)
-global sym(vp9_post_proc_down_and_across_xmm)
+global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
 sym(vp9_post_proc_down_and_across_xmm):
     push        rbp
     mov         rbp, rsp
@@ -251,7 +251,7 @@ sym(vp9_post_proc_down_and_across_xmm):
 ;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
 ;                            int pitch, int rows, int cols,int flimit)
 extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_xmm)
+global sym(vp9_mbpost_proc_down_xmm) PRIVATE
 sym(vp9_mbpost_proc_down_xmm):
     push        rbp
     mov         rbp, rsp
@@ -451,7 +451,7 @@ sym(vp9_mbpost_proc_down_xmm):
 
 ;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
 ;                                int pitch, int rows, int cols,int flimit)
-global sym(vp9_mbpost_proc_across_ip_xmm)
+global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
 sym(vp9_mbpost_proc_across_ip_xmm):
     push        rbp
     mov         rbp, rsp
@@ -630,7 +630,7 @@ sym(vp9_mbpost_proc_across_ip_xmm):
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int Width, unsigned int Height, int Pitch)
 extern sym(rand)
-global sym(vp9_plane_add_noise_wmt)
+global sym(vp9_plane_add_noise_wmt) PRIVATE
 sym(vp9_plane_add_noise_wmt):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_recon_mmx.asm b/vp9/common/x86/vp9_recon_mmx.asm
index 20f582dba..fc03d3f5b 100644
--- a/vp9/common/x86/vp9_recon_mmx.asm
+++ b/vp9/common/x86/vp9_recon_mmx.asm
@@ -11,7 +11,7 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 ;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon_b_mmx)
+global sym(vp9_recon_b_mmx) PRIVATE
 sym(vp9_recon_b_mmx):
     push        rbp
     mov         rbp, rsp
@@ -65,7 +65,7 @@ sym(vp9_recon_b_mmx):
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp9_copy_mem8x8_mmx)
+global sym(vp9_copy_mem8x8_mmx) PRIVATE
 sym(vp9_copy_mem8x8_mmx):
     push        rbp
     mov         rbp, rsp
@@ -128,7 +128,7 @@ sym(vp9_copy_mem8x8_mmx):
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp9_copy_mem8x4_mmx)
+global sym(vp9_copy_mem8x4_mmx) PRIVATE
 sym(vp9_copy_mem8x4_mmx):
     push        rbp
     mov         rbp, rsp
@@ -172,7 +172,7 @@ sym(vp9_copy_mem8x4_mmx):
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp9_copy_mem16x16_mmx)
+global sym(vp9_copy_mem16x16_mmx) PRIVATE
 sym(vp9_copy_mem16x16_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_recon_sse2.asm b/vp9/common/x86/vp9_recon_sse2.asm
index c7cd23fc7..154442dc8 100644
--- a/vp9/common/x86/vp9_recon_sse2.asm
+++ b/vp9/common/x86/vp9_recon_sse2.asm
@@ -11,7 +11,7 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 ;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon2b_sse2)
+global sym(vp9_recon2b_sse2) PRIVATE
 sym(vp9_recon2b_sse2):
     push        rbp
     mov         rbp, rsp
@@ -62,7 +62,7 @@ sym(vp9_recon2b_sse2):
 
 
 ;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon4b_sse2)
+global sym(vp9_recon4b_sse2) PRIVATE
 sym(vp9_recon4b_sse2):
     push        rbp
     mov         rbp, rsp
@@ -132,7 +132,7 @@ sym(vp9_recon4b_sse2):
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp9_copy_mem16x16_sse2)
+global sym(vp9_copy_mem16x16_sse2) PRIVATE
 sym(vp9_copy_mem16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -237,7 +237,7 @@ sym(vp9_copy_mem16x16_sse2):
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_dc_mmx2)
+global sym(vp9_intra_pred_uv_dc_mmx2) PRIVATE
 sym(vp9_intra_pred_uv_dc_mmx2):
     push        rbp
     mov         rbp, rsp
@@ -310,7 +310,7 @@ sym(vp9_intra_pred_uv_dc_mmx2):
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_dctop_mmx2)
+global sym(vp9_intra_pred_uv_dctop_mmx2) PRIVATE
 sym(vp9_intra_pred_uv_dctop_mmx2):
     push        rbp
     mov         rbp, rsp
@@ -363,7 +363,7 @@ sym(vp9_intra_pred_uv_dctop_mmx2):
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_dcleft_mmx2)
+global sym(vp9_intra_pred_uv_dcleft_mmx2) PRIVATE
 sym(vp9_intra_pred_uv_dcleft_mmx2):
     push        rbp
     mov         rbp, rsp
@@ -428,7 +428,7 @@ sym(vp9_intra_pred_uv_dcleft_mmx2):
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_dc128_mmx)
+global sym(vp9_intra_pred_uv_dc128_mmx) PRIVATE
 sym(vp9_intra_pred_uv_dc128_mmx):
     push        rbp
     mov         rbp, rsp
@@ -465,7 +465,7 @@ sym(vp9_intra_pred_uv_dc128_mmx):
 ;    int src_stride,
 ;    )
 %macro vp9_intra_pred_uv_tm 1
-global sym(vp9_intra_pred_uv_tm_%1)
+global sym(vp9_intra_pred_uv_tm_%1) PRIVATE
 sym(vp9_intra_pred_uv_tm_%1):
     push        rbp
     mov         rbp, rsp
@@ -545,7 +545,7 @@ vp9_intra_pred_uv_tm ssse3
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_ve_mmx)
+global sym(vp9_intra_pred_uv_ve_mmx) PRIVATE
 sym(vp9_intra_pred_uv_ve_mmx):
     push        rbp
     mov         rbp, rsp
@@ -585,7 +585,7 @@ sym(vp9_intra_pred_uv_ve_mmx):
 ;    int src_stride,
 ;    )
 %macro vp9_intra_pred_uv_ho 1
-global sym(vp9_intra_pred_uv_ho_%1)
+global sym(vp9_intra_pred_uv_ho_%1) PRIVATE
 sym(vp9_intra_pred_uv_ho_%1):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_recon_wrapper_sse2.c b/vp9/common/x86/vp9_recon_wrapper_sse2.c
index 49b36dbd6..bb7baf8a0 100644
--- a/vp9/common/x86/vp9_recon_wrapper_sse2.c
+++ b/vp9/common/x86/vp9_recon_wrapper_sse2.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_blockd.h"
 
diff --git a/vp9/common/x86/vp9_sadmxn_x86.c b/vp9/common/x86/vp9_sadmxn_x86.c
index 0b783ccea..3072d6df8 100644
--- a/vp9/common/x86/vp9_sadmxn_x86.c
+++ b/vp9/common/x86/vp9_sadmxn_x86.c
@@ -12,6 +12,7 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/emmintrin_compat.h"
 
 #if HAVE_SSE2
 unsigned int vp9_sad16x3_sse2(
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index dd89710e8..c6d65e904 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -30,7 +30,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_v8_ssse3)
+global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
 sym(vp9_filter_block1d8_v8_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -148,7 +148,7 @@ sym(vp9_filter_block1d8_v8_ssse3):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d16_v8_ssse3)
+global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
 sym(vp9_filter_block1d16_v8_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -298,7 +298,7 @@ sym(vp9_filter_block1d16_v8_ssse3):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_h8_ssse3)
+global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
 sym(vp9_filter_block1d8_h8_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -405,7 +405,7 @@ sym(vp9_filter_block1d8_h8_ssse3):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d16_h8_ssse3)
+global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
 sym(vp9_filter_block1d16_h8_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm
index 58d92bf05..dee29b8fb 100644
--- a/vp9/common/x86/vp9_subpixel_mmx.asm
+++ b/vp9/common/x86/vp9_subpixel_mmx.asm
@@ -27,7 +27,7 @@
 ;    unsigned int    output_width,
 ;    short           * vp9_filter
 ;)
-global sym(vp9_filter_block1d_h6_mmx)
+global sym(vp9_filter_block1d_h6_mmx) PRIVATE
 sym(vp9_filter_block1d_h6_mmx):
     push        rbp
     mov         rbp, rsp
@@ -124,7 +124,7 @@ sym(vp9_filter_block1d_h6_mmx):
 ;   unsigned int output_width,
 ;   short * vp9_filter
 ;)
-global sym(vp9_filter_block1dc_v6_mmx)
+global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
 sym(vp9_filter_block1dc_v6_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm
index f62587406..b0c4f1282 100644
--- a/vp9/common/x86/vp9_subpixel_sse2.asm
+++ b/vp9/common/x86/vp9_subpixel_sse2.asm
@@ -32,7 +32,7 @@
 ;    unsigned int    output_width,
 ;    short           *vp9_filter
 ;)
-global sym(vp9_filter_block1d8_h6_sse2)
+global sym(vp9_filter_block1d8_h6_sse2) PRIVATE
 sym(vp9_filter_block1d8_h6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -152,7 +152,7 @@ sym(vp9_filter_block1d8_h6_sse2):
 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
 ; rows each iteration to take advantage of the 128 bits operations.
 ;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2)
+global sym(vp9_filter_block1d16_h6_sse2) PRIVATE
 sym(vp9_filter_block1d16_h6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -328,7 +328,7 @@ sym(vp9_filter_block1d16_h6_sse2):
 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
 ; input pixel array has output_height rows.
 ;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2)
+global sym(vp9_filter_block1d8_v6_sse2) PRIVATE
 sym(vp9_filter_block1d8_v6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -423,7 +423,7 @@ sym(vp9_filter_block1d8_v6_sse2):
 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
 ; input pixel array has output_height rows.
 ;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2)
+global sym(vp9_filter_block1d16_v6_sse2) PRIVATE
 sym(vp9_filter_block1d16_v6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -533,7 +533,7 @@ sym(vp9_filter_block1d16_v6_sse2):
 ;    const short    *vp9_filter
 ;)
 ; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d8_h6_only_sse2)
+global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE
 sym(vp9_filter_block1d8_h6_only_sse2):
     push        rbp
     mov         rbp, rsp
@@ -646,7 +646,7 @@ sym(vp9_filter_block1d8_h6_only_sse2):
 ;    const short    *vp9_filter
 ;)
 ; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d16_h6_only_sse2)
+global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE
 sym(vp9_filter_block1d16_h6_only_sse2):
     push        rbp
     mov         rbp, rsp
@@ -811,7 +811,7 @@ sym(vp9_filter_block1d16_h6_only_sse2):
 ;    const short    *vp9_filter
 ;)
 ; Second-pass filter only when xoffset==0
-global sym(vp9_filter_block1d8_v6_only_sse2)
+global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE
 sym(vp9_filter_block1d8_v6_only_sse2):
     push        rbp
     mov         rbp, rsp
@@ -903,7 +903,7 @@ sym(vp9_filter_block1d8_v6_only_sse2):
 ;    unsigned int    output_height,
 ;    unsigned int    output_width
 ;)
-global sym(vp9_unpack_block1d16_h6_sse2)
+global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE
 sym(vp9_unpack_block1d16_h6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -962,7 +962,7 @@ sym(vp9_unpack_block1d16_h6_sse2):
 ;    int dst_pitch
 ;)
 extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict16x16_sse2)
+global sym(vp9_bilinear_predict16x16_sse2) PRIVATE
 sym(vp9_bilinear_predict16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1231,7 +1231,7 @@ sym(vp9_bilinear_predict16x16_sse2):
 ;    int dst_pitch
 ;)
 extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict8x8_sse2)
+global sym(vp9_bilinear_predict8x8_sse2) PRIVATE
 sym(vp9_bilinear_predict8x8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm
index 4a16f1928..b260480e0 100644
--- a/vp9/common/x86/vp9_subpixel_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_ssse3.asm
@@ -34,7 +34,7 @@
 ;    unsigned int    output_height,
 ;    unsigned int    vp9_filter_index
 ;)
-global sym(vp9_filter_block1d8_h6_ssse3)
+global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE
 sym(vp9_filter_block1d8_h6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -177,7 +177,7 @@ vp9_filter_block1d8_h4_ssse3:
 ;    unsigned int    output_height,
 ;    unsigned int    vp9_filter_index
 ;)
-global sym(vp9_filter_block1d16_h6_ssse3)
+global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE
 sym(vp9_filter_block1d16_h6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -284,7 +284,7 @@ sym(vp9_filter_block1d16_h6_ssse3):
 ;    unsigned int    output_height,
 ;    unsigned int    vp9_filter_index
 ;)
-global sym(vp9_filter_block1d4_h6_ssse3)
+global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE
 sym(vp9_filter_block1d4_h6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -413,7 +413,7 @@ sym(vp9_filter_block1d4_h6_ssse3):
 ;    unsigned int   output_height,
 ;    unsigned int   vp9_filter_index
 ;)
-global sym(vp9_filter_block1d16_v6_ssse3)
+global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE
 sym(vp9_filter_block1d16_v6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -601,7 +601,7 @@ sym(vp9_filter_block1d16_v6_ssse3):
 ;    unsigned int   output_height,
 ;    unsigned int   vp9_filter_index
 ;)
-global sym(vp9_filter_block1d8_v6_ssse3)
+global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE
 sym(vp9_filter_block1d8_v6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -741,7 +741,7 @@ sym(vp9_filter_block1d8_v6_ssse3):
 ;    unsigned int   output_height,
 ;    unsigned int   vp9_filter_index
 ;)
-global sym(vp9_filter_block1d4_v6_ssse3)
+global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE
 sym(vp9_filter_block1d4_v6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -880,7 +880,7 @@ sym(vp9_filter_block1d4_v6_ssse3):
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp9_bilinear_predict16x16_ssse3)
+global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE
 sym(vp9_bilinear_predict16x16_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -1143,7 +1143,7 @@ sym(vp9_bilinear_predict16x16_ssse3):
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp9_bilinear_predict8x8_ssse3)
+global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE
 sym(vp9_bilinear_predict8x8_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/decoder/vp9_dboolhuff.h b/vp9/decoder/vp9_dboolhuff.h
index c8c5c3b01..5afdd67c8 100644
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@ -13,7 +13,7 @@
 
 #include <stddef.h>
 #include <limits.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
 
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 36eadc482..812bf10fc 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -264,7 +264,8 @@ static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
   if (tx_type != DCT_DCT) {
     vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
                                     xd->block[0].dequant, xd->predictor,
-                                    xd->dst.y_buffer, 16, xd->dst.y_stride);
+                                    xd->dst.y_buffer, 16, xd->dst.y_stride,
+                                    xd->eobs[0]);
   } else {
     vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
                                xd->predictor, xd->dst.y_buffer,
@@ -310,7 +311,8 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
       }
       tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
       if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride);
+        vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride,
+                                      xd->eobs[idx]);
       } else {
         vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride,
                                    0, xd->eobs[idx]);
@@ -409,7 +411,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
           vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                     b->dequant, b->predictor,
                                     *(b->base_dst) + b->dst, 16,
-                                    b->dst_stride);
+                                    b->dst_stride, b->eob);
         } else {
           vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
                                *(b->base_dst) + b->dst, 16, b->dst_stride);
@@ -454,7 +456,8 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
       if (tx_type != DCT_DCT) {
         vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                   b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16, b->dst_stride);
+                                  *(b->base_dst) + b->dst, 16, b->dst_stride,
+                                  b->eob);
       } else {
         vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
                              *(b->base_dst) + b->dst, 16, b->dst_stride);
@@ -516,7 +519,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
           vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                     b->dequant, b->predictor,
                                     *(b->base_dst) + b->dst, 16,
-                                    b->dst_stride);
+                                    b->dst_stride, b->eob);
         } else {
           vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
                                *(b->base_dst) + b->dst, 16, b->dst_stride);
@@ -570,7 +573,7 @@ static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
         tx_type, xd->qcoeff, xd->block[0].dequant,
         xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
         xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_stride, xd->dst.y_stride);
+        xd->dst.y_stride, xd->dst.y_stride, xd->block[0].eob);
   } else {
     vp9_dequant_idct_add_16x16(
         xd->qcoeff, xd->block[0].dequant,
@@ -609,7 +612,7 @@ static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
             + x_idx * 16 + (i & 1) * 8,
             xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride
             + x_idx * 16 + (i & 1) * 8,
-            stride, stride);
+            stride, stride, b->eob);
       } else {
         vp9_dequant_idct_add_8x8_c(
             q, dq,
@@ -666,7 +669,7 @@ static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
             + x_idx * 16 + (i & 3) * 4,
             xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
             + x_idx * 16 + (i & 3) * 4,
-            xd->dst.y_stride, xd->dst.y_stride);
+            xd->dst.y_stride, xd->dst.y_stride, b->eob);
       } else {
         vp9_dequant_idct_add_c(
             b->qcoeff, b->dequant,
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 4376dc3d3..72cd2771e 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -14,7 +14,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_common.h"
-
 static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
                          uint8_t *dest, int stride, int width, int height) {
   int r, c;
@@ -61,7 +60,7 @@ void vp9_dequantize_b_c(BLOCKD *d) {
 void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
                                const int16_t *dq,
                                uint8_t *pred, uint8_t *dest,
-                               int pitch, int stride) {
+                               int pitch, int stride, uint16_t eobs) {
   int16_t output[16];
   int16_t *diff_ptr = output;
   int i;
@@ -70,7 +69,7 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
     input[i] = dq[i] * input[i];
   }
 
-  vp9_ihtllm(input, output, 4 << 1, tx_type, 4);
+  vp9_ihtllm(input, output, 4 << 1, tx_type, 4, eobs);
 
   vpx_memset(input, 0, 32);
 
@@ -80,21 +79,25 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
                                    const int16_t *dq,
                                    uint8_t *pred, uint8_t *dest,
-                                   int pitch, int stride) {
+                                   int pitch, int stride, uint16_t eobs) {
   int16_t output[64];
   int16_t *diff_ptr = output;
   int i;
+  if (eobs == 0) {
+    /* All DCT coefficients are zero: copy the prediction through unchanged */
+    vp9_copy_mem8x8(pred, pitch, dest, stride);
+  } else if (eobs > 0) {
+    input[0] = dq[0] * input[0];
+    for (i = 1; i < 64; i++) {
+      input[i] = dq[1] * input[i];
+    }
 
-  input[0] = dq[0] * input[0];
-  for (i = 1; i < 64; i++) {
-    input[i] = dq[1] * input[i];
-  }
-
-  vp9_ihtllm(input, output, 16, tx_type, 8);
+    vp9_ihtllm(input, output, 16, tx_type, 8, eobs);
 
-  vpx_memset(input, 0, 128);
+    vpx_memset(input, 0, 128);
 
-  add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
+  }
 }
 
 void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
@@ -256,26 +259,31 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
 
 void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
                                      const int16_t *dq, uint8_t *pred,
-                                     uint8_t *dest, int pitch, int stride) {
+                                     uint8_t *dest, int pitch, int stride,
+                                     uint16_t eobs) {
   int16_t output[256];
   int16_t *diff_ptr = output;
   int i;
+  if (eobs == 0) {
+    /* All DCT coefficients are zero: copy the prediction through unchanged */
+    vp9_copy_mem16x16(pred, pitch, dest, stride);
+  } else if (eobs > 0) {
+    input[0]= input[0] * dq[0];
 
-  input[0]= input[0] * dq[0];
-
-  // recover quantizer for 4 4x4 blocks
-  for (i = 1; i < 256; i++)
-    input[i] = input[i] * dq[1];
+    // recover quantizer for 4 4x4 blocks
+    for (i = 1; i < 256; i++)
+      input[i] = input[i] * dq[1];
 
-  // inverse hybrid transform
-  vp9_ihtllm(input, output, 32, tx_type, 16);
+    // inverse hybrid transform
+    vp9_ihtllm(input, output, 32, tx_type, 16, eobs);
 
-  // the idct halves ( >> 1) the pitch
-  // vp9_short_idct16x16_c(input, output, 32);
+    // the idct halves ( >> 1) the pitch
+    // vp9_short_idct16x16_c(input, output, 32);
 
-  vpx_memset(input, 0, 512);
+    vpx_memset(input, 0, 512);
 
-  add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
+  }
 }
 
 void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
diff --git a/vp9/decoder/vp9_dequantize.h b/vp9/decoder/vp9_dequantize.h
index c578608ba..bbbc173a2 100644
--- a/vp9/decoder/vp9_dequantize.h
+++ b/vp9/decoder/vp9_dequantize.h
@@ -11,108 +11,93 @@
 
 #ifndef VP9_DECODER_VP9_DEQUANTIZE_H_
 #define VP9_DECODER_VP9_DEQUANTIZE_H_
-
 #include "vp9/common/vp9_blockd.h"
 
 #if CONFIG_LOSSLESS
-extern void vp9_dequant_idct_add_lossless_c(int16_t *input,
-                                            const int16_t *dq,
-                                            uint8_t *pred,
-                                            uint8_t *output,
+extern void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
+                                            unsigned char *pred,
+                                            unsigned char *output,
                                             int pitch, int stride);
-extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input,
-                                               const int16_t *dq,
-                                               uint8_t *pred,
-                                               uint8_t *output,
+extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
+                                               unsigned char *pred,
+                                               unsigned char *output,
                                                int pitch, int stride, int dc);
 extern void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,
                                                        const int16_t *dq,
-                                                       uint8_t *pre,
-                                                       uint8_t *dst,
+                                                       unsigned char *pre,
+                                                       unsigned char *dst,
                                                        int stride,
                                                        uint16_t *eobs,
                                                        const int16_t *dc);
-extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q,
-                                                    const int16_t *dq,
-                                                    uint8_t *pre,
-                                                    uint8_t *dst,
+extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
+                                                    unsigned char *pre,
+                                                    unsigned char *dst,
                                                     int stride,
                                                     uint16_t *eobs);
-extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q,
-                                                     const int16_t *dq,
-                                                     uint8_t *pre,
-                                                     uint8_t *dst_u,
-                                                     uint8_t *dst_v,
+extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
+                                                     unsigned char *pre,
+                                                     unsigned char *dst_u,
+                                                     unsigned char *dst_v,
                                                      int stride,
                                                      uint16_t *eobs);
-#endif  // CONFIG_LOSSLESS
+#endif
 
 typedef void (*vp9_dequant_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-                                          uint8_t *pred, uint8_t *output,
-                                          int pitch, int stride);
+    unsigned char *pred, unsigned char *output, int pitch, int stride);
 typedef void(*vp9_dequant_dc_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-                                            uint8_t *pred, uint8_t *output,
-                                            int pitch, int stride, int dc);
+    unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
 
-typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q,
-                                                    const int16_t *dq,
-                                                    uint8_t *pre, uint8_t *dst,
-                                                    int stride, uint16_t *eobs,
-                                                    const int16_t *dc);
+typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
+    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs,
+    const int16_t *dc);
 typedef void(*vp9_dequant_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
-                                                 uint8_t *pre, uint8_t *dst,
-                                                 int stride, uint16_t *eobs);
+    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs);
 typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(int16_t *q, const int16_t *dq,
-                                                  uint8_t *pre, uint8_t *dst_u,
-                                                  uint8_t *dst_v, int stride,
-                                                  uint16_t *eobs);
+    unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
+    uint16_t *eobs);
 
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
-                               const int16_t *dq,
-                               uint8_t *pred, uint8_t *dest,
-                               int pitch, int stride);
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
+                                    unsigned char *pred, unsigned char *dest,
+                                    int pitch, int stride, uint16_t eobs);
 
 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
-                                   const int16_t *dq, uint8_t *pred,
-                                   uint8_t *dest, int pitch, int stride);
+                                   const int16_t *dq, unsigned char *pred,
+                                   unsigned char *dest, int pitch, int stride,
+                                   uint16_t eobs);
 
 void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
-                                     const int16_t *dq, uint8_t *pred,
-                                     uint8_t *dest,
-                                     int pitch, int stride);
+                                     const int16_t *dq, unsigned char *pred,
+                                     unsigned char *dest,
+                                     int pitch, int stride, uint16_t eobs);
 
 #if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q,
-                                                   const int16_t *dq,
-                                                   uint8_t *dst,
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
+                                                   unsigned char *dst,
                                                    int stride,
                                                    uint16_t *eobs,
                                                    const int16_t *dc,
                                                    MACROBLOCKD *xd);
 
-void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q,
-                                                   const int16_t *dq,
-                                                   uint8_t *dst,
+void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
+                                                   unsigned char *dst,
                                                    int stride,
                                                    uint16_t *eobs,
                                                    const int16_t *dc,
                                                    MACROBLOCKD *xd);
 
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q,
-                                                 const int16_t *dq,
-                                                 uint8_t *dstu,
-                                                 uint8_t *dstv,
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
                                                  int stride,
                                                  uint16_t *eobs,
                                                  MACROBLOCKD *xd);
 
-void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q,
-                                                 const int16_t *dq,
-                                                 uint8_t *dstu,
-                                                 uint8_t *dstv,
+void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
                                                  int stride,
                                                  uint16_t *eobs,
                                                  MACROBLOCKD *xd);
-#endif  // CONFIG_SUPERBLOCKS
+#endif
 
-#endif  // VP9_DECODER_VP9_DEQUANTIZE_H_
+#endif
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 6b7184fbe..64975468d 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -10,8 +10,7 @@
 
 #ifndef VP9_DECODER_VP9_ONYXD_INT_H_
 #define VP9_DECODER_VP9_ONYXD_INT_H_
-
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/decoder/vp9_onyxd.h"
 #include "vp9/decoder/vp9_treereader.h"
 #include "vp9/common/vp9_onyxc_int.h"
diff --git a/vp9/decoder/x86/vp9_idct_blk_mmx.c b/vp9/decoder/x86/vp9_idct_blk_mmx.c
index df3485233..8279eaa4a 100644
--- a/vp9/decoder/x86/vp9_idct_blk_mmx.c
+++ b/vp9/decoder/x86/vp9_idct_blk_mmx.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/decoder/vp9_dequantize.h"
 #include "vp9/decoder/x86/vp9_idct_mmx.h"
diff --git a/vp9/decoder/x86/vp9_idct_blk_sse2.c b/vp9/decoder/x86/vp9_idct_blk_sse2.c
index 6c1fd1439..badd97f73 100644
--- a/vp9/decoder/x86/vp9_idct_blk_sse2.c
+++ b/vp9/decoder/x86/vp9_idct_blk_sse2.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/decoder/vp9_dequantize.h"
 
diff --git a/vp9/decoder/x86/vp9_x86_dsystemdependent.c b/vp9/decoder/x86/vp9_x86_dsystemdependent.c
index d1cc53fce..51ee8ec31 100644
--- a/vp9/decoder/x86/vp9_x86_dsystemdependent.c
+++ b/vp9/decoder/x86/vp9_x86_dsystemdependent.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_ports/x86.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 90baafe53..e14421d2d 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -11,7 +11,7 @@
 
 #include <assert.h>
 #include <math.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/common/vp9_blockd.h"
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 702c35831..509c426d8 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
@@ -2123,8 +2123,6 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  unsigned char *segment_id = &mbmi->segment_id;
-  int seg_ref_active;
   unsigned char ref_pred_flag;
 
 #if CONFIG_SUPERBLOCKS
@@ -2170,8 +2168,6 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
 
     vp9_update_zbin_extra(cpi, x);
 
-    seg_ref_active = vp9_segfeature_active(xd, *segment_id, SEG_LVL_REF_FRAME);
-
     // SET VARIOUS PREDICTION FLAGS
 
     // Did the chosen reference frame match its predicted value.
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 4ee21bb46..9b106266e 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/common/vp9_reconintra.h"
@@ -70,7 +70,7 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
   if (tx_type != DCT_DCT) {
     vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
     vp9_ht_quantize_b_4x4(be, b, tx_type);
-    vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4);
+    vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
   } else {
     x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
     x->quantize_b_4x4(be, b) ;
@@ -191,7 +191,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
                 tx_type, 8);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
       vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
-                   tx_type, 8);
+                   tx_type, 8, xd->block[idx].eob);
     } else {
       x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
@@ -205,7 +205,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
       if (tx_type != DCT_DCT) {
         vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
         vp9_ht_quantize_b_4x4(be, b, tx_type);
-        vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4);
+        vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
       } else {
         x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4(be, b);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 91eea4e51..2ca146c3b 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_quantize.h"
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 37aae13eb..75c3a8a8b 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -11,7 +11,7 @@
 #ifndef VP9_ENCODER_VP9_ENCODEMB_H_
 #define VP9_ENCODER_VP9_ENCODEMB_H_
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"
 
 typedef struct {
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 8448de7f9..a14867292 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -41,9 +41,10 @@
 #define RMAX       128.0
 #define GF_RMAX    96.0
 #define ERR_DIVISOR   150.0
+#define MIN_DECAY_FACTOR 0.1
 
-#define KF_MB_INTRA_MIN 300
-#define GF_MB_INTRA_MIN 200
+#define KF_MB_INTRA_MIN 150
+#define GF_MB_INTRA_MIN 100
 
 #define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
 
@@ -800,6 +801,7 @@ static double bitcost(double prob) {
 
 static long long estimate_modemvcost(VP9_COMP *cpi,
                                      FIRSTPASS_STATS *fpstats) {
+#if 0
   int mv_cost;
   int mode_cost;
 
@@ -828,6 +830,7 @@ static long long estimate_modemvcost(VP9_COMP *cpi,
 
   // return mv_cost + mode_cost;
   // TODO PGW Fix overhead costs for extended Q range
+#endif
   return 0;
 }
 
@@ -1405,10 +1408,9 @@ static int calc_arf_boost(
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
       decay_accumulator =
-        decay_accumulator *
-        get_prediction_decay_rate(cpi, &this_frame);
-      decay_accumulator =
-        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                          ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
     boost_score += (decay_accumulator *
@@ -1443,10 +1445,9 @@ static int calc_arf_boost(
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
       decay_accumulator =
-        decay_accumulator *
-        get_prediction_decay_rate(cpi, &this_frame);
-      decay_accumulator =
-        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                          ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
     boost_score += (decay_accumulator *
@@ -1632,7 +1633,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         ((mv_ratio_accumulator > 100.0) ||
          (abs_mv_in_out_accumulator > 3.0) ||
          (mv_in_out_accumulator < -2.0) ||
-         ((boost_score - old_boost_score) < 12.5))
+         ((boost_score - old_boost_score) < IIFACTOR))
       )) {
       boost_score = old_boost_score;
       break;
@@ -1952,12 +1953,9 @@ void vp9_second_pass(VP9_COMP *cpi) {
   FIRSTPASS_STATS this_frame;
   FIRSTPASS_STATS this_frame_copy;
 
-  double this_frame_error;
   double this_frame_intra_error;
   double this_frame_coded_error;
 
-  FIRSTPASS_STATS *start_pos;
-
   int overhead_bits;
 
   if (!cpi->twopass.stats_in) {
@@ -1971,12 +1969,9 @@ void vp9_second_pass(VP9_COMP *cpi) {
   if (EOF == input_stats(cpi, &this_frame))
     return;
 
-  this_frame_error = this_frame.ssim_weighted_pred_err;
   this_frame_intra_error = this_frame.intra_error;
   this_frame_coded_error = this_frame.coded_error;
 
-  start_pos = cpi->twopass.stats_in;
-
   // keyframe and section processing !
   if (cpi->twopass.frames_to_key == 0) {
     // Define next KF group and assign bits to it
@@ -2396,7 +2391,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (!detect_flash(cpi, 0)) {
       loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
       decay_accumulator = decay_accumulator * loop_decay_rate;
-      decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                            ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
     boost_score += (decay_accumulator * r);
@@ -2436,14 +2432,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     int allocation_chunks;
     int alt_kf_bits;
 
-    if (kf_boost < 300) {
-      kf_boost += (cpi->twopass.frames_to_key * 3);
-      if (kf_boost > 300)
-        kf_boost = 300;
-    }
+    if (kf_boost < (cpi->twopass.frames_to_key * 5))
+      kf_boost = (cpi->twopass.frames_to_key * 5);
 
-    if (kf_boost < 250)                                                      // Min KF boost
-      kf_boost = 250;
+    if (kf_boost < 300) // Min KF boost
+      kf_boost = 300;
 
     // Make a note of baseline boost and the zero motion
     // accumulator value for use elsewhere.
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 8511bc572..c319e07c0 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -27,7 +27,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   BLOCKD *d = &xd->block[0];
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
   unsigned int best_err;
-  int step_param, further_steps;
+  int step_param;
 
   int tmp_col_min = x->mv_col_min;
   int tmp_col_max = x->mv_col_max;
@@ -38,10 +38,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   // Further step/diamond searches as necessary
   if (cpi->Speed < 8) {
     step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
-    further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
   } else {
     step_param = cpi->sf.first_step + 2;
-    further_steps = 0;
   }
 
   vp9_clamp_mv_min_max(x, ref_mv);
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 38a2eab62..9769d6344 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -12,7 +12,7 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include <stdio.h>
 #include <limits.h>
 #include <math.h>
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 14948a0de..0e4b47ddf 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -148,7 +148,6 @@ static int calculate_minq_index(double maxq,
                                 double x3, double x2, double x, double c) {
   int i;
   double minqtarget;
-  double thisq;
 
   minqtarget = ((x3 * maxq * maxq * maxq) +
                 (x2 * maxq * maxq) +
@@ -159,7 +158,6 @@ static int calculate_minq_index(double maxq,
     minqtarget = maxq;
 
   for (i = 0; i < QINDEX_RANGE; i++) {
-    thisq = vp9_convert_qindex_to_q(i);
     if (minqtarget <= vp9_convert_qindex_to_q(i))
       return i;
   }
@@ -2925,8 +2923,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
   int Loop = FALSE;
   int loop_count;
-  int this_q;
-  int last_zbin_oq;
 
   int q_low;
   int q_high;
@@ -2940,8 +2936,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   int overshoot_seen = FALSE;
   int undershoot_seen = FALSE;
 
-  int loop_size_estimate = 0;
-
   SPEED_FEATURES *sf = &cpi->sf;
 #if RESET_FOREACH_FILTER
   int q_low0;
@@ -2949,6 +2943,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   int zbin_oq_high0;
   int zbin_oq_low0 = 0;
   int Q0;
+  int last_zbin_oq;
   int last_zbin_oq0;
   int active_best_quality0;
   int active_worst_quality0;
@@ -3163,7 +3158,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     // Determine initial Q to try
     Q = vp9_regulate_q(cpi, cpi->this_frame_target);
   }
+#if RESET_FOREACH_FILTER
   last_zbin_oq = cpi->zbin_over_quant;
+#endif
 
   // Set highest allowed value for Zbin over quant
   if (cm->frame_type == KEY_FRAME)
@@ -3267,7 +3264,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     vp9_clear_system_state();  // __asm emms;
 
     vp9_set_quantizer(cpi, Q);
-    this_q = Q;
 
     if (loop_count == 0) {
 
@@ -3503,7 +3499,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
       // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
       Loop = ((Q != last_q)) ? TRUE : FALSE;
+#if RESET_FOREACH_FILTER
       last_zbin_oq = cpi->zbin_over_quant;
+#endif
     } else
       Loop = FALSE;
 
@@ -3692,9 +3690,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
    * needed in motion search besides loopfilter */
   cm->last_frame_type = cm->frame_type;
 
-  // Keep a copy of the size estimate used in the loop
-  loop_size_estimate = cpi->projected_frame_size;
-
   // Update rate control heuristics
   cpi->total_byte_count += (*size);
   cpi->projected_frame_size = (*size) << 3;
@@ -3795,7 +3790,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
               "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
               "%10.3f %8d %10d %10d %10d\n",
               cpi->common.current_video_frame, cpi->this_frame_target,
-              cpi->projected_frame_size, loop_size_estimate,
+              cpi->projected_frame_size, 0, //loop_size_estimate,
               (cpi->projected_frame_size - cpi->this_frame_target),
               (int)cpi->total_target_vs_actual,
               (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
@@ -3825,7 +3820,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
               "%8d %10d %10d %10d\n",
               cpi->common.current_video_frame,
               cpi->this_frame_target, cpi->projected_frame_size,
-              loop_size_estimate,
+              0, //loop_size_estimate,
               (cpi->projected_frame_size - cpi->this_frame_target),
               (int)cpi->total_target_vs_actual,
               (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 11428352f..c9ee14425 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -13,7 +13,7 @@
 #define VP9_ENCODER_VP9_ONYX_INT_H_
 
 #include <stdio.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_onyx.h"
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_tokenize.h"
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 7091c4932..b443ede6f 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -24,11 +24,9 @@ void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
   uint8_t *src_y, *dst_y;
   int yheight;
   int ystride;
-  int border;
   int yoffset;
   int linestocopy;
 
-  border   = src_ybc->border;
   yheight  = src_ybc->y_height;
   ystride  = src_ybc->y_stride;
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 267dd0aa5..8e91d828f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1328,7 +1328,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
-    vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4);
+    vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob);
   else
     xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);
 
@@ -1518,7 +1518,7 @@ static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
                                           int *skippable,
                                           int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
-  TX_SIZE UNINITIALIZED_IS_SAFE(txfm_size);
+  TX_SIZE txfm_size = 0;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
 #if CONFIG_COMP_INTRA_PRED
   MB_PREDICTION_MODE mode2;
@@ -1562,7 +1562,6 @@ static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
 
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
-
       if (this_rd < best_rd) {
         mode_selected = mode;
         txfm_size = mbmi->txfm_size;
@@ -1796,6 +1795,7 @@ static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
     mic->bmi[ib].as_mode.second = best_second_mode;
 #endif
   }
+
   *Rate = cost;
   *rate_y = tot_rate_y;
   *Distortion = distortion;
@@ -3889,6 +3889,9 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   unsigned int ref_costs[MAX_REF_FRAMES];
   int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
 
+  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
+                                             cpi->common.y1dc_delta_q);
+
   vpx_memset(mode8x8, 0, sizeof(mode8x8));
   vpx_memset(&frame_mv, 0, sizeof(frame_mv));
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
@@ -4086,16 +4089,17 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (!mbmi->ref_frame) {
       switch (this_mode) {
         default:
-        case DC_PRED:
         case V_PRED:
         case H_PRED:
-        case TM_PRED:
         case D45_PRED:
         case D135_PRED:
         case D117_PRED:
         case D153_PRED:
         case D27_PRED:
         case D63_PRED:
+          rate2 += intra_cost_penalty;
+        case DC_PRED:
+        case TM_PRED:
           mbmi->ref_frame = INTRA_FRAME;
           // FIXME compound intra prediction
           vp9_build_intra_predictors_mby(&x->e_mbd);
@@ -4129,6 +4133,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
                                              cpi->update_context);
           rate2 += rate;
+          rate2 += intra_cost_penalty;
           distortion2 += distortion;
 
           if (tmp_rd < best_yrd) {
@@ -4221,6 +4226,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           }
 
           rate2 += rate;
+          rate2 += intra_cost_penalty;
           distortion2 += distortion;
 
           /* TODO: uv rate maybe over-estimated here since there is UV intra
@@ -4730,7 +4736,7 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int mode16x16;
   int mode8x8[2][4];
   int dist;
-  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
+  int modeuv, uv_intra_skippable, uv_intra_skippable_8x8;
   int y_intra16x16_skippable = 0;
   int64_t txfm_cache[NB_TXFM_MODES];
   TX_SIZE txfm_size_16x16;
@@ -4743,13 +4749,11 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
   if (cpi->common.txfm_mode != ONLY_4X4) {
     rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
                                 &distuv8x8, &uv_intra_skippable_8x8);
-    modeuv8x8 = mbmi->uv_mode;
   } else {
     uv_intra_skippable_8x8 = uv_intra_skippable;
     rateuv8x8 = rateuv;
     distuv8x8 = distuv;
     rateuv8x8_tokenonly = rateuv_tokenonly;
-    modeuv8x8 = modeuv;
   }
 
   // current macroblock under rate-distortion optimization test loop
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index 9ce27fbed..84121f79c 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -11,7 +11,7 @@
 
 #include <stdlib.h>
 #include "vp9/common/vp9_sadmxn.h"
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
 unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 159d6faa5..8bbe53486 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -130,7 +130,6 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                               int error_thresh) {
   MACROBLOCK *x = &cpi->mb;
   int step_param;
-  int further_steps;
   int sadpb = x->sadperbit16;
   int bestsme = INT_MAX;
 
@@ -164,11 +163,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   if (cpi->Speed < 8) {
     step_param = cpi->sf.first_step +
                  ((cpi->Speed > 5) ? 1 : 0);
-    further_steps =
-      (cpi->sf.max_step_search_steps - 1) - step_param;
   } else {
     step_param = cpi->sf.first_step + 2;
-    further_steps = 0;
   }
 
   /*cpi->sf.search_method == HEX*/
diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm
index 3045466f2..54766d846 100644
--- a/vp9/encoder/x86/vp9_dct_mmx.asm
+++ b/vp9/encoder/x86/vp9_dct_mmx.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_mmx)
+global sym(vp9_short_fdct4x4_mmx) PRIVATE
 sym(vp9_short_fdct4x4_mmx):
     push        rbp
     mov         rbp,        rsp
diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm
index 2821fbe35..57b81a566 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.asm
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -61,7 +61,7 @@
 %endmacro
 
 ;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_sse2)
+global sym(vp9_short_fdct4x4_sse2) PRIVATE
 sym(vp9_short_fdct4x4_sse2):
 
     STACK_FRAME_CREATE
@@ -166,7 +166,7 @@ sym(vp9_short_fdct4x4_sse2):
     STACK_FRAME_DESTROY
 
 ;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct8x4_sse2)
+global sym(vp9_short_fdct8x4_sse2) PRIVATE
 sym(vp9_short_fdct8x4_sse2):
 
     STACK_FRAME_CREATE
diff --git a/vp9/encoder/x86/vp9_encodeopt.asm b/vp9/encoder/x86/vp9_encodeopt.asm
index 9e4cd1102..5d9f7769d 100644
--- a/vp9/encoder/x86/vp9_encodeopt.asm
+++ b/vp9/encoder/x86/vp9_encodeopt.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;int vp9_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp9_block_error_xmm)
+global sym(vp9_block_error_xmm) PRIVATE
 sym(vp9_block_error_xmm):
     push        rbp
     mov         rbp, rsp
@@ -60,7 +60,7 @@ sym(vp9_block_error_xmm):
     ret
 
 ;int vp9_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp9_block_error_mmx)
+global sym(vp9_block_error_mmx) PRIVATE
 sym(vp9_block_error_mmx):
     push        rbp
     mov         rbp, rsp
@@ -126,7 +126,7 @@ sym(vp9_block_error_mmx):
 
 
 ;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp9_mbblock_error_mmx_impl)
+global sym(vp9_mbblock_error_mmx_impl) PRIVATE
 sym(vp9_mbblock_error_mmx_impl):
     push        rbp
     mov         rbp, rsp
@@ -203,7 +203,7 @@ sym(vp9_mbblock_error_mmx_impl):
 
 
 ;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp9_mbblock_error_xmm_impl)
+global sym(vp9_mbblock_error_xmm_impl) PRIVATE
 sym(vp9_mbblock_error_xmm_impl):
     push        rbp
     mov         rbp, rsp
@@ -273,7 +273,7 @@ sym(vp9_mbblock_error_xmm_impl):
 
 
 ;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_mmx_impl)
+global sym(vp9_mbuverror_mmx_impl) PRIVATE
 sym(vp9_mbuverror_mmx_impl):
     push        rbp
     mov         rbp, rsp
@@ -330,7 +330,7 @@ sym(vp9_mbuverror_mmx_impl):
 
 
 ;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_xmm_impl)
+global sym(vp9_mbuverror_xmm_impl) PRIVATE
 sym(vp9_mbuverror_xmm_impl):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_fwalsh_sse2.asm b/vp9/encoder/x86/vp9_fwalsh_sse2.asm
index c6b18c1a1..7bee9ef63 100644
--- a/vp9/encoder/x86/vp9_fwalsh_sse2.asm
+++ b/vp9/encoder/x86/vp9_fwalsh_sse2.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_walsh4x4_sse2)
+global sym(vp9_short_walsh4x4_sse2) PRIVATE
 sym(vp9_short_walsh4x4_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_quantize_mmx.asm b/vp9/encoder/x86/vp9_quantize_mmx.asm
index 050119a31..22e235610 100644
--- a/vp9/encoder/x86/vp9_quantize_mmx.asm
+++ b/vp9/encoder/x86/vp9_quantize_mmx.asm
@@ -15,7 +15,7 @@
 ;                           short *qcoeff_ptr,short *dequant_ptr,
 ;                           short *scan_mask, short *round_ptr,
 ;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp9_fast_quantize_b_impl_mmx)
+global sym(vp9_fast_quantize_b_impl_mmx) PRIVATE
 sym(vp9_fast_quantize_b_impl_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.asm b/vp9/encoder/x86/vp9_quantize_sse2.asm
index 0b3db6caa..060acc2ac 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.asm
+++ b/vp9/encoder/x86/vp9_quantize_sse2.asm
@@ -16,7 +16,7 @@
 ;  (BLOCK  *b,                     |  0
 ;   BLOCKD *d)                     |  1
 
-global sym(vp9_regular_quantize_b_sse2)
+global sym(vp9_regular_quantize_b_sse2) PRIVATE
 sym(vp9_regular_quantize_b_sse2):
     push        rbp
     mov         rbp, rsp
@@ -237,7 +237,7 @@ ZIGZAG_LOOP 15
 ;  (BLOCK  *b,                  |  0
 ;   BLOCKD *d)                  |  1
 
-global sym(vp9_fast_quantize_b_sse2)
+global sym(vp9_fast_quantize_b_sse2) PRIVATE
 sym(vp9_fast_quantize_b_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_quantize_sse4.asm b/vp9/encoder/x86/vp9_quantize_sse4.asm
index 98269f120..1d43ce958 100644
--- a/vp9/encoder/x86/vp9_quantize_sse4.asm
+++ b/vp9/encoder/x86/vp9_quantize_sse4.asm
@@ -16,7 +16,7 @@
 ;  (BLOCK  *b,                     |  0
 ;   BLOCKD *d)                     |  1
 
-global sym(vp9_regular_quantize_b_sse4)
+global sym(vp9_regular_quantize_b_sse4) PRIVATE
 sym(vp9_regular_quantize_b_sse4):
 
 %if ABI_IS_32BIT
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 8c464287a..41edbc13e 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -17,7 +17,7 @@
 ;   BLOCKD *d)                   |  1
 ;
 
-global sym(vp9_fast_quantize_b_ssse3)
+global sym(vp9_fast_quantize_b_ssse3) PRIVATE
 sym(vp9_fast_quantize_b_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_sad_mmx.asm b/vp9/encoder/x86/vp9_sad_mmx.asm
index 827c58cbb..32fdd23d7 100644
--- a/vp9/encoder/x86/vp9_sad_mmx.asm
+++ b/vp9/encoder/x86/vp9_sad_mmx.asm
@@ -11,11 +11,11 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-global sym(vp9_sad16x16_mmx)
-global sym(vp9_sad8x16_mmx)
-global sym(vp9_sad8x8_mmx)
-global sym(vp9_sad4x4_mmx)
-global sym(vp9_sad16x8_mmx)
+global sym(vp9_sad16x16_mmx) PRIVATE
+global sym(vp9_sad8x16_mmx) PRIVATE
+global sym(vp9_sad8x8_mmx) PRIVATE
+global sym(vp9_sad4x4_mmx) PRIVATE
+global sym(vp9_sad16x8_mmx) PRIVATE
 
 ;unsigned int vp9_sad16x16_mmx(
 ;    unsigned char *src_ptr,
diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm
index fe9fc4d55..33271635c 100644
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -16,7 +16,7 @@
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-global sym(vp9_sad16x16_wmt)
+global sym(vp9_sad16x16_wmt) PRIVATE
 sym(vp9_sad16x16_wmt):
     push        rbp
     mov         rbp, rsp
@@ -90,7 +90,7 @@ sym(vp9_sad16x16_wmt):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  max_err)
-global sym(vp9_sad8x16_wmt)
+global sym(vp9_sad8x16_wmt) PRIVATE
 sym(vp9_sad8x16_wmt):
     push        rbp
     mov         rbp, rsp
@@ -153,7 +153,7 @@ sym(vp9_sad8x16_wmt):
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-global sym(vp9_sad8x8_wmt)
+global sym(vp9_sad8x8_wmt) PRIVATE
 sym(vp9_sad8x8_wmt):
     push        rbp
     mov         rbp, rsp
@@ -206,7 +206,7 @@ sym(vp9_sad8x8_wmt):
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-global sym(vp9_sad4x4_wmt)
+global sym(vp9_sad4x4_wmt) PRIVATE
 sym(vp9_sad4x4_wmt):
     push        rbp
     mov         rbp, rsp
@@ -261,7 +261,7 @@ sym(vp9_sad4x4_wmt):
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-global sym(vp9_sad16x8_wmt)
+global sym(vp9_sad16x8_wmt) PRIVATE
 sym(vp9_sad16x8_wmt):
     push        rbp
     mov         rbp, rsp
@@ -335,7 +335,7 @@ sym(vp9_sad16x8_wmt):
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    int height);
-global sym(vp9_copy32xn_sse2)
+global sym(vp9_copy32xn_sse2) PRIVATE
 sym(vp9_copy32xn_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm
index e17485e5b..2c409cbe5 100644
--- a/vp9/encoder/x86/vp9_sad_sse3.asm
+++ b/vp9/encoder/x86/vp9_sad_sse3.asm
@@ -380,7 +380,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x16x3_sse3)
+global sym(vp9_sad16x16x3_sse3) PRIVATE
 sym(vp9_sad16x16x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -422,7 +422,7 @@ sym(vp9_sad16x16x3_sse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x8x3_sse3)
+global sym(vp9_sad16x8x3_sse3) PRIVATE
 sym(vp9_sad16x8x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -460,7 +460,7 @@ sym(vp9_sad16x8x3_sse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad8x16x3_sse3)
+global sym(vp9_sad8x16x3_sse3) PRIVATE
 sym(vp9_sad8x16x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -489,7 +489,7 @@ sym(vp9_sad8x16x3_sse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad8x8x3_sse3)
+global sym(vp9_sad8x8x3_sse3) PRIVATE
 sym(vp9_sad8x8x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -514,7 +514,7 @@ sym(vp9_sad8x8x3_sse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad4x4x3_sse3)
+global sym(vp9_sad4x4x3_sse3) PRIVATE
 sym(vp9_sad4x4x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -589,7 +589,7 @@ sym(vp9_sad4x4x3_sse3):
 ;    int  ref_stride,
 ;    int  max_err)
 ;%define lddqu movdqu
-global sym(vp9_sad16x16_sse3)
+global sym(vp9_sad16x16_sse3) PRIVATE
 sym(vp9_sad16x16_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -642,7 +642,7 @@ sym(vp9_sad16x16_sse3):
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    int height);
-global sym(vp9_copy32xn_sse3)
+global sym(vp9_copy32xn_sse3) PRIVATE
 sym(vp9_copy32xn_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -703,7 +703,7 @@ sym(vp9_copy32xn_sse3):
 ;    unsigned char *ref_ptr_base,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x16x4d_sse3)
+global sym(vp9_sad16x16x4d_sse3) PRIVATE
 sym(vp9_sad16x16x4d_sse3):
 
     STACK_FRAME_CREATE_X4
@@ -754,7 +754,7 @@ sym(vp9_sad16x16x4d_sse3):
 ;    unsigned char *ref_ptr_base,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x8x4d_sse3)
+global sym(vp9_sad16x8x4d_sse3) PRIVATE
 sym(vp9_sad16x8x4d_sse3):
 
     STACK_FRAME_CREATE_X4
@@ -801,7 +801,7 @@ sym(vp9_sad16x8x4d_sse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad8x16x4d_sse3)
+global sym(vp9_sad8x16x4d_sse3) PRIVATE
 sym(vp9_sad8x16x4d_sse3):
 
     STACK_FRAME_CREATE_X4
@@ -834,7 +834,7 @@ sym(vp9_sad8x16x4d_sse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad8x8x4d_sse3)
+global sym(vp9_sad8x8x4d_sse3) PRIVATE
 sym(vp9_sad8x8x4d_sse3):
 
     STACK_FRAME_CREATE_X4
@@ -863,7 +863,7 @@ sym(vp9_sad8x8x4d_sse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad4x4x4d_sse3)
+global sym(vp9_sad4x4x4d_sse3) PRIVATE
 sym(vp9_sad4x4x4d_sse3):
 
     STACK_FRAME_CREATE_X4
diff --git a/vp9/encoder/x86/vp9_sad_sse4.asm b/vp9/encoder/x86/vp9_sad_sse4.asm
index 25980d624..b42982a1f 100644
--- a/vp9/encoder/x86/vp9_sad_sse4.asm
+++ b/vp9/encoder/x86/vp9_sad_sse4.asm
@@ -161,7 +161,7 @@
 ;    const unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned short *sad_array);
-global sym(vp9_sad16x16x8_sse4)
+global sym(vp9_sad16x16x8_sse4) PRIVATE
 sym(vp9_sad16x16x8_sse4):
     push        rbp
     mov         rbp, rsp
@@ -203,7 +203,7 @@ sym(vp9_sad16x16x8_sse4):
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp9_sad16x8x8_sse4)
+global sym(vp9_sad16x8x8_sse4) PRIVATE
 sym(vp9_sad16x8x8_sse4):
     push        rbp
     mov         rbp, rsp
@@ -241,7 +241,7 @@ sym(vp9_sad16x8x8_sse4):
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp9_sad8x8x8_sse4)
+global sym(vp9_sad8x8x8_sse4) PRIVATE
 sym(vp9_sad8x8x8_sse4):
     push        rbp
     mov         rbp, rsp
@@ -279,7 +279,7 @@ sym(vp9_sad8x8x8_sse4):
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp9_sad8x16x8_sse4)
+global sym(vp9_sad8x16x8_sse4) PRIVATE
 sym(vp9_sad8x16x8_sse4):
     push        rbp
     mov         rbp, rsp
@@ -320,7 +320,7 @@ sym(vp9_sad8x16x8_sse4):
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp9_sad4x4x8_sse4)
+global sym(vp9_sad4x4x8_sse4) PRIVATE
 sym(vp9_sad4x4x8_sse4):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_sad_ssse3.asm b/vp9/encoder/x86/vp9_sad_ssse3.asm
index 5623d8be4..0cb35424e 100644
--- a/vp9/encoder/x86/vp9_sad_ssse3.asm
+++ b/vp9/encoder/x86/vp9_sad_ssse3.asm
@@ -152,7 +152,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x16x3_ssse3)
+global sym(vp9_sad16x16x3_ssse3) PRIVATE
 sym(vp9_sad16x16x3_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -265,7 +265,7 @@ sym(vp9_sad16x16x3_ssse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x8x3_ssse3)
+global sym(vp9_sad16x8x3_ssse3) PRIVATE
 sym(vp9_sad16x8x3_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_ssim_opt.asm b/vp9/encoder/x86/vp9_ssim_opt.asm
index 905c263a6..455d10d2c 100644
--- a/vp9/encoder/x86/vp9_ssim_opt.asm
+++ b/vp9/encoder/x86/vp9_ssim_opt.asm
@@ -61,7 +61,7 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp9_ssim_parms_16x16_sse2)
+global sym(vp9_ssim_parms_16x16_sse2) PRIVATE
 sym(vp9_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -151,7 +151,7 @@ sym(vp9_ssim_parms_16x16_sse2):
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp9_ssim_parms_8x8_sse2)
+global sym(vp9_ssim_parms_8x8_sse2) PRIVATE
 sym(vp9_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_subtract_mmx.asm b/vp9/encoder/x86/vp9_subtract_mmx.asm
index 5b0e249ca..e9eda4fed 100644
--- a/vp9/encoder/x86/vp9_subtract_mmx.asm
+++ b/vp9/encoder/x86/vp9_subtract_mmx.asm
@@ -14,7 +14,7 @@
 ;void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
 ;                            short *diff, unsigned char *Predictor,
 ;                            int pitch);
-global sym(vp9_subtract_b_mmx_impl)
+global sym(vp9_subtract_b_mmx_impl) PRIVATE
 sym(vp9_subtract_b_mmx_impl):
     push        rbp
     mov         rbp, rsp
@@ -74,7 +74,7 @@ sym(vp9_subtract_b_mmx_impl):
     ret
 
 ;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_mmx)
+global sym(vp9_subtract_mby_mmx) PRIVATE
 sym(vp9_subtract_mby_mmx):
     push        rbp
     mov         rbp, rsp
@@ -150,7 +150,7 @@ sym(vp9_subtract_mby_mmx):
 
 
 ;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_mmx)
+global sym(vp9_subtract_mbuv_mmx) PRIVATE
 sym(vp9_subtract_mbuv_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_subtract_sse2.asm b/vp9/encoder/x86/vp9_subtract_sse2.asm
index f84ed0697..739d9487e 100644
--- a/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ b/vp9/encoder/x86/vp9_subtract_sse2.asm
@@ -14,7 +14,7 @@
 ;void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
 ;                            short *diff, unsigned char *Predictor,
 ;                            int pitch);
-global sym(vp9_subtract_b_sse2_impl)
+global sym(vp9_subtract_b_sse2_impl) PRIVATE
 sym(vp9_subtract_b_sse2_impl):
     push        rbp
     mov         rbp, rsp
@@ -72,7 +72,7 @@ sym(vp9_subtract_b_sse2_impl):
 
 
 ;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_sse2)
+global sym(vp9_subtract_mby_sse2) PRIVATE
 sym(vp9_subtract_mby_sse2):
     push        rbp
     mov         rbp, rsp
@@ -146,7 +146,7 @@ sym(vp9_subtract_mby_sse2):
 
 
 ;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_sse2)
+global sym(vp9_subtract_mbuv_sse2) PRIVATE
 sym(vp9_subtract_mbuv_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
index 60cc80f15..a559d5d5a 100644
--- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
+++ b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
@@ -20,7 +20,7 @@
 ;   int             filter_weight,    |  5
 ;   unsigned int   *accumulator,      |  6
 ;   unsigned short *count)            |  7
-global sym(vp9_temporal_filter_apply_sse2)
+global sym(vp9_temporal_filter_apply_sse2) PRIVATE
 sym(vp9_temporal_filter_apply_sse2):
 
     push        rbp
diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
index 45c30b089..9f140c96b 100644
--- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
-global sym(vp9_get_mb_ss_mmx)
+global sym(vp9_get_mb_ss_mmx) PRIVATE
 sym(vp9_get_mb_ss_mmx):
     push        rbp
     mov         rbp, rsp
@@ -72,7 +72,7 @@ sym(vp9_get_mb_ss_mmx):
 ;    unsigned int *SSE,
 ;    int *Sum
 ;)
-global sym(vp9_get8x8var_mmx)
+global sym(vp9_get8x8var_mmx) PRIVATE
 sym(vp9_get8x8var_mmx):
     push        rbp
     mov         rbp, rsp
@@ -320,7 +320,7 @@ sym(vp9_get8x8var_mmx):
 ;    unsigned int *SSE,
 ;    int *Sum
 ;)
-global sym(vp9_get4x4var_mmx)
+global sym(vp9_get4x4var_mmx) PRIVATE
 sym(vp9_get4x4var_mmx):
     push        rbp
     mov         rbp, rsp
@@ -433,7 +433,7 @@ sym(vp9_get4x4var_mmx):
 ;    unsigned char *ref_ptr,
 ;    int  recon_stride
 ;)
-global sym(vp9_get4x4sse_cs_mmx)
+global sym(vp9_get4x4sse_cs_mmx) PRIVATE
 sym(vp9_get4x4sse_cs_mmx):
     push        rbp
     mov         rbp, rsp
@@ -522,7 +522,7 @@ sym(vp9_get4x4sse_cs_mmx):
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp9_filter_block2d_bil4x4_var_mmx)
+global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE
 sym(vp9_filter_block2d_bil4x4_var_mmx):
     push        rbp
     mov         rbp, rsp
@@ -667,7 +667,7 @@ sym(vp9_filter_block2d_bil4x4_var_mmx):
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp9_filter_block2d_bil_var_mmx)
+global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE
 sym(vp9_filter_block2d_bil_var_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
index 5b20f3b32..399926900 100644
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
@@ -17,7 +17,7 @@
 ;(
 ;    short *src_ptr
 ;)
-global sym(vp9_get_mb_ss_sse2)
+global sym(vp9_get_mb_ss_sse2) PRIVATE
 sym(vp9_get_mb_ss_sse2):
     push        rbp
     mov         rbp, rsp
@@ -80,7 +80,7 @@ sym(vp9_get_mb_ss_sse2):
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(vp9_get16x16var_sse2)
+global sym(vp9_get16x16var_sse2) PRIVATE
 sym(vp9_get16x16var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -224,7 +224,7 @@ sym(vp9_get16x16var_sse2):
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(vp9_get8x8var_sse2)
+global sym(vp9_get8x8var_sse2) PRIVATE
 sym(vp9_get8x8var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -413,7 +413,7 @@ sym(vp9_get8x8var_sse2):
 ;    unsigned int *sumsquared;;
 ;
 ;)
-global sym(vp9_filter_block2d_bil_var_sse2)
+global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
 sym(vp9_filter_block2d_bil_var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -690,7 +690,7 @@ filter_block2d_bil_variance:
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp9_half_horiz_vert_variance8x_h_sse2)
+global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
 sym(vp9_half_horiz_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -812,7 +812,7 @@ sym(vp9_half_horiz_vert_variance8x_h_sse2):
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp9_half_horiz_vert_variance16x_h_sse2)
+global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
 sym(vp9_half_horiz_vert_variance16x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -928,7 +928,7 @@ sym(vp9_half_horiz_vert_variance16x_h_sse2):
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp9_half_vert_variance8x_h_sse2)
+global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
 sym(vp9_half_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1035,7 +1035,7 @@ sym(vp9_half_vert_variance8x_h_sse2):
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp9_half_vert_variance16x_h_sse2)
+global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
 sym(vp9_half_vert_variance16x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1143,7 +1143,7 @@ sym(vp9_half_vert_variance16x_h_sse2):
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp9_half_horiz_variance8x_h_sse2)
+global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
 sym(vp9_half_horiz_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1248,7 +1248,7 @@ sym(vp9_half_horiz_variance8x_h_sse2):
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp9_half_horiz_variance16x_h_sse2)
+global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
 sym(vp9_half_horiz_variance16x_h_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm b/vp9/encoder/x86/vp9_variance_impl_ssse3.asm
index 30c75a6ae..98a4a16f6 100644
--- a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_ssse3.asm
@@ -29,7 +29,7 @@
 ;)
 ;Note: The filter coefficient at offset=0 is 128. Since the second register
 ;for Pmaddubsw is signed bytes, we must calculate zero offset seperately.
-global sym(vp9_filter_block2d_bil_var_ssse3)
+global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE
 sym(vp9_filter_block2d_bil_var_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c
index f52d6b52d..3beef53a2 100644
--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_ports/x86.h"
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/encoder/vp9_onyx_int.h"
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 4d17233e7..7662e404c 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -17,6 +17,7 @@ VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c
 VP9_COMMON_SRCS-yes += common/vp9_blockd.c
 VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h
 VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
+VP9_COMMON_SRCS-yes += common/vp9_default_coef_probs.h
 VP9_COMMON_SRCS-yes += common/vp9_entropy.c
 VP9_COMMON_SRCS-yes += common/vp9_entropymode.c
 VP9_COMMON_SRCS-yes += common/vp9_entropymv.c
@@ -56,6 +57,7 @@ VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
 VP9_COMMON_SRCS-yes += common/vp9_subpixel.h
 VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
+VP9_COMMON_SRCS-yes += common/vp9_textblit.h
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.h
 VP9_COMMON_SRCS-yes += common/vp9_invtrans.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
@@ -84,7 +86,6 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.c
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_idctllm_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 020a5a2d6..12d1ec4e7 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -31,6 +31,7 @@ VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c
 VP9_CX_SRCS-yes += encoder/vp9_dct.c
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
+VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
 VP9_CX_SRCS-yes += encoder/vp9_encodeintra.c
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.c
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.c
@@ -58,6 +59,7 @@ VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
 VP9_CX_SRCS-yes += encoder/vp9_modecosts.c
 VP9_CX_SRCS-yes += encoder/vp9_onyx_if.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
+VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
 VP9_CX_SRCS-yes += encoder/vp9_psnr.c
 VP9_CX_SRCS-yes += encoder/vp9_quantize.c
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
@@ -87,6 +89,7 @@ VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index e03e63cd4..7622fc0b2 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -21,6 +21,7 @@ VP9_DX_SRCS-yes += decoder/vp9_asm_dec_offsets.c
 VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.c
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
 VP9_DX_SRCS-yes += decoder/vp9_decodframe.c
+VP9_DX_SRCS-yes += decoder/vp9_decodframe.h
 VP9_DX_SRCS-yes += decoder/vp9_dequantize.c
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
 VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.h
@@ -35,9 +36,6 @@ VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
 
 VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
 
-VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/vp9_x86_dsystemdependent.c
-VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/vp9_dequantize_mmx.asm
-VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/vp9_idct_blk_mmx.c
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c
 
 $(eval $(call asm_offsets_template,\
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 368a92262..2e6f1e757 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -49,15 +49,22 @@ extern "C" {
 #ifndef DEPRECATED
 #if defined(__GNUC__) && __GNUC__
 #define DEPRECATED          __attribute__ ((deprecated))
-#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
 #elif defined(_MSC_VER)
 #define DEPRECATED
-#define DECLSPEC_DEPRECATED __declspec(deprecated) /**< \copydoc #DEPRECATED */
 #else
 #define DEPRECATED
-#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
 #endif
+#endif  /* DEPRECATED */
+
+#ifndef DECLSPEC_DEPRECATED
+#if defined(__GNUC__) && __GNUC__
+#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#elif defined(_MSC_VER)
+#define DECLSPEC_DEPRECATED __declspec(deprecated) /**< \copydoc #DEPRECATED */
+#else
+#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
 #endif
+#endif  /* DECLSPEC_DEPRECATED */
 
   /*!\brief Decorator indicating a function is potentially unused */
 #ifdef UNUSED
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index 427fd0f52..ffa123f12 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -11,6 +11,21 @@
 
 API_EXPORTS += exports
 
+API_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
+API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
+API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
+API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
+
+API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h
+API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
+API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8.h
+API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
+
+API_DOC_SRCS-yes += vpx_codec.h
+API_DOC_SRCS-yes += vpx_decoder.h
+API_DOC_SRCS-yes += vpx_encoder.h
+API_DOC_SRCS-yes += vpx_image.h
+
 API_SRCS-yes                += src/vpx_decoder.c
 API_SRCS-yes                += vpx_decoder.h
 API_SRCS-yes                += src/vpx_encoder.c
@@ -23,3 +38,4 @@ API_SRCS-yes                += vpx_codec.mk
 API_SRCS-yes                += vpx_codec_impl_bottom.h
 API_SRCS-yes                += vpx_codec_impl_top.h
 API_SRCS-yes                += vpx_image.h
+API_SRCS-$(BUILD_LIBVPX)    += vpx_integer.h
diff --git a/vpx_mem/include/vpx_mem_intrnl.h b/vpx_mem/include/vpx_mem_intrnl.h
index 0f58cfc8f..60b5165f3 100644
--- a/vpx_mem/include/vpx_mem_intrnl.h
+++ b/vpx_mem/include/vpx_mem_intrnl.h
@@ -11,7 +11,7 @@
 
 #ifndef __VPX_MEM_INTRNL_H__
 #define __VPX_MEM_INTRNL_H__
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 
 #ifndef CONFIG_MEM_MANAGER
 # if defined(VXWORKS)
diff --git a/vpx_mem/vpx_mem_tracker.c b/vpx_mem/vpx_mem_tracker.c
index 5b2103b55..613e8a16b 100644
--- a/vpx_mem/vpx_mem_tracker.c
+++ b/vpx_mem/vpx_mem_tracker.c
@@ -22,7 +22,7 @@
    in the memory_tracker struct as well as calls to create/destroy/lock/unlock
    the mutex in vpx_memory_tracker_init/Destroy and memory_tracker_lock_mutex/unlock_mutex
 */
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 
 #if defined(__uClinux__)
 # include <lddk.h>
diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c
index b23344858..3c916f247 100644
--- a/vpx_ports/arm_cpudetect.c
+++ b/vpx_ports/arm_cpudetect.c
@@ -136,7 +136,6 @@ int arm_cpu_caps(void) {
 
 #elif defined(__linux__) /* end __ANDROID__ */
 
-#elif defined(__linux__) /* end __ANDROID__ */
 #include <stdio.h>
 
 int arm_cpu_caps(void) {
diff --git a/vpx_ports/emmintrin_compat.h b/vpx_ports/emmintrin_compat.h
new file mode 100644
index 000000000..782d603af
--- /dev/null
+++ b/vpx_ports/emmintrin_compat.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_EMMINTRIN_COMPAT_H
+#define VPX_PORTS_EMMINTRIN_COMPAT_H
+/* NOTE(review): presumably requires <emmintrin.h> first for __m128* - confirm */
+#if defined(__GNUC__) && __GNUC__ < 4
+/* From emmintrin.h (gcc 4.5.3); defined here only when __GNUC__ < 4. */
+/* Casts between various SP, DP, INT vector types.  Note that these do no
+   conversion of values; they just change the type.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ps(__m128d __A)  /* __m128d -> __m128 */
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_si128(__m128d __A)  /* __m128d -> __m128i */
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pd(__m128 __A)  /* __m128 -> __m128d */
+{
+  return (__m128d) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_si128(__m128 __A)  /* __m128 -> __m128i */
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ps(__m128i __A)  /* __m128i -> __m128 */
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pd(__m128i __A)  /* __m128i -> __m128d */
+{
+  return (__m128d) __A;
+}
+#endif  /* defined(__GNUC__) && __GNUC__ < 4 */
+
+#endif  /* VPX_PORTS_EMMINTRIN_COMPAT_H */
diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk
new file mode 100644
index 000000000..e6cb52fb4
--- /dev/null
+++ b/vpx_ports/vpx_ports.mk
@@ -0,0 +1,26 @@
+##
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+# File list for vpx_ports: each entry is appended to PORTS_SRCS-<value of its flag>.
+PORTS_SRCS-yes += vpx_ports.mk
+
+PORTS_SRCS-$(BUILD_LIBVPX) += asm_offsets.h
+PORTS_SRCS-$(BUILD_LIBVPX) += mem.h
+PORTS_SRCS-$(BUILD_LIBVPX) += vpx_timer.h
+# x86-only files: the test passes when the two flags concatenate to exactly "yes".
+ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
+PORTS_SRCS-$(BUILD_LIBVPX) += emms.asm
+PORTS_SRCS-$(BUILD_LIBVPX) += x86.h
+PORTS_SRCS-$(BUILD_LIBVPX) += x86_abi_support.asm
+PORTS_SRCS-$(BUILD_LIBVPX) += x86_cpuid.c
+endif
+
+PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c
+PORTS_SRCS-$(ARCH_ARM) += arm.h
diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
index 11d3fd96d..76c11e792 100644
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk
@@ -5,7 +5,9 @@ SCALE_SRCS-yes += generic/vpx_scale.c
 SCALE_SRCS-yes += generic/yv12config.c
 SCALE_SRCS-yes += generic/yv12extend.c
 SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
+SCALE_SRCS-yes += vpx_scale_asm_offsets.c
 SCALE_SRCS-yes += vpx_scale_rtcd.c
+SCALE_SRCS-yes += vpx_scale_rtcd.sh
 
 #neon
 SCALE_SRCS-$(HAVE_NEON)  += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
diff --git a/vpxenc.c b/vpxenc.c
index af6db9108..3fc8da1fe 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -23,7 +23,9 @@
 #include <limits.h>
 #include <assert.h>
 #include "vpx/vpx_encoder.h"
+#if CONFIG_DECODERS
 #include "vpx/vpx_decoder.h"
+#endif
 #if USE_POSIX_MMAP
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -2174,6 +2176,7 @@ static void initialize_encoder(struct stream_state  *stream,
     ctx_exit_on_error(&stream->encoder, "Failed to control codec");
   }
 
+#if CONFIG_DECODERS
   if (global->test_decode) {
     int width, height;
 
@@ -2186,6 +2189,7 @@ static void initialize_encoder(struct stream_state  *stream,
     stream->ref_enc.frame_type = VP8_LAST_FRAME;
     stream->ref_dec.frame_type = VP8_LAST_FRAME;
   }
+#endif
 }
 
 
@@ -2278,16 +2282,19 @@ static void get_cx_data(struct stream_state  *stream,
         stream->nbytes += pkt->data.raw.sz;
 
         *got_data = 1;
+#if CONFIG_DECODERS
         if (global->test_decode) {
           vpx_codec_decode(&stream->decoder, pkt->data.frame.buf,
                            pkt->data.frame.sz, NULL, 0);
           ctx_exit_on_error(&stream->decoder, "Failed to decode frame");
         }
+#endif
         break;
       case VPX_CODEC_STATS_PKT:
         stream->frames_out++;
-        fprintf(stderr, " %6luS",
-                (unsigned long)pkt->data.twopass_stats.sz);
+        if (!global->quiet)
+          fprintf(stderr, " %6luS",
+                  (unsigned long)pkt->data.twopass_stats.sz);
         stats_write(&stream->stats,
                     pkt->data.twopass_stats.buf,
                     pkt->data.twopass_stats.sz);