135 files changed, 8335 insertions, 4635 deletions
@@ -1,4 +1,4 @@ -Copyright (c) 2010, Google Inc. All rights reserved. +Copyright (c) 2010, The WebM Project authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -12,9 +12,10 @@ met: the documentation and/or other materials provided with the distribution. - * Neither the name of Google nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + * Neither the name of Google, nor the WebM Project, nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -57,7 +57,7 @@ int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) } else if (def->long_name) { - int name_len = strlen(def->long_name); + const size_t name_len = strlen(def->long_name); if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index 388133aa2..cea967f93 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -126,15 +126,14 @@ while (<STDIN>) # ALIGN directive s/ALIGN/.balign/g; - # Strip ARM - s/\sARM/@ ARM/g; + # ARM code + s/\sARM/.arm/g; - # Strip REQUIRE8 - #s/\sREQUIRE8/@ REQUIRE8/g; - s/\sREQUIRE8/@ /g; #EQU cause problem + # REQUIRE8 Stack is required to be 8-byte aligned + s/\sREQUIRE8/.eabi_attribute Tag_ABI_align_needed, 1/g; - # Strip PRESERVE8 - s/\sPRESERVE8/@ PRESERVE8/g; + # PRESERVE8 Stack 8-byte align is preserved + s/\sPRESERVE8/.eabi_attribute Tag_ABI_align_preserved, 1/g; # Use PROC and ENDP to give the symbols a .size directive. # This makes them show up properly in debugging tools like gdb and valgrind. 
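The arg_match() hunk above changes name_len from int to const size_t so it matches the return type of strlen(). A minimal sketch of the long-option check that comparison feeds, written as a hypothetical standalone helper (matches_long_name is not a function in the tree; it only illustrates the guard shown in the hunk):

#include <string.h>

/* Hypothetical sketch: a token matches "--long_name" when it is at
 * least name_len + 2 characters ("--" plus the name), starts with
 * "--", and the name is followed by '=' or the end of the token.
 * name_len is size_t, matching what strlen() returns. */
static int matches_long_name(const char *token, const char *long_name) {
    const size_t name_len = strlen(long_name);
    return strlen(token) >= name_len + 2 &&
           token[0] == '-' && token[1] == '-' &&
           strncmp(token + 2, long_name, name_len) == 0 &&
           (token[name_len + 2] == '\0' || token[name_len + 2] == '=');
}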
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index 78f4a97f5..81280bf78 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -30,6 +30,8 @@ my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8 my @incoming_array; +my @imported_functions; + # Perl trim function to remove whitespace from the start and end of the string sub trim($) { @@ -132,7 +134,18 @@ while (<STDIN>) # Make function visible to linker, and make additional symbol with # prepended underscore s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/; - s/IMPORT\s+\|([\$\w]*)\|/.globl $1/; + + # Prepend imported functions with _ + if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/) + { + $function = trim($1); + push(@imported_functions, $function); + } + + foreach $function (@imported_functions) + { + s/$function/_$function/; + } # No vertical bars required; make additional symbol with prepended # underscore @@ -157,8 +170,8 @@ while (<STDIN>) s/\sPRESERVE8/@ PRESERVE8/g; # Strip PROC and ENDPROC - s/PROC/@/g; - s/ENDP/@/g; + s/\bPROC\b/@/g; + s/\bENDP\b/@/g; # EQU directive s/(.*)EQU(.*)/.set $1, $2/; diff --git a/build/make/configure.sh b/build/make/configure.sh index 1279f781a..0426f9220 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -561,6 +561,10 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=darwin10 ;; + *darwin11*) + tgt_isa=x86_64 + tgt_os=darwin11 + ;; *mingw32*|*cygwin*) [ -z "$tgt_isa" ] && tgt_isa=x86 tgt_os=win32 @@ -617,6 +621,9 @@ process_common_toolchain() { if [ -d "/Developer/SDKs/MacOSX10.6.sdk" ]; then osx_sdk_dir="/Developer/SDKs/MacOSX10.6.sdk" fi + if [ -d "/Developer/SDKs/MacOSX10.7.sdk" ]; then + osx_sdk_dir="/Developer/SDKs/MacOSX10.7.sdk" + fi case ${toolchain} in *-darwin8-*) @@ -637,6 +644,12 @@ process_common_toolchain() { add_ldflags "-isysroot ${osx_sdk_dir}" add_ldflags "-mmacosx-version-min=10.6" ;; + *-darwin11-*) + add_cflags "-isysroot ${osx_sdk_dir}" + add_cflags "-mmacosx-version-min=10.7" + add_ldflags "-isysroot ${osx_sdk_dir}" + add_ldflags "-mmacosx-version-min=10.7" + ;; esac # Handle Solaris variants. Solaris 10 needs -lposix4 @@ -732,7 +745,7 @@ process_common_toolchain() { TOOLCHAIN_PATH=${SDK_PATH}/usr/bin CC=${TOOLCHAIN_PATH}/gcc AR=${TOOLCHAIN_PATH}/ar - LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1 + LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-llvm-gcc-4.2 AS=${TOOLCHAIN_PATH}/as STRIP=${TOOLCHAIN_PATH}/strip NM=${TOOLCHAIN_PATH}/nm @@ -746,13 +759,13 @@ process_common_toolchain() { add_cflags -arch ${tgt_isa} add_ldflags -arch_only ${tgt_isa} - add_cflags "-isysroot ${SDK_PATH}/SDKs/iPhoneOS4.3.sdk" + add_cflags "-isysroot ${SDK_PATH}/SDKs/iPhoneOS5.0.sdk" # This should be overridable - alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.3.sdk + alt_libc=${SDK_PATH}/SDKs/iPhoneOS5.0.sdk # Add the paths for the alternate libc - for d in usr/include usr/include/gcc/darwin/4.2/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do + for d in usr/include; do try_dir="${alt_libc}/${d}" [ -d "${try_dir}" ] && add_cflags -I"${try_dir}" done @@ -35,7 +35,7 @@ Advanced options: ${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders) ${toggle_mem_tracker} track memory usage ${toggle_postproc} postprocessing - ${toggle_multithread} multithreaded encoding and decoding. 
+ ${toggle_multithread} multithreaded encoding and decoding ${toggle_spatial_resampling} spatial sampling (scaling) support ${toggle_realtime_only} enable this option while building for real-time encoding ${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses @@ -44,6 +44,7 @@ Advanced options: ${toggle_static} static library support ${toggle_small} favor smaller size over speed ${toggle_postproc_visualizer} macro block / block level visualizers + ${toggle_multi_res_encoding} enable multiple-resolution encoding Codecs: Codecs can be selectively enabled or disabled individually, or by family: @@ -118,9 +119,11 @@ all_platforms="${all_platforms} x86-win32-vs8" all_platforms="${all_platforms} x86-win32-vs9" all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" +all_platforms="${all_platforms} x86_64-darwin11-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-solaris-gcc" +all_platforms="${all_platforms} x86_64-win64-gcc" all_platforms="${all_platforms} x86_64-win64-vs8" all_platforms="${all_platforms} x86_64-win64-vs9" all_platforms="${all_platforms} universal-darwin8-gcc" @@ -261,6 +264,7 @@ CONFIG_LIST=" postproc_visualizer os_support unit_tests + multi_res_encoding " CMDLINE_SELECT=" extra_warnings @@ -303,6 +307,7 @@ CMDLINE_SELECT=" small postproc_visualizer unit_tests + multi_res_encoding " process_cmdline() { diff --git a/examples.mk b/examples.mk index 8088d3217..f6c904588 100644 --- a/examples.mk +++ b/examples.mk @@ -96,6 +96,17 @@ GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame +# C file is provided, not generated automatically. +GEN_EXAMPLES-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c +vp8_multi_resolution_encoder.SRCS \ + += third_party/libyuv/include/libyuv/basic_types.h \ + third_party/libyuv/include/libyuv/cpu_id.h \ + third_party/libyuv/include/libyuv/scale.h \ + third_party/libyuv/source/row.h \ + third_party/libyuv/source/scale.c \ + third_party/libyuv/source/cpu_id.c +vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de +vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding # Handle extra library flags depending on codec configuration diff --git a/third_party/libyuv/README.webm b/third_party/libyuv/README.webm new file mode 100644 index 000000000..d3495caa1 --- /dev/null +++ b/third_party/libyuv/README.webm @@ -0,0 +1,17 @@ +Name: libyuv +URL: http://code.google.com/p/libyuv/ +Version: 102 +License: BSD +License File: LICENSE + +Description: +libyuv is an open source project that includes YUV conversion and scaling +functionality. + +The optimized scaler in libyuv is used in the multiple-resolution encoder example, +which down-samples the original input video (e.g. 1280x720) a number of times +in order to encode multiple resolution bit streams. + +Local Modifications: +Modified the original scaler code from C++ to C to fit in our current build +system. This is a temporary solution, and will be improved later.
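As the README above notes, the example uses libyuv's scaler to down-sample the input frame. A minimal sketch of one such downscale through the I420Scale() API this commit imports (declared in scale.h below); the 1280x720 dimensions, contiguous packed planes, and the helper name downscale_half are illustrative, not taken from the example itself:

#include "third_party/libyuv/include/libyuv/scale.h"

/* Illustrative 2x downscale of one I420 frame (1280x720 -> 640x360).
 * Assumes caller-allocated planes with stride == plane width.
 * Returns 0 on success, per the I420Scale() contract. */
static int downscale_half(const uint8* src_y, const uint8* src_u,
                          const uint8* src_v, uint8* dst_y,
                          uint8* dst_u, uint8* dst_v) {
    const int sw = 1280, sh = 720;        /* source dimensions (example) */
    const int dw = sw / 2, dh = sh / 2;   /* destination dimensions      */
    return I420Scale(src_y, sw,           /* Y plane, full-width stride  */
                     src_u, sw / 2,       /* U plane, half-width stride  */
                     src_v, sw / 2,       /* V plane, half-width stride  */
                     sw, sh,
                     dst_y, dw,
                     dst_u, dw / 2,
                     dst_v, dw / 2,
                     dw, dh,
                     kFilterBox);         /* highest-quality filter mode */
}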
\ No newline at end of file diff --git a/third_party/libyuv/include/libyuv/basic_types.h b/third_party/libyuv/include/libyuv/basic_types.h new file mode 100644 index 000000000..30504ce66 --- /dev/null +++ b/third_party/libyuv/include/libyuv/basic_types.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include <stddef.h> // for NULL, size_t + +#if !(defined(_MSC_VER) && (_MSC_VER < 1600)) +#include <stdint.h> // for uintptr_t +#endif + +#ifndef INT_TYPES_DEFINED +#define INT_TYPES_DEFINED +#ifdef COMPILER_MSVC +typedef unsigned __int64 uint64; +typedef __int64 int64; +#ifndef INT64_C +#define INT64_C(x) x ## I64 +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UI64 +#endif +#define INT64_F "I64" +#else // COMPILER_MSVC +#ifdef __LP64__ +typedef unsigned long uint64; +typedef long int64; +#ifndef INT64_C +#define INT64_C(x) x ## L +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UL +#endif +#define INT64_F "l" +#else // __LP64__ +typedef unsigned long long uint64; +typedef long long int64; +#ifndef INT64_C +#define INT64_C(x) x ## LL +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## ULL +#endif +#define INT64_F "ll" +#endif // __LP64__ +#endif // COMPILER_MSVC +typedef unsigned int uint32; +typedef int int32; +typedef unsigned short uint16; +typedef short int16; +typedef unsigned char uint8; +typedef char int8; +#endif // INT_TYPES_DEFINED + +// Detect compiler is for x86 or x64. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) +#define CPU_X86 1 +#endif + +#define ALIGNP(p, t) \ + ((uint8*)((((uintptr_t)(p) + \ + ((t)-1)) & ~((t)-1)))) + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h new file mode 100644 index 000000000..4a53b5bef --- /dev/null +++ b/third_party/libyuv/include/libyuv/cpu_id.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#define INCLUDE_LIBYUV_CPU_ID_H_ + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// These flags are only valid on x86 processors +static const int kCpuHasSSE2 = 1; +static const int kCpuHasSSSE3 = 2; + +// These flags are only valid on ARM processors +static const int kCpuHasNEON = 4; + +// Internal flag to indicate cpuid is initialized. +static const int kCpuInitialized = 8; + +// Detect CPU has SSE2 etc. +// test_flag parameter should be one of kCpuHas constants above +// returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + extern int cpu_info_; + extern int InitCpuFlags(); + return (cpu_info_ ? 
cpu_info_ : InitCpuFlags()) & test_flag; +} + +// For testing, allow CPU flags to be disabled. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. +// -1 to enable all cpu specific optimizations. +// 0 to disable all cpu specific optimizations. +void MaskCpuFlags(int enable_flags); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h new file mode 100644 index 000000000..21fe360ce --- /dev/null +++ b/third_party/libyuv/include/libyuv/scale.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported filtering +typedef enum { + kFilterNone = 0, // Point sample; Fastest + kFilterBilinear = 1, // Faster than box, but lower quality scaling down. + kFilterBox = 2 // Highest quality +}FilterMode; + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce ever better +// quality image, at further expense of speed. +// Returns 0 if successful. + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering); + +// Legacy API. Deprecated +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + int interpolate); + +// Legacy API. Deprecated +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + int interpolate); + +// For testing, allow disabling of optimizations. +void SetUseReferenceImpl(int use); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/third_party/libyuv/source/cpu_id.c b/third_party/libyuv/source/cpu_id.c new file mode 100644 index 000000000..fccf3dd44 --- /dev/null +++ b/third_party/libyuv/source/cpu_id.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/cpu_id.h" + +#ifdef _MSC_VER +#include <intrin.h> +#endif +#ifdef __ANDROID__ +#include <cpu-features.h> +#endif + +#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86 + +// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. +#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) +static inline void __cpuid(int cpu_info[4], int info_type) { + asm volatile ( + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#elif defined(__i386__) || defined(__x86_64__) +static inline void __cpuid(int cpu_info[4], int info_type) { + asm volatile ( + "cpuid \n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// CPU detect function for SIMD instruction sets. +int cpu_info_ = 0; + +int InitCpuFlags() { +#ifdef CPU_X86 + int cpu_info[4]; + __cpuid(cpu_info, 1); + cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) | + (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) | + kCpuInitialized; +#elif defined(__ANDROID__) && defined(__ARM_NEON__) + uint64_t features = android_getCpuFeatures(); + cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) | + kCpuInitialized; +#elif defined(__ARM_NEON__) + // gcc -mfpu=neon defines __ARM_NEON__ + // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags + // to disable Neon on devices that do not have it. + cpu_info_ = kCpuHasNEON | kCpuInitialized; +#else + cpu_info_ = kCpuInitialized; +#endif + return cpu_info_; +} + +void MaskCpuFlags(int enable_flags) { + InitCpuFlags(); + cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row.h b/third_party/libyuv/source/row.h new file mode 100644 index 000000000..eabe18094 --- /dev/null +++ b/third_party/libyuv/source/row.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef LIBYUV_SOURCE_ROW_H_ +#define LIBYUV_SOURCE_ROW_H_ + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +#define kMaxStride (2048 * 4) +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) + +#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR) +#define YUV_DISABLE_ASM +#endif + +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_FASTCONVERTYUVTOARGBROW_NEON +void FastConvertYUVToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#define HAS_FASTCONVERTYUVTOBGRAROW_NEON +void FastConvertYUVToBGRARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#define HAS_FASTCONVERTYUVTOABGRROW_NEON +void FastConvertYUVToABGRRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#endif + +// The following are available on all x86 platforms +#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(YUV_DISABLE_ASM) +#define HAS_ABGRTOARGBROW_SSSE3 +#define HAS_BGRATOARGBROW_SSSE3 +#define HAS_BG24TOARGBROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOUVROW_SSSE3 +#define HAS_RAWTOUVROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_FASTCONVERTYTOARGBROW_SSE2 +#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3 +#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3 +#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3 +#define HAS_REVERSE_ROW_SSSE3 +#endif + +// The following are available on Neon platforms +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_REVERSE_ROW_NEON +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3) +#define HASRGB24TOYROW_SSSE3 +#endif +#ifdef HASRGB24TOYROW_SSSE3 +void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#ifdef HAS_REVERSE_ROW_SSSE3 +void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width); +#endif +#ifdef HAS_REVERSE_ROW_NEON +void ReverseRow_NEON(const uint8* src, uint8* dst, int width); +#endif +void ReverseRow_C(const uint8* src, uint8* dst, int width); + +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); 
+void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + +#ifdef HAS_BG24TOARGBROW_SSSE3 +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix); +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +#endif +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix); +void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +#endif +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +typedef __declspec(align(16)) signed char vec8[16]; +typedef __declspec(align(16)) unsigned char uvec8[16]; +typedef __declspec(align(16)) signed short vec16[8]; +#else // __GNUC__ +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +typedef signed char __attribute__((vector_size(16))) vec8; +typedef unsigned char __attribute__((vector_size(16))) uvec8; +typedef signed short __attribute__((vector_size(16))) vec16; +#endif + +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); + +void FastConvertYUVToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToARGBRow_C(const uint8* y_buf, + uint8* rgb_buf, + int width); + +#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2 +void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void 
FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); +#endif + +#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +#endif + +#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 +void FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // LIBYUV_SOURCE_ROW_H_ diff --git a/third_party/libyuv/source/scale.c b/third_party/libyuv/source/scale.c new file mode 100644 index 000000000..930a7ae09 --- /dev/null +++ b/third_party/libyuv/source/scale.c @@ -0,0 +1,3884 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "third_party/libyuv/include/libyuv/cpu_id.h" +#include "third_party/libyuv/source/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +/* + * Note: Defining YUV_DISABLE_ASM allows to use c version. + */ +//#define YUV_DISABLE_ASM + +#if defined(_MSC_VER) +#define ALIGN16(var) __declspec(align(16)) var +#else +#define ALIGN16(var) var __attribute__((aligned(16))) +#endif + +// Note: A Neon reference manual +// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html +// Note: Some SSE2 reference manuals +// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf + +// Set the following flag to true to revert to only +// using the reference implementation ScalePlaneBox(), and +// NOT the optimized versions. Useful for debugging and +// when comparing the quality of the resulting YUV planes +// as produced by the optimized and non-optimized versions. + +static int use_reference_impl_ = 0; + +void SetUseReferenceImpl(int use) { + use_reference_impl_ = use; +} + +// ScaleRowDown2Int also used by planar functions + +/** + * NEON downscalers with interpolation. + * + * Provided by Fritz Koenig + * + */ + +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_SCALEROWDOWN2_NEON +void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 + "vst1.u8 {q0}, [%1]! \n" // store even pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "add %1, %0 \n" // change the stride to row 2 pointer + "1: \n" + "vld1.u8 {q0,q1}, [%0]! 
\n" // load row 1 and post increment + "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.u8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define HAS_SCALEROWDOWN4_NEON +static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {d0, d1}, [%0]! \n" + "vtrn.u8 d1, d0 \n" + "vshrn.u16 d0, q0, #8 \n" + "vst1.u32 {d0[1]}, [%1]! \n" + + "subs %2, #4 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "add r4, %0, %3 \n" + "add r5, r4, %3 \n" + "add %3, r5, %3 \n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data + "vld1.u8 {q1}, [r4]! \n" + "vld1.u8 {q2}, [r5]! \n" + "vld1.u8 {q3}, [%3]! \n" + + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + + "vpaddl.u16 q0, q0 \n" + + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + + "vmovn.u16 d0, q0 \n" + "vst1.u32 {d0[0]}, [%1]! \n" + + "subs %2, #4 \n" + "bhi 1b \n" + + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(src_stride) // %3 + : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN34_NEON +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vmov d2, d3 \n" // order needs to be d0, d1, d2 + "vst3.u8 {d0, d1, d2}, [%1]! \n" + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +const uint8 shuf38[16] __attribute__ ((aligned(16))) = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +const uint8 shuf38_2[16] __attribute__ ((aligned(16))) = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +static void ScaleRowDown38_NEON(const uint8* src_ptr, int, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u8 {q3}, [%3] \n" + "1: \n" + "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.u8 {d4}, [%1]! \n" + "vst1.u32 {d5[0]}, [%1]! 
\n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(shuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "vld1.u8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(mult38_div6), // %4 + "r"(shuf38_2), // %5 + "r"(mult38_div9) // %6 + : "r4", "q0", "q1", "q2", "q3", "q8", "q9", + "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! 
\n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(mult38_div6), // %4 + "r"(shuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +/** + * SSE2 downscalers with interpolation. + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ + +// Constants for SSE2 code +#elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \ + !defined(YUV_DISABLE_ASM) +#if defined(_MSC_VER) +#define TALIGN16(t, var) __declspec(align(16)) t _ ## var +#elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#else +#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) +#endif + +#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \ + defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".globl _" #name " \n" \ +"_" #name ": \n" +#else +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".global " #name " \n" \ +#name ": \n" +#endif + + +// Offsets for source bytes 0 to 9 +//extern "C" +TALIGN16(const uint8, shuf0[16]) = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +//extern "C" +TALIGN16(const uint8, shuf1[16]) = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+//extern "C" +TALIGN16(const uint8, shuf2[16]) = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +//extern "C" +TALIGN16(const uint8, shuf01[16]) = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +//extern "C" +TALIGN16(const uint8, shuf11[16]) = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +//extern "C" +TALIGN16(const uint8, shuf21[16]) = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +//extern "C" +TALIGN16(const uint8, madd01[16]) = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +//extern "C" +TALIGN16(const uint8, madd11[16]) = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +//extern "C" +TALIGN16(const uint8, madd21[16]) = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +//extern "C" +TALIGN16(const int16, round34[8]) = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +//extern "C" +TALIGN16(const uint8, shuf38a[16]) = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +//extern "C" +TALIGN16(const uint8, shuf38b[16]) = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +//extern "C" +TALIGN16(const uint8, shufac0[16]) = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +//extern "C" +TALIGN16(const uint8, shufac3[16]) = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +//extern "C" +TALIGN16(const uint16, scaleac3[8]) = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +//extern "C" +TALIGN16(const uint8, shufab0[16]) = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +//extern "C" +TALIGN16(const uint8, shufab1[16]) = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +//extern "C" +TALIGN16(const uint8, shufab2[16]) = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +//extern "C" +TALIGN16(const uint16, scaleab2[8]) = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +#endif + +#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) + +#define HAS_SCALEROWDOWN2_SSE2 +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + ret + } +} +// Blends 32x2 rectangle to 16x1. 
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) +void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + pop esi + ret + } +} + +#define HAS_SCALEROWDOWN4_SSE2 +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + psrld xmm5, 24 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + lea edx, [ebx + ebx * 2] // src_stride * 3 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN8_SSE2 +// Point samples 32 pixels to 4 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 
+__declspec(naked) +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes + psrlq xmm5, 56 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 // 32->16 + packuswb xmm0, xmm0 // 16->8 + packuswb xmm0, xmm0 // 8->4 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +// Blends 32x8 rectangle to 4x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. +__declspec(naked) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + lea edx, [ebx + ebx * 2] // src_stride * 3 + pxor xmm7, xmm7 + + wloop: + movdqa xmm0, [esi] // average 8 rows to 1 + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea ebp, [esi + ebx * 4] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, [ebp] + movdqa xmm3, [ebp + 16] + movdqa xmm4, [ebp + ebx] + movdqa xmm5, [ebp + ebx + 16] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + movdqa xmm4, [ebp + ebx * 2] + movdqa xmm5, [ebp + ebx * 2 + 16] + movdqa xmm6, [ebp + edx] + pavgb xmm4, xmm6 + movdqa xmm6, [ebp + edx + 16] + pavgb xmm5, xmm6 + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + psadbw xmm0, xmm7 // average 32 pixels to 4 + psadbw xmm1, xmm7 + pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 + pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx + por xmm0, xmm1 // -> 3201 + psrlw xmm0, 3 + packuswb xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN34_SSSE3 +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm3, _shuf0 + movdqa xmm4, _shuf1 + movdqa xmm5, _shuf2 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edi], xmm0 + movq qword ptr [edi + 8], xmm1 + movq qword ptr [edi + 16], xmm2 + lea edi, [edi + 24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. 
+ +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 round34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN38_SSSE3 +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shuf38a + movdqa xmm5, _shuf38b + + xloop: + movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 + lea esi, [esi + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + movq 
qword ptr [edi], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edi + 8], xmm1 + lea edi, [edi + 12] + sub ecx, 12 + ja xloop + + popad + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufac0 + movdqa xmm5, _shufac3 + movdqa xmm6, _scaleac3 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 + movdqa xmm2, [esi + edx] + movhlps xmm1, xmm0 + movhlps xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + movdqa xmm2, [esi + edx * 2] + lea esi, [esi + 16] + movhlps xmm3, xmm2 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + + movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + pshufb xmm2, xmm4 + + movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + pshufb xmm3, xmm5 + paddusw xmm2, xmm3 + + pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 + packuswb xmm2, xmm2 + + movd [edi], xmm2 // write 6 pixels + pextrw eax, xmm2, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufab0 + movdqa xmm5, _shufab1 + movdqa xmm6, _shufab2 + movdqa xmm7, _scaleab2 + + xloop: + movdqa xmm2, [esi] // average 2 rows into xmm2 + pavgb xmm2, [esi + edx] + lea esi, [esi + 16] + + movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 + pshufb xmm0, xmm4 + movdqa xmm1, xmm2 + pshufb xmm1, xmm5 + paddusw xmm0, xmm1 + pshufb xmm2, xmm6 + paddusw xmm0, xmm2 + + pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 + packuswb xmm0, xmm0 + + movd [edi], xmm0 // write 6 pixels + pextrw eax, xmm0, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +#define HAS_SCALEADDROWS_SSE2 + +// Reads 8xN bytes and produces 16 shorts at a time. 
+__declspec(naked) +static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + mov ebx, [esp + 32 + 20] // height + pxor xmm5, xmm5 + dec ebx + + xloop: + // first row + movdqa xmm2, [esi] + lea eax, [esi + edx] + movhlps xmm3, xmm2 + mov ebp, ebx + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + // sum remaining rows + yloop: + movdqa xmm0, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movhlps xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + paddusw xmm2, xmm0 // sum 16 words + paddusw xmm3, xmm1 + sub ebp, 1 + ja yloop + + movdqa [edi], xmm2 + movdqa [edi + 16], xmm3 + lea edi, [edi + 32] + lea esi, [esi + 16] + + sub ecx, 16 + ja xloop + + popad + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. +#define HAS_SCALEFILTERROWS_SSE2 +__declspec(naked) +static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + movd xmm6, eax // xmm6 = y fraction + punpcklwd xmm6, xmm6 + pshufd xmm6, xmm6, 0 + neg eax // xmm5 = 256 - y fraction + add eax, 256 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm2, xmm7 + punpckhbw xmm1, xmm7 + punpckhbw xmm3, xmm7 + pmullw xmm0, xmm5 // scale row 0 + pmullw xmm1, xmm5 + pmullw xmm2, xmm6 // scale row 1 + pmullw xmm3, xmm6 + paddusw xmm0, xmm2 // sum rows + paddusw xmm1, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. 
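+// Instead of widening to 16 bits and blending with two pmullw per register
+// as the SSE2 version above does, this variant interleaves the two rows
+// bytewise and blends with a single pmaddubsw. The fraction is halved to
+// 7 bits so the pair (128 - f, f) fits in two unsigned bytes; per pixel:
+//
+//   dst[i] = (src0[i] * (128 - f) + src1[i] * f) >> 7;  // f = fraction / 2
+//
+// For even fractions this is exactly the SSE2 result.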
+#define HAS_SCALEFILTERROWS_SSSE3 +__declspec(naked) +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + shr eax, 1 + mov ah,al + neg al + add al, 128 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + movdqa xmm1, _round34 + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _madd21 + + wloop: + movdqa xmm0, [eax] // pixels 0..7 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax+8] // pixels 8..15 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+8], xmm0 + movdqa xmm0, [eax+16] // pixels 16..23 + lea eax, [eax+32] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xmm7 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + ja wloop + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) + +// GCC versions of row functions are verbatim conversions from Visual C. 
+// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt +#define HAS_SCALEROWDOWN2_SSE2 +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +); +} + +#define HAS_SCALEROWDOWN4_SSE2 +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t temp = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea (%4,%4,2),%3 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%4,2),%%xmm2 \n" + "movdqa 0x10(%0,%4,2),%%xmm3 \n" + "movdqa (%0,%3,1),%%xmm4 \n" + "movdqa 0x10(%0,%3,1),%%xmm5 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(temp) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__x86_64__) + , "xmm6", "xmm7" +#endif +); +} + +#define HAS_SCALEROWDOWN8_SSE2 +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, 
int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlq $0x38,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +#if defined(__i386__) +void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown8Int_SSE2) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "lea (%ebx,%ebx,2),%edx \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "movdqa (%esi,%ebx,1),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,1),%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa (%esi,%ebx,2),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,2),%xmm3 \n" + "movdqa (%esi,%edx,1),%xmm4 \n" + "movdqa 0x10(%esi,%edx,1),%xmm5 \n" + "lea (%esi,%ebx,4),%ebp \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa 0x0(%ebp),%xmm2 \n" + "movdqa 0x10(%ebp),%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n" + "movdqa 0x0(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm4 \n" + "movdqa 0x10(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "psadbw %xmm7,%xmm0 \n" + "psadbw %xmm7,%xmm1 \n" + "pshufd $0xd8,%xmm0,%xmm0 \n" + "pshufd $0x8d,%xmm1,%xmm1 \n" + "por %xmm1,%xmm0 \n" + "psrlw $0x3,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "lea 0x4(%edi),%edi \n" + "sub $0x4,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +// fpic is used for magiccam plugin +#if !defined(__PIC__) +#define HAS_SCALEROWDOWN34_SSSE3 +void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown34_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf0,%xmm3 \n" + "movdqa _shuf1,%xmm4 \n" + "movdqa _shuf2,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm2 \n" + "lea 0x20(%esi),%esi \n" + "movdqa %xmm2,%xmm1 \n" + "palignr $0x8,%xmm0,%xmm1 \n" + "pshufb %xmm3,%xmm0 \n" + "pshufb %xmm4,%xmm1 \n" + "pshufb %xmm5,%xmm2 \n" + "movq %xmm0,(%edi) \n" + "movq %xmm1,0x8(%edi) \n" + "movq %xmm2,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw 
%xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + "movdqa 0x10(%esi,%ebp),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + + "popa \n" + "ret \n" +); + +void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + "movdqa 0x10(%esi,%ebp,1),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +#define HAS_SCALEROWDOWN38_SSSE3 +void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown38_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf38a ,%xmm4 \n" + "movdqa _shuf38b ,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pshufb %xmm4,%xmm0 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusb %xmm1,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movhlps %xmm0,%xmm1 \n" + "movd %xmm1,0x8(%edi) \n" + "lea 0xc(%edi),%edi \n" + "sub $0xc,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufac0,%xmm4 \n" + "movdqa _shufac3,%xmm5 \n" + "movdqa _scaleac3,%xmm6 \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "movhlps %xmm0,%xmm1 \n" + "movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm1 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw 
%xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa (%esi,%edx,2),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "pshufb %xmm4,%xmm2 \n" + "movdqa %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "pshufb %xmm5,%xmm3 \n" + "paddusw %xmm3,%xmm2 \n" + "pmulhuw %xmm6,%xmm2 \n" + "packuswb %xmm2,%xmm2 \n" + "movd %xmm2,(%edi) \n" + "pextrw $0x2,%xmm2,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufab0,%xmm4 \n" + "movdqa _shufab1,%xmm5 \n" + "movdqa _shufab2,%xmm6 \n" + "movdqa _scaleab2,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm2 \n" + "pavgb (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm2,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa %xmm2,%xmm1 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusw %xmm1,%xmm0 \n" + "pshufb %xmm6,%xmm2 \n" + "paddusw %xmm2,%xmm0 \n" + "pmulhuw %xmm7,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "pextrw $0x2,%xmm0,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); +#endif // __PIC__ + +#define HAS_SCALEADDROWS_SSE2 +void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height); + asm( + DECLARE_FUNCTION(ScaleAddRows_SSE2) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "mov 0x34(%esp),%ebx \n" + "pxor %xmm5,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm2 \n" + "lea (%esi,%edx,1),%eax \n" + "movhlps %xmm2,%xmm3 \n" + "lea -0x1(%ebx),%ebp \n" + "punpcklbw %xmm5,%xmm2 \n" + "punpcklbw %xmm5,%xmm3 \n" + +"2:" + "movdqa (%eax),%xmm0 \n" + "lea (%eax,%edx,1),%eax \n" + "movhlps %xmm0,%xmm1 \n" + "punpcklbw %xmm5,%xmm0 \n" + "punpcklbw %xmm5,%xmm1 \n" + "paddusw %xmm0,%xmm2 \n" + "paddusw %xmm1,%xmm3 \n" + "sub $0x1,%ebp \n" + "ja 2b \n" + + "movdqa %xmm2,(%edi) \n" + "movdqa %xmm3,0x10(%edi) \n" + "lea 0x20(%edi),%edi \n" + "lea 0x10(%esi),%esi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + DECLARE_FUNCTION(ScaleFilterRows_SSE2) + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "movd %eax,%xmm6 \n" + "punpcklwd %xmm6,%xmm6 \n" + "pshufd $0x0,%xmm6,%xmm6 \n" + "neg %eax \n" + "add $0x100,%eax \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "movdqa %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpckhbw %xmm7,%xmm1 \n" + "punpckhbw %xmm7,%xmm3 \n" + "pmullw %xmm5,%xmm0 \n" + "pmullw %xmm5,%xmm1 \n" + "pmullw %xmm6,%xmm2 \n" + "pmullw %xmm6,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "psrlw $0x8,%xmm0 \n" + "psrlw $0x8,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"2:" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" + + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"3:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" + + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + DECLARE_FUNCTION(ScaleFilterRows_SSSE3) + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "shr %eax \n" + "mov %al,%ah \n" + "neg %al \n" + "add $0x80,%al \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "punpcklbw %xmm2,%xmm0 \n" + "punpckhbw %xmm2,%xmm1 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "pmaddubsw %xmm5,%xmm1 \n" + "psrlw $0x7,%xmm0 \n" + "psrlw $0x7,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"2:" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"3:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" +); + +#elif defined(__x86_64__) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "lea (%3,%3,2),%%r10 \n" + "pxor %%xmm7,%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "movdqa 0x10(%0,%3,2),%%xmm3 \n" + "movdqa (%0,%%r10,1),%%xmm4 \n" + "movdqa 0x10(%0,%%r10,1),%%xmm5 \n" + "lea (%0,%3,4),%%r11 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa 0x0(%%r11),%%xmm2 \n" + "movdqa 0x10(%%r11),%%xmm3 \n" + "movdqa 0x0(%%r11,%3,1),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,1),%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "movdqa 0x0(%%r11,%3,2),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,2),%%xmm5 \n" + "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm4 \n" + "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psadbw %%xmm7,%%xmm0 \n" + "psadbw %%xmm7,%%xmm1 \n" + "pshufd $0xd8,%%xmm0,%%xmm0 \n" + "pshufd $0x8d,%%xmm1,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "psrlw $0x3,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "r10", "r11", "xmm6", "xmm7" +); +} + +#define HAS_SCALEROWDOWN34_SSSE3 +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%3),%%xmm3 \n" + "movdqa (%4),%%xmm4 \n" + "movdqa 
(%5),%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf0), // %3 + "r"(_shuf1), // %4 + "r"(_shuf2) // %5 + : "memory", "cc" +); +} + +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "cc", "xmm6", "xmm7", "xmm8" +); +} + +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3,1),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + 
"r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "cc", "xmm6", "xmm7", "xmm8" +); +} + +#define HAS_SCALEROWDOWN38_SSSE3 +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%3),%%xmm4 \n" + "movdqa (%4),%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf38a), // %3 + "r"(_shuf38b) // %4 + : "memory", "cc" +); +} + +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + "movdqa (%6),%%xmm6 \n" + "pxor %%xmm7,%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm1 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "pshufb %%xmm5,%%xmm3 \n" + "paddusw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,(%1) \n" + "pextrw $0x2,%%xmm2,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shufac0), // %4 + "r"(_shufac3), // %5 + "r"(_scaleac3) // %6 + : "memory", "cc", "rax", "xmm6", "xmm7" +); +} + +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + "movdqa (%6),%%xmm6 \n" + "movdqa (%7),%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm2 \n" + "pavgb (%0,%3,1),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusw %%xmm1,%%xmm0 \n" + "pshufb %%xmm6,%%xmm2 \n" + "paddusw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "pextrw $0x2,%%xmm0,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shufab0), // %4 + "r"(_shufab1), // %5 + "r"(_shufab2), // %6 + "r"(_scaleab2) // %7 + : "memory", "cc", "rax", "xmm6", "xmm7" +); +} + +#define HAS_SCALEADDROWS_SSE2 +static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" 
+"1:" + "movdqa (%0),%%xmm2 \n" + "lea (%0,%4,1),%%r10 \n" + "movhlps %%xmm2,%%xmm3 \n" + "lea -0x1(%3),%%r11 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + +"2:" + "movdqa (%%r10),%%xmm0 \n" + "lea (%%r10,%4,1),%%r10 \n" + "movhlps %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "paddusw %%xmm0,%%xmm2 \n" + "paddusw %%xmm1,%%xmm3 \n" + "sub $0x1,%%r11 \n" + "ja 2b \n" + + "movdqa %%xmm2,(%1) \n" + "movdqa %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width), // %2 + "+r"(src_height) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "r10", "r11" +); +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +static void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile ( + "1:" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "rax" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile ( + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "rax" + ); + return; + } else { + asm volatile ( + "mov %3,%%eax \n" + "movd %%eax,%%xmm6 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "neg %%eax \n" + "add $0x100,%%eax \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm7,%%xmm7 \n" + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "punpckhbw %%xmm7,%%xmm3 \n" + "pmullw %%xmm5,%%xmm0 \n" + "pmullw %%xmm5,%%xmm1 \n" + "pmullw %%xmm6,%%xmm2 \n" + "pmullw %%xmm6,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "rax", "xmm6", "xmm7" + ); + } + return; +} + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile ( + "1:" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "rax" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile ( + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "rax" + ); + return; + } else { + asm volatile ( + "mov %3,%%eax \n" + "shr %%eax \n" + "mov %%al,%%ah \n" + "neg %%al \n" + "add $0x80,%%al \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "rax" + ); + } + return; +} +#endif +#endif + +// CPU agnostic row functions +static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 2; + } +} + +static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + + src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; + src_ptr += 2; + } +} + +static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 4; + } +} + +static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + + src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + + src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + + src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + + 8) >> 4; + src_ptr += 4; + } +} + +// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. +// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. +// The following 2 lines cause error on Windows. 
+//static const int kMaxOutputWidth = 640; +//static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2; +#define kMaxOutputWidth 640 +#define kMaxRow12 1280 + +static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 8; + } +} + +// Note calling code checks width is less than max and if not +// uses ScaleRowDown8_C instead. +static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + ALIGN16(uint8 src_row[kMaxRow12 * 2]); + assert(dst_width <= kMaxOutputWidth); + ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); + ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, + src_row + kMaxOutputWidth, + dst_width * 2); + ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); +} + +static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + uint8* dend; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = dst + dst_width; + do { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } while (dst < dend); +} + +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + uint8* dend; + const uint8* s; + const uint8* t; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = d + dst_width; + s = src_ptr; + t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + uint8* dend; + const uint8* s; + const uint8* t; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = d + dst_width; + s = src_ptr; + t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +#if defined(HAS_SCALEFILTERROWS_SSE2) +// Filter row to 3/4 +static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + uint8* dend; + const uint8* s; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = dst_ptr + dst_width; + s = src_ptr; + do { + dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; + dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; + dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; + dst_ptr += 3; + s += 4; + } while (dst_ptr < dend); +} +#endif + +static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx) { + int x = 0; + int j; + for (j = 0; j < dst_width; ++j) { + int xi = x >> 16; + int xf1 = x & 0xffff; + int xf0 = 65536 - xf1; + + *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; + x += dx; + } +} + +//Not work on Windows +//static const int kMaxInputWidth = 2560; +#define 
kMaxInputWidth 2560 +#if defined(HAS_SCALEFILTERROWS_SSE2) +#define HAS_SCALEROWDOWN34_SSE2 +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + ALIGN16(uint8 row[kMaxInputWidth]); + assert((dst_width % 3 == 0) && (dst_width > 0)); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + ALIGN16(uint8 row[kMaxInputWidth]); + assert((dst_width % 3 == 0) && (dst_width > 0)); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} +#endif + +static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + + src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + + src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// C version 8x2 -> 8x1 +static void ScaleFilterRows_C(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction; + int y0_fraction; + const uint8* src_ptr1; + uint8* end; + assert(dst_width > 0); + y1_fraction = source_y_fraction; + y0_fraction = 256 - y1_fraction; + src_ptr1 = src_ptr + src_stride; + end = dst_ptr + dst_width; + do { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; + dst_ptr[5] 
= (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; + dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; + dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; + src_ptr += 8; + src_ptr1 += 8; + dst_ptr += 8; + } while (dst_ptr < end); + dst_ptr[0] = dst_ptr[-1]; +} + +void ScaleAddRows_C(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int x,y; + assert(src_width > 0); + assert(src_height > 0); + for (x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + int sum = 0; + for (y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + dst_ptr[x] = sum; + } +} + +/** + * Scale plane, 1/2 + * + * This is an optimized version for scaling down a plane to 1/2 of + * its original size. + * + */ +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 16) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; + } else +#endif + { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 1); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane, 1/4 + * + * This is an optimized version for scaling down a plane to 1/4 of + * its original size. + */ +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 4)); + assert(IS_ALIGNED(src_height, 4)); + +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 4)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; + } else +#endif + { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 2); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane, 1/8 + * + * This is an optimized version for scaling down a plane to 1/8 + * of its original size. 
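+ *
+ * Without filtering, every 8th pixel of every 8th row is sampled. With
+ * filtering, each output pixel is effectively the average of an 8x8 block
+ * of source pixels, computed as cascaded 4x4 and 2x2 averages
+ * (see ScaleRowDown8Int_C above).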
+ * + */ +static void ScalePlaneDown8(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 8)); + assert(IS_ALIGNED(src_height, 8)); + +#if defined(HAS_SCALEROWDOWN8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; + } else +#endif + { + ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? + ScaleRowDown8Int_C : ScaleRowDown8_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 3); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane down, 3/4 + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(dst_width % 3 == 0); +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; + } + } else +#endif + +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; + } + } else +#endif +#if defined(HAS_SCALEROWDOWN34_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_stride, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && + filtering) { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; + } else +#endif + { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; + } + } + { + int src_row = 0; + int y; + for (y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 1: + ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 2: + ScaleRowDown34_0(src_ptr + src_stride, -src_stride, + dst_ptr, dst_width); + break; + } + ++src_row; + src_ptr += src_stride; + dst_ptr += dst_stride; + if (src_row >= 3) { + src_ptr += src_stride; + src_row = 0; + } + } +} +} + +/** + * Scale plane, 3/8 + * + * This is an optimized version for scaling down a plane to 3/8 + * of its original size. 
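+ *
+ * Rows follow a repeating 3+3+2 pattern: two of every three output rows
+ * sum three source rows (ScaleRowDown38_3) and the third sums two
+ * (ScaleRowDown38_2), so 8 source rows produce 3 destination rows.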
+ * + * Reduces 16x3 to 6x1 + */ +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(dst_width % 3 == 0); +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; + } + } else +#endif + +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_stride, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; + } + } else +#endif + { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; + } + } + { + int src_row = 0; + int y; + for (y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + case 1: + ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + ++src_row; + break; + + case 2: + ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + src_row = 0; + break; + } + dst_ptr += dst_stride; + } +} +} + +__inline static uint32 SumBox(int iboxwidth, int iboxheight, + int src_stride, const uint8* src_ptr) { + int x, y; + uint32 sum; + assert(iboxwidth > 0); + assert(iboxheight > 0); + sum = 0u; + for (y = 0; y < iboxheight; ++y) { + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + src_ptr += src_stride; + } + return sum; +} + +static void ScalePlaneBoxRow(int dst_width, int boxheight, + int dx, int src_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + int boxwidth; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / + (boxwidth * boxheight); + } +} + +__inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { + uint32 sum; + int x; + assert(iboxwidth > 0); + sum = 0u; + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int scaletbl[2]; + int minboxwidth = (dx >> 16); + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + { + int *scaleptr = scaletbl - minboxwidth; + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + int boxwidth; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } + } +} + +static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + 
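// Sum one boxwidth-wide run of 16-bit column sums, then scale the total
+    // back to 8 bits with the precomputed 16.16 reciprocal of the box area.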
+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+    x += boxwidth;
+  }
+}
+
+/**
+ * Scale plane down to any dimensions, with interpolation
+ * (box filter).
+ *
+ * Uses the same 16.16 fixed-point stepping as ScalePlaneSimple,
+ * but instead of sampling a single source pixel per output pixel
+ * it averages the whole box of source pixels that each step spans.
+ */
+static void ScalePlaneBox(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr) {
+  int dx, dy;
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  dy = (src_height << 16) / dst_height;
+  dx = (src_width << 16) / dst_width;
+  if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
+      dst_height * 2 > src_height) {
+    uint8* dst = dst_ptr;
+    int y = 0;
+    int j;
+    for (j = 0; j < dst_height; ++j) {
+      int iy = y >> 16;
+      const uint8* const src = src_ptr + iy * src_stride;
+      int boxheight;
+      y += dy;
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
+      }
+      boxheight = (y >> 16) - iy;
+      ScalePlaneBoxRow(dst_width, boxheight,
+                       dx, src_stride,
+                       src, dst);
+
+      dst += dst_stride;
+    }
+  } else {
+    ALIGN16(uint16 row[kMaxInputWidth]);
+    void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
+                         uint16* dst_ptr, int src_width, int src_height);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
+                         const uint16* src_ptr, uint8* dst_ptr);
+#if defined(HAS_SCALEADDROWS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) &&
+        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
+        IS_ALIGNED(src_width, 16)) {
+      ScaleAddRows = ScaleAddRows_SSE2;
+    } else
+#endif
+    {
+      ScaleAddRows = ScaleAddRows_C;
+    }
+    if (dx & 0xffff) {
+      ScaleAddCols = ScaleAddCols2_C;
+    } else {
+      ScaleAddCols = ScaleAddCols1_C;
+    }
+
+    {
+      int y = 0;
+      int j;
+      for (j = 0; j < dst_height; ++j) {
+        int iy = y >> 16;
+        const uint8* const src = src_ptr + iy * src_stride;
+        int boxheight;
+        y += dy;
+        if (y > (src_height << 16)) {
+          y = (src_height << 16);
+        }
+        boxheight = (y >> 16) - iy;
+        ScaleAddRows(src, src_stride, row, src_width, boxheight);
+        ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
+        dst_ptr += dst_stride;
+      }
+    }
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions, with interpolation.
+ */
+static void ScalePlaneBilinearSimple(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride, int dst_stride,
+                                     const uint8* src_ptr, uint8* dst_ptr) {
+  int i, j;
+  uint8* dst = dst_ptr;
+  int dx = (src_width << 16) / dst_width;
+  int dy = (src_height << 16) / dst_height;
+  int maxx = ((src_width - 1) << 16) - 1;
+  int maxy = ((src_height - 1) << 16) - 1;
+  int y = (dst_height < src_height) ? 32768 :
+      (src_height << 16) / dst_height - 32768;
+  for (i = 0; i < dst_height; ++i) {
+    int cy = (y < 0) ? 0 : y;
+    int yi = cy >> 16;
+    int yf = cy & 0xffff;
+    const uint8* const src = src_ptr + yi * src_stride;
+    int x = (dst_width < src_width) ? 32768 :
+        (src_width << 16) / dst_width - 32768;
+    for (j = 0; j < dst_width; ++j) {
+      int cx = (x < 0) ?
0 : x; + int xi = cx >> 16; + int xf = cx & 0xffff; + int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; + int r1 = (src[xi + src_stride] * (65536 - xf) + + src[xi + src_stride + 1] * xf) >> 16; + *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; + x += dx; + if (x > maxx) + x = maxx; + } + dst += dst_stride - dst_width; + y += dy; + if (y > maxy) + y = maxy; + } +} + +/** + * Scale plane to/from any dimensions, with bilinear + * interpolation. + */ +static void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int dy; + int dx; + assert(dst_width > 0); + assert(dst_height > 0); + dy = (src_height << 16) / dst_height; + dx = (src_width << 16) / dst_width; + if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { + ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + + } else { + ALIGN16(uint8 row[kMaxInputWidth + 1]); + void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, + int dst_width, int source_y_fraction); + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx); +#if defined(HAS_SCALEFILTERROWS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleFilterRows = ScaleFilterRows_SSSE3; + } else +#endif +#if defined(HAS_SCALEFILTERROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleFilterRows = ScaleFilterRows_SSE2; + } else +#endif + { + ScaleFilterRows = ScaleFilterRows_C; + } + ScaleFilterCols = ScaleFilterCols_C; + + { + int y = 0; + int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. + int j; + for (j = 0; j < dst_height; ++j) { + int iy = y >> 16; + int fy = (y >> 8) & 255; + const uint8* const src = src_ptr + iy * src_stride; + ScaleFilterRows(row, src, src_stride, src_width, fy); + ScaleFilterCols(dst_ptr, row, dst_width, dx); + dst_ptr += dst_stride; + y += dy; + if (y > maxy) { + y = maxy; + } + } + } +} +} + +/** + * Scale plane to/from any dimensions, without interpolation. + * Fixed point math is used for performance: The upper 16 bits + * of x and dx is the integer part of the source position and + * the lower 16 bits are the fixed decimal part. + */ +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + uint8* dst = dst_ptr; + int dx = (src_width << 16) / dst_width; + int y; + for (y = 0; y < dst_height; ++y) { + const uint8* const src = src_ptr + (y * src_height / dst_height) * + src_stride; + // TODO(fbarchard): Round X coordinate by setting x=0x8000. + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + *dst++ = src[x >> 16]; + x += dx; + } + dst += dst_stride - dst_width; + } +} + +/** + * Scale plane to/from any dimensions. 
+ */ +static void ScalePlaneAnySize(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + // fall back to non-optimized version + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Scale plane down, any size + * + * This is an optimized version for scaling down a plane to any size. + * The current implementation is ~10 times faster compared to the + * reference implementation for e.g. XGA->LowResPAL + * + */ +static void ScalePlaneDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { + // between 1/2x and 1x use bilinear + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Copy plane, no scaling + * + * This simply copies the given plane without scaling. + * The current implementation is ~115 times faster + * compared to the reference implementation. + * + */ +static void CopyPlane(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + if (src_stride == src_width && dst_stride == dst_width) { + // All contiguous, so can use REALLY fast path. + memcpy(dst_ptr, src_ptr, src_width * src_height); + } else { + // Not all contiguous; must copy scanlines individually + const uint8* src = src_ptr; + uint8* dst = dst_ptr; + int i; + for (i = 0; i < src_height; ++i) { + memcpy(dst, src, src_width); + dst += dst_stride; + src += src_stride; + } + } +} + +static void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + FilterMode filtering, int use_ref) { + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + } else if (dst_width <= src_width && dst_height <= src_height) { + // Scale down. + if (use_ref) { + // For testing, allow the optimized versions to be disabled. + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + // 3/8 rounded up for odd sized chroma height. 
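/* The rounding in the test that follows: (src_height * 3 + 7) / 8 is the
 * integer form of ceil(src_height * 3 / 8). For example, src_height = 33
 * gives (99 + 7) / 8 = 13 = ceil(12.375), so a 3/8 scale still matches
 * when the chroma plane has an odd source height. */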
+ } else if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { + // optimized, 1/8 + ScalePlaneDown8(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else { + // Arbitrary downsample + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } + } else { + // Arbitrary scale up and/or down. + ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } +} + +/** + * Scale a plane. + * + * This function in turn calls a scaling function + * suitable for handling the desired resolutions. + * + */ + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. + if (src_height < 0) { + int halfheight; + src_height = -src_height; + halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + { + int src_halfwidth = (src_width + 1) >> 1; + int src_halfheight = (src_height + 1) >> 1; + int dst_halfwidth = (dst_width + 1) >> 1; + int dst_halfheight = (dst_height + 1) >> 1; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering, use_reference_impl_); + } + return 0; +} + +// Deprecated api +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + int interpolate) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. 
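/* Sketch of the idiom used below: a bottom-up image is handled by
 * repointing each plane at its last row and negating the stride, so the
 * rest of the code keeps walking "down" in memory:
 *
 *   src_y += (src_height - 1) * src_stride_y;
 *   src_stride_y = -src_stride_y;
 *
 * Chroma planes span (height + 1) / 2 rows, hence the halfheight term. */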
+ if (src_height < 0) { + int halfheight; + src_height = -src_height; + halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + { + int src_halfwidth = (src_width + 1) >> 1; + int src_halfheight = (src_height + 1) >> 1; + int dst_halfwidth = (dst_width + 1) >> 1; + int dst_halfheight = (dst_height + 1) >> 1; + FilterMode filtering = interpolate ? kFilterBox : kFilterNone; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering, use_reference_impl_); + } + return 0; +} + +// Deprecated api +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + int interpolate) { + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 || + dst_yoffset >= dst_height) { + return -1; + } + dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2. + { + int src_halfwidth = (src_width + 1) >> 1; + int src_halfheight = (src_height + 1) >> 1; + int dst_halfwidth = (dst_width + 1) >> 1; + int dst_halfheight = (dst_height + 1) >> 1; + int aheight = dst_height - dst_yoffset * 2; // actual output height + const uint8* const src_y = src; + const uint8* const src_u = src + src_width * src_height; + const uint8* const src_v = src + src_width * src_height + + src_halfwidth * src_halfheight; + uint8* dst_y = dst + dst_yoffset * dst_width; + uint8* dst_u = dst + dst_width * dst_height + + (dst_yoffset >> 1) * dst_halfwidth; + uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + + (dst_yoffset >> 1) * dst_halfwidth; + return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth, + src_width, src_height, dst_y, dst_u, dst_v, dst_width, + dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate); + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif @@ -82,6 +82,7 @@ The available initialization methods are: \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif + \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif diff --git a/usage_cx.dox b/usage_cx.dox index 980a03461..62f3e450b 100644 --- a/usage_cx.dox +++ b/usage_cx.dox @@ -1,6 +1,6 @@ /*! \page usage_encode Encode - The vpx_codec_encode() function is at the core of the decode loop. It + The vpx_codec_encode() function is at the core of the encode loop. It processes raw images passed by the application, producing packets of compressed data. The <code>deadline</code> parameter controls the amount of time in microseconds the encoder should spend working on the frame. 
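A minimal encode-loop sketch consistent with that description (the codec context `codec`, input image `img`, frame counter `frame_cnt`, and output file `outfile` are assumed to have been set up elsewhere; error handling is elided):

    vpx_codec_err_t res = vpx_codec_encode(&codec, img, frame_cnt, 1, 0,
                                           VPX_DL_GOOD_QUALITY);
    if (res == VPX_CODEC_OK) {
        vpx_codec_iter_t iter = NULL;
        const vpx_codec_cx_pkt_t *pkt;
        /* Drain all compressed-data packets produced for this frame. */
        while ((pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
            if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
                fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile);
        }
    }

Passing VPX_DL_REALTIME or VPX_DL_BEST_QUALITY instead adjusts that per-frame time budget toward speed or quality respectively.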
For @@ -10,5 +10,4 @@ \ref samples - */ diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index b5f194d3d..89a2be825 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -11,7 +11,6 @@ #include "vpx_config.h" #include "vpx_ports/arm.h" -#include "vp8/common/g_common.h" #include "vp8/common/pragmas.h" #include "vp8/common/subpixel.h" #include "vp8/common/loopfilter.h" @@ -46,7 +45,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6; rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6; rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6; @@ -64,6 +62,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6; rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6; rtcd->recon.intra4x4_predict = vp8_intra4x4_predict_armv6; + + rtcd->dequant.block = vp8_dequantize_b_v6; + rtcd->dequant.idct_add = vp8_dequant_idct_add_v6; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6; + rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6; + } #endif @@ -80,7 +84,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon; rtcd->idct.idct16 = vp8_short_idct4x4llm_neon; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon; rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon; @@ -99,6 +102,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) vp8_build_intra_predictors_mby_neon; rtcd->recon.build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_neon; + + rtcd->dequant.block = vp8_dequantize_b_neon; + rtcd->dequant.idct_add = vp8_dequant_idct_add_neon; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon; + rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon; + } #endif diff --git a/vp8/decoder/arm/armv6/dequant_idct_v6.asm b/vp8/common/arm/armv6/dequant_idct_v6.asm index 2510ad838..2510ad838 100644 --- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm +++ b/vp8/common/arm/armv6/dequant_idct_v6.asm diff --git a/vp8/decoder/arm/armv6/dequantize_v6.asm b/vp8/common/arm/armv6/dequantize_v6.asm index 72f7e0ee5..72f7e0ee5 100644 --- a/vp8/decoder/arm/armv6/dequantize_v6.asm +++ b/vp8/common/arm/armv6/dequantize_v6.asm diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/common/arm/armv6/idct_blk_v6.c index 686bb737f..9108929f5 100644 --- a/vp8/decoder/arm/armv6/idct_blk_v6.c +++ b/vp8/common/arm/armv6/idct_blk_v6.c @@ -10,50 +10,9 @@ #include "vpx_config.h" #include "vp8/common/idct.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" -void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq, - unsigned char *dst, int stride, - char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (eobs[0] > 1) - vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]); - else if (eobs[0] == 1) - vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride); - - if (eobs[1] > 1) - { - vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]); - } - else if (eobs[1] == 1) - vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride); - - if (eobs[2] > 1) - { - vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]); - } - else if (eobs[2] == 1) - vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride); - - if (eobs[3] > 1) - { - vp8_dequant_dc_idct_add_v6 
(q+48, dq, dst+12, stride, dc[3]); - } - else if (eobs[3] == 1) - vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride); - - q += 64; - dc += 4; - dst += 4*stride; - eobs += 4; - } -} - void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm index 463bff0f5..31ef09cad 100644 --- a/vp8/common/arm/armv6/iwalsh_v6.asm +++ b/vp8/common/arm/armv6/iwalsh_v6.asm @@ -9,7 +9,6 @@ ; EXPORT |vp8_short_inv_walsh4x4_v6| - EXPORT |vp8_short_inv_walsh4x4_1_v6| ARM REQUIRE8 @@ -17,19 +16,19 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_v6(short *input, short *output) +;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff) |vp8_short_inv_walsh4x4_v6| PROC - stmdb sp!, {r4 - r11, lr} + stmdb sp!, {r4 - r12, lr} - ldr r2, [r0], #4 ; [1 | 0] - ldr r3, [r0], #4 ; [3 | 2] - ldr r4, [r0], #4 ; [5 | 4] - ldr r5, [r0], #4 ; [7 | 6] - ldr r6, [r0], #4 ; [9 | 8] - ldr r7, [r0], #4 ; [11 | 10] - ldr r8, [r0], #4 ; [13 | 12] - ldr r9, [r0] ; [15 | 14] + ldr r2, [r0, #0] ; [1 | 0] + ldr r3, [r0, #4] ; [3 | 2] + ldr r4, [r0, #8] ; [5 | 4] + ldr r5, [r0, #12] ; [7 | 6] + ldr r6, [r0, #16] ; [9 | 8] + ldr r7, [r0, #20] ; [11 | 10] + ldr r8, [r0, #24] ; [13 | 12] + ldr r9, [r0, #28] ; [15 | 14] qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] @@ -69,24 +68,27 @@ qadd16 r4, r4, r10 ; [b2+3|c2+3] qadd16 r5, r5, r10 ; [a2+3|d2+3] - asr r12, r2, #3 ; [1 | x] - pkhtb r12, r12, r3, asr #19; [1 | 0] - lsl lr, r3, #16 ; [~3 | x] - lsl r2, r2, #16 ; [~2 | x] - asr lr, lr, #3 ; [3 | x] - pkhtb lr, lr, r2, asr #19 ; [3 | 2] - - asr r2, r4, #3 ; [5 | x] - pkhtb r2, r2, r5, asr #19 ; [5 | 4] - lsl r3, r5, #16 ; [~7 | x] - lsl r4, r4, #16 ; [~6 | x] - asr r3, r3, #3 ; [7 | x] - pkhtb r3, r3, r4, asr #19 ; [7 | 6] - - str r12, [r1], #4 - str lr, [r1], #4 - str r2, [r1], #4 - str r3, [r1], #4 + asr r12, r3, #19 ; [0] + strh r12, [r1], #32 + asr lr, r2, #19 ; [1] + strh lr, [r1], #32 + sxth r2, r2 + sxth r3, r3 + asr r2, r2, #3 ; [2] + strh r2, [r1], #32 + asr r3, r3, #3 ; [3] + strh r3, [r1], #32 + + asr r12, r5, #19 ; [4] + strh r12, [r1], #32 + asr lr, r4, #19 ; [5] + strh lr, [r1], #32 + sxth r4, r4 + sxth r5, r5 + asr r4, r4, #3 ; [6] + strh r4, [r1], #32 + asr r5, r5, #3 ; [7] + strh r5, [r1], #32 qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] @@ -103,50 +105,32 @@ qadd16 r8, r8, r10 ; [b2+3|c2+3] qadd16 r9, r9, r10 ; [a2+3|d2+3] - asr r2, r6, #3 ; [9 | x] - pkhtb r2, r2, r7, asr #19 ; [9 | 8] - lsl r3, r7, #16 ; [~11| x] - lsl r4, r6, #16 ; [~10| x] - asr r3, r3, #3 ; [11 | x] - pkhtb r3, r3, r4, asr #19 ; [11 | 10] - - asr r4, r8, #3 ; [13 | x] - pkhtb r4, r4, r9, asr #19 ; [13 | 12] - lsl r5, r9, #16 ; [~15| x] - lsl r6, r8, #16 ; [~14| x] - asr r5, r5, #3 ; [15 | x] - pkhtb r5, r5, r6, asr #19 ; [15 | 14] - - str r2, [r1], #4 - str r3, [r1], #4 - str r4, [r1], #4 - str r5, [r1] - - ldmia sp!, {r4 - r11, pc} + asr r12, r7, #19 ; [8] + strh r12, [r1], #32 + asr lr, r6, #19 ; [9] + strh lr, [r1], #32 + sxth r6, r6 + sxth r7, r7 + asr r6, r6, #3 ; [10] + strh r6, [r1], #32 + asr r7, r7, #3 ; [11] + strh r7, [r1], #32 + + asr r12, r9, #19 ; [12] + strh r12, [r1], #32 + asr lr, r8, #19 ; [13] + strh lr, [r1], #32 + sxth r8, r8 + sxth r9, r9 + asr r8, r8, #3 ; [14] + strh r8, [r1], #32 + asr r9, r9, #3 ; [15] + strh r9, [r1], #32 + + ldmia sp!, {r4 - r12, pc} ENDP ; 
|vp8_short_inv_walsh4x4_v6| -;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output) -|vp8_short_inv_walsh4x4_1_v6| PROC - - ldrsh r2, [r0] ; [0] - add r2, r2, #3 ; [0] + 3 - asr r2, r2, #3 ; a1 ([0]+3) >> 3 - lsl r2, r2, #16 ; [a1 | x] - orr r2, r2, r2, lsr #16 ; [a1 | a1] - - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_v6| - ; Constant Pool c0x00030003 DCD 0x00030003 END diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/common/arm/dequantize_arm.c index 2918e0512..20a8ac4fc 100644 --- a/vp8/decoder/arm/dequantize_arm.c +++ b/vp8/common/arm/dequantize_arm.c @@ -10,9 +10,8 @@ #include "vpx_config.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" #include "vp8/common/idct.h" -#include "vpx_mem/vpx_mem.h" #if HAVE_ARMV7 extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ); diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/common/arm/dequantize_arm.h index c020c8530..0b4d8fe89 100644 --- a/vp8/decoder/arm/dequantize_arm.h +++ b/vp8/common/arm/dequantize_arm.h @@ -15,8 +15,6 @@ #if HAVE_ARMV6 extern prototype_dequant_block(vp8_dequantize_b_v6); extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6); -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6); -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); @@ -24,19 +22,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_v6 -#undef vp8_dequant_idct_add +#undef vp8_dequant_idct_add #define vp8_dequant_idct_add vp8_dequant_idct_add_v6 -#undef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6 - -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6 - -#undef vp8_dequant_idct_add_y_block +#undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6 -#undef vp8_dequant_idct_add_uv_block +#undef vp8_dequant_idct_add_uv_block #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6 #endif #endif @@ -44,8 +36,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); #if HAVE_ARMV7 extern prototype_dequant_block(vp8_dequantize_b_neon); extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon); -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon); -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); @@ -54,19 +44,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_neon -#undef vp8_dequant_idct_add +#undef vp8_dequant_idct_add #define vp8_dequant_idct_add vp8_dequant_idct_add_neon -#undef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon - -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon - -#undef vp8_dequant_idct_add_y_block +#undef vp8_dequant_idct_add_y_block #define 
vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon -#undef vp8_dequant_idct_add_uv_block +#undef vp8_dequant_idct_add_uv_block #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon #endif diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h index c710c2eb0..68c0cad11 100644 --- a/vp8/common/arm/idct_arm.h +++ b/vp8/common/arm/idct_arm.h @@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6); #undef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6 -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6 - #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6 #endif @@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon); #undef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon - #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon #endif diff --git a/vp8/decoder/arm/neon/dequant_idct_neon.asm b/vp8/common/arm/neon/dequant_idct_neon.asm index 602cce676..602cce676 100644 --- a/vp8/decoder/arm/neon/dequant_idct_neon.asm +++ b/vp8/common/arm/neon/dequant_idct_neon.asm diff --git a/vp8/decoder/arm/neon/dequantizeb_neon.asm b/vp8/common/arm/neon/dequantizeb_neon.asm index c8e0c31f2..c8e0c31f2 100644 --- a/vp8/decoder/arm/neon/dequantizeb_neon.asm +++ b/vp8/common/arm/neon/dequantizeb_neon.asm diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/common/arm/neon/idct_blk_neon.c index 086293114..cc55843d5 100644 --- a/vp8/decoder/arm/neon/idct_blk_neon.c +++ b/vp8/common/arm/neon/idct_blk_neon.c @@ -10,51 +10,16 @@ #include "vpx_config.h" #include "vp8/common/idct.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" /* place these declarations here because we don't want to maintain them * outside of this scope */ -void idct_dequant_dc_full_2x_neon(short *input, short *dq, - unsigned char *dst, - int stride, short *dc); -void idct_dequant_dc_0_2x_neon(short *input, short *dq, - unsigned char *dst, - int stride, short *dc); void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *dst, int stride); void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *dst, int stride); -void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq, - unsigned char *dst, - int stride, char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (((short *)(eobs))[0]) - { - if (((short *)eobs)[0] & 0xfefe) - idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc); - else - idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc); - } - - if (((short *)(eobs))[1]) - { - if (((short *)eobs)[1] & 0xfefe) - idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2); - else - idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2); - } - q += 64; - dc += 4; - dst += 4*stride; - eobs += 4; - } -} void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, diff --git a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm index 6c29c5586..6c29c5586 100644 --- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm +++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm diff --git a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm index d5dce63f6..d5dce63f6 100644 --- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm +++ 
b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm index 01c79d937..e8ea2a619 100644 --- a/vp8/common/arm/neon/iwalsh_neon.asm +++ b/vp8/common/arm/neon/iwalsh_neon.asm @@ -8,7 +8,6 @@ ; be found in the AUTHORS file in the root of the source tree. ; EXPORT |vp8_short_inv_walsh4x4_neon| - EXPORT |vp8_short_inv_walsh4x4_1_neon| ARM REQUIRE8 @@ -16,7 +15,7 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_neon(short *input, short *output) +;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff) |vp8_short_inv_walsh4x4_neon| PROC ; read in all four lines of values: d0->d3 @@ -59,22 +58,30 @@ vshr.s16 q0, q0, #3 ;e/f >> 3 vshr.s16 q1, q1, #3 ;g/h >> 3 - vst4.i16 {d0,d1,d2,d3}, [r1@128] + mov r2, #64 + add r3, r1, #32 - bx lr - ENDP ; |vp8_short_inv_walsh4x4_neon| + vst1.i16 d0[0], [r1],r2 + vst1.i16 d1[0], [r3],r2 + vst1.i16 d2[0], [r1],r2 + vst1.i16 d3[0], [r3],r2 + + vst1.i16 d0[1], [r1],r2 + vst1.i16 d1[1], [r3],r2 + vst1.i16 d2[1], [r1],r2 + vst1.i16 d3[1], [r3],r2 + vst1.i16 d0[2], [r1],r2 + vst1.i16 d1[2], [r3],r2 + vst1.i16 d2[2], [r1],r2 + vst1.i16 d3[2], [r3],r2 + + vst1.i16 d0[3], [r1],r2 + vst1.i16 d1[3], [r3],r2 + vst1.i16 d2[3], [r1] + vst1.i16 d3[3], [r3] -;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) -|vp8_short_inv_walsh4x4_1_neon| PROC - ldrsh r2, [r0] ; load input[0] - add r3, r2, #3 ; add 3 - add r2, r1, #16 ; base for last 8 output - asr r0, r3, #3 ; right shift 3 - vdup.16 q0, r0 ; load and duplicate - vst1.16 {q0}, [r1@128] ; write back 8 - vst1.16 {q0}, [r2@128] ; write back last 8 bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_neon| + ENDP ; |vp8_short_inv_walsh4x4_neon| END diff --git a/vp8/common/bigend.h b/vp8/common/bigend.h deleted file mode 100644 index 6ac3f8b5a..000000000 --- a/vp8/common/bigend.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef _bigend_h -#define _bigend_h - -#if defined(__cplusplus) -extern "C" { -#endif - -#define invert2(x) ( (((x)>>8)&0x00ff) | (((x)<<8)&0xff00) ) -#define invert4(x) ( ((invert2(x)&0x0000ffff)<<16) | (invert2((x>>16))&0x0000ffff) ) - -#define high_byte(x) (unsigned char)x -#define mid2Byte(x) (unsigned char)(x >> 8) -#define mid1Byte(x) (unsigned char)(x >> 16) -#define low_byte(x) (unsigned char)(x >> 24) - -#define SWAPENDS 1 - -#if defined(__cplusplus) -} -#endif -#endif diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index a90c1c0b6..99b731c78 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -21,9 +21,6 @@ void vpx_log(const char *format, ...); #include "subpixel.h" #include "vpx_ports/mem.h" -#define TRUE 1 -#define FALSE 0 - /*#define DCPRED 1*/ #define DCPREDSIMTHRESH 0 #define DCPREDCNTTHRESH 3 @@ -170,6 +167,18 @@ typedef struct union b_mode_info bmi[16]; } MODE_INFO; +#if CONFIG_MULTI_RES_ENCODING +/* The information needed to be stored for higher-resolution encoder */ +typedef struct +{ + MB_PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame; + int_mv mv; + //union b_mode_info bmi[16]; + int dissim; // dissimilarity level of the macroblock +} LOWER_RES_INFO; +#endif + typedef struct { short *qcoeff; diff --git a/vp8/common/common.h b/vp8/common/common.h index 9a93da991..2cc1c544c 100644 --- a/vp8/common/common.h +++ b/vp8/common/common.h @@ -18,8 +18,6 @@ #include "vpx_mem/vpx_mem.h" -#include "common_types.h" - /* Only need this for fixed-size arrays, for structs just assign. */ #define vp8_copy( Dest, Src) { \ diff --git a/vp8/decoder/dequantize.c b/vp8/common/dequantize.c index 0861965eb..4a48a3192 100644 --- a/vp8/decoder/dequantize.c +++ b/vp8/common/dequantize.c @@ -42,22 +42,3 @@ void vp8_dequant_idct_add_c(short *input, short *dq, vpx_memset(input, 0, 32); } - -void vp8_dequant_dc_idct_add_c(short *input, short *dq, - unsigned char *dest, int stride, - int Dc) -{ - int i; - - input[0] = (short)Dc; - - for (i = 1; i < 16; i++) - { - input[i] = dq[i] * input[i]; - } - - vp8_short_idct4x4llm_c(input, dest, stride, dest, stride); - - vpx_memset(input, 0, 32); - -} diff --git a/vp8/decoder/dequantize.h b/vp8/common/dequantize.h index 019b7f6d1..f66cf2bac 100644 --- a/vp8/decoder/dequantize.h +++ b/vp8/common/dequantize.h @@ -21,17 +21,6 @@ unsigned char *output, \ int stride) -#define prototype_dequant_dc_idct_add(sym) \ - void sym(short *input, short *dq, \ - unsigned char *dst, \ - int stride, \ - int dc) - -#define prototype_dequant_dc_idct_add_y_block(sym) \ - void sym(short *q, short *dq, \ - unsigned char *dst, \ - int stride, char *eobs, short *dc) - #define prototype_dequant_idct_add_y_block(sym) \ void sym(short *q, short *dq, \ unsigned char *dst, \ @@ -60,16 +49,6 @@ extern prototype_dequant_block(vp8_dequant_block); #endif extern prototype_dequant_idct_add(vp8_dequant_idct_add); -#ifndef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c -#endif -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add); - -#ifndef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c -#endif -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block); - #ifndef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c #endif @@ -85,10 +64,6 @@ typedef prototype_dequant_block((*vp8_dequant_block_fn_t)); typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t)); -typedef 
prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t)); - -typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t)); - typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t)); typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t)); @@ -97,8 +72,6 @@ typedef struct { vp8_dequant_block_fn_t block; vp8_dequant_idct_add_fn_t idct_add; - vp8_dequant_dc_idct_add_fn_t dc_idct_add; - vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block; vp8_dequant_idct_add_y_block_fn_t idct_add_y_block; vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block; } vp8_dequant_rtcd_vtable_t; diff --git a/vp8/common/dma_desc.h b/vp8/common/dma_desc.h deleted file mode 100644 index b923da6e0..000000000 --- a/vp8/common/dma_desc.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef _dma_desc_h -#define _dma_desc_h - -#if defined(__cplusplus) -extern "C" { -#endif - - -#define NDSIZE_LG 0x00000900 // Next Descriptor Size -#define NDSIZE_SM 0x00000800 // Next Descriptor Size -#define NDSIZE_7 0x00000700 // Next Descriptor Size -#define NDSIZE_6 0x00000600 // Next Descriptor Size -#define NDSIZE_5 0x00000500 // Next Descriptor Size -#define NDSIZE_4 0x00000400 // Next Descriptor Size -#define NDSIZE_3 0x00000300 // Next Descriptor Size -#define NDSIZE_2 0x00000200 // Next Descriptor Size -#define NDSIZE_1 0x00000100 // Next Descriptor Size - -#define FLOW_STOP 0x0000 -#define FLOW_AUTO 0x1000 -#define FLOW_DESC_AR 0x4000 -#define FLOW_DESC_SM 0x6000 -#define FLOW_DESC_LG 0x7000 - - typedef struct - { - unsigned int ndp; - //unsigned short ndpl; - //unsigned short ndph; - unsigned int sa; - //unsigned short sal; - //unsigned short sah; - - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - unsigned short ycnt; - unsigned short ymod; - - } LARGE_DESC; - - typedef struct - { - unsigned short ndpl; - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - unsigned short ycnt; - unsigned short ymod; - } SMALL_DESC; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - unsigned short ycnt; - unsigned short ymod; - } ARRAY_DESC_7; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - unsigned short ycnt; - } ARRAY_DESC_6; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - } ARRAY_DESC_5; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - } ARRAY_DESC_4; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - } ARRAY_DESC_3; - - typedef struct - { - unsigned short sal; - unsigned short sah; - } ARRAY_DESC_2; - - typedef struct - { - unsigned short sal; - } ARRAY_DESC_1; - -#if defined(__cplusplus) -} -#endif - -#endif //_dma_desc_h diff --git a/vp8/common/duck_io.h b/vp8/common/duck_io.h deleted file mode 
100644 index 43daa65bc..000000000 --- a/vp8/common/duck_io.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef _duck_io_h -#define _duck_io_h - -#if defined(__cplusplus) -extern "C" { -#endif - -#if defined (_WIN32) - typedef __int64 int64_t; -#elif defined(__MWERKS__) - typedef long long int64_t; -#elif defined(__APPLE__) || defined(__POWERPC) -#include <ppc/types.h> -#else - typedef long long int64_t; -#endif - - typedef struct - { - int64_t offset; // offset to start from - int blocking; // non-zero for blocking - } re_open_t; - - - typedef enum - { - SAL_ERR_MAX = -10, - SAL_ERROR = -11, // Default error - SAL_ERR_WSASTARTUP = -12, - SAL_ERR_SOCKET_CREATE = -13, - SAL_ERR_RESOLVING_HOSTNAME = -14, - SAL_ERR_SERVER_CONNECTION = -15, - SAL_ERR_SENDING_DATA = -16, - SAL_ERR_RECEIVING_DATA = -17, - SAL_ERR_404_FILE_NOT_FOUND = -18, - SAL_ERR_PARSING_HTTP_HEADER = -19, - SAL_ERR_PARSING_CONTENT_LEN = -20, - SAL_ERR_CONNECTION_TIMEOUT = -21, - SAL_ERR_FILE_OPEN_FAILED = -22, - SAL_ERR_MIN = -23 - } SAL_ERR; /* EMH 1-15-03 */ - - - typedef struct sal_err_map_temp - { - SAL_ERR code; - const char *decode; - - } sal_err_map_t; - - - static char *sal_err_text(SAL_ERR e) - { - int t; - const sal_err_map_t g_sal_err_map[] = - { - { SAL_ERR_WSASTARTUP, "Error with WSAStartup" }, - { SAL_ERR_SOCKET_CREATE, "Error creating socket" }, - { SAL_ERR_RESOLVING_HOSTNAME, "Error resolving hostname" }, - { SAL_ERR_SERVER_CONNECTION, "Error connecting to server" }, - { SAL_ERR_SENDING_DATA, "Error sending data" }, - { SAL_ERR_RECEIVING_DATA, "Error receiving data" }, - { SAL_ERR_404_FILE_NOT_FOUND, "Error file not found " }, - { SAL_ERR_PARSING_HTTP_HEADER, "Error parsing http header" }, - { SAL_ERR_PARSING_CONTENT_LEN, "Error parsing content length" }, - { SAL_ERR_CONNECTION_TIMEOUT, "Error Connection timed out" }, - { SAL_ERR_FILE_OPEN_FAILED, "Error opening file" } - }; - - for (t = 0; t < sizeof(g_sal_err_map) / sizeof(sal_err_map_t); t++) - { - if (e == g_sal_err_map[t].code) - return (char *) g_sal_err_map[t].decode; - } - - return 0; - } - - - - - - - - int duck_open(const char *fname, unsigned long user_data); - - void duck_close(int ghndl); - - int duck_read(int ghndl, unsigned char *buf, int nbytes); - - int64_t duck_seek(int g_hndl, int64_t offs, int origin); - - int duck_read_finished(int han, int flag); /* FWG 7-9-99 */ - - int duck_name(int handle, char name[], size_t max_len); /* EMH 9-23-03 */ - - int duck_read_blocking(int handle, unsigned char *buffer, int bytes); /* EMH 9-23-03 */ - - int64_t duck_available_data(int handle); /* EMH 10-23-03 */ - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h index 01909b937..a3443d765 100644 --- a/vp8/common/findnearmv.h +++ b/vp8/common/findnearmv.h @@ -60,10 +60,10 @@ static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge, int mb_to_bottom_edge) { unsigned int need_to_clamp; - need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0; - need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0; - need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 
1 : 0; - need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0; + need_to_clamp = (mv->as_mv.col < mb_to_left_edge); + need_to_clamp |= (mv->as_mv.col > mb_to_right_edge); + need_to_clamp |= (mv->as_mv.row < mb_to_top_edge); + need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge); return need_to_clamp; } diff --git a/vp8/common/g_common.h b/vp8/common/g_common.h deleted file mode 100644 index 5f523980b..000000000 --- a/vp8/common/g_common.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -extern void (*vp8_clear_system_state)(void); -extern void (*vp8_plane_add_noise)(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int DPitch, int q); -extern void (*de_interlace) -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int Width, - int Height, - int Stride -); diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index 9641d8c1e..01d76206d 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -10,7 +10,6 @@ #include "vpx_config.h" -#include "vp8/common/g_common.h" #include "vp8/common/subpixel.h" #include "vp8/common/loopfilter.h" #include "vp8/common/recon.h" @@ -70,6 +69,14 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) #if CONFIG_RUNTIME_CPU_DETECT VP8_COMMON_RTCD *rtcd = &ctx->rtcd; + + rtcd->dequant.block = vp8_dequantize_b_c; + rtcd->dequant.idct_add = vp8_dequant_idct_add_c; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c; + rtcd->dequant.idct_add_uv_block = + vp8_dequant_idct_add_uv_block_c; + + rtcd->idct.idct16 = vp8_short_idct4x4llm_c; rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c; rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c; diff --git a/vp8/common/idct.h b/vp8/common/idct.h index 411a1b472..7371f85ff 100644 --- a/vp8/common/idct.h +++ b/vp8/common/idct.h @@ -37,6 +37,10 @@ #define vp8_idct_idct16 vp8_short_idct4x4llm_c #endif extern prototype_idct(vp8_idct_idct16); +/* add this prototype to prevent compiler warning about implicit + * declaration of vp8_short_idct4x4llm_c function in dequantize.c + * when building, for example, neon optimized version */ +extern prototype_idct(vp8_short_idct4x4llm_c); #ifndef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c diff --git a/vp8/decoder/idct_blk.c b/vp8/common/idct_blk.c index 1c16b92a9..249fad4ea 100644 --- a/vp8/decoder/idct_blk.c +++ b/vp8/common/idct_blk.c @@ -12,39 +12,12 @@ #include "vp8/common/idct.h" #include "dequantize.h" -void vp8_dequant_dc_idct_add_c(short *input, short *dq, - unsigned char *dest, int stride, - int Dc); void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred, int pred_stride, unsigned char *dst_ptr, int dst_stride); -void vp8_dequant_dc_idct_add_y_block_c - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs, short *dc) -{ - int i, j; - - for (i = 0; i < 4; i++) - { - for (j = 0; j < 4; j++) - { - if (*eobs++ > 1) - vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]); - else - vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, 
stride); - - q += 16; - dst += 4; - dc ++; - } - - dst += 4*stride - 16; - } -} - void vp8_dequant_idct_add_y_block_c (short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c index 49496abef..47af52f04 100644 --- a/vp8/common/idctllm.c +++ b/vp8/common/idctllm.c @@ -137,8 +137,9 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, } -void vp8_short_inv_walsh4x4_c(short *input, short *output) +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff) { + short output[16]; int i; int a1, b1, c1, d1; int a2, b2, c2, d2; @@ -183,22 +184,21 @@ void vp8_short_inv_walsh4x4_c(short *input, short *output) ip += 4; op += 4; } + + for(i = 0; i < 16; i++) + { + mb_dqcoeff[i * 16] = output[i]; + } } -void vp8_short_inv_walsh4x4_1_c(short *input, short *output) +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff) { int i; int a1; - short *op = output; a1 = ((input[0] + 3) >> 3); - - for (i = 0; i < 4; i++) + for(i = 0; i < 16; i++) { - op[0] = a1; - op[1] = a1; - op[2] = a1; - op[3] = a1; - op += 4; + mb_dqcoeff[i * 16] = a1; } } diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c deleted file mode 100644 index 478cb329f..000000000 --- a/vp8/common/invtrans.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "invtrans.h" - - -void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, - int pitch) -{ - if (*b->eob > 1) - { - IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch, - *(b->base_dst) + b->dst, b->dst_stride); - } - else - { - IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch, - *(b->base_dst) + b->dst, b->dst_stride); - } - -} - -static void recon_dcblock(MACROBLOCKD *x) -{ - BLOCKD *b = &x->block[24]; - int i; - - for (i = 0; i < 16; i++) - { - x->block[i].dqcoeff[0] = b->diff[i]; - } - -} - -void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) -{ - int i; - - if(x->mode_info_context->mbmi.mode != SPLITMV) - { - /* do 2nd order transform on the dc block */ - IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff); - - recon_dcblock(x); - } - - for (i = 0; i < 16; i++) - { - vp8_inverse_transform_b(rtcd, &x->block[i], 16); - } - -} -void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) -{ - int i; - - for (i = 16; i < 24; i++) - { - vp8_inverse_transform_b(rtcd, &x->block[i], 8); - } - -} diff --git a/vp8/common/invtrans.h b/vp8/common/invtrans.h index d14573b91..2bcbeeccf 100644 --- a/vp8/common/invtrans.h +++ b/vp8/common/invtrans.h @@ -15,9 +15,66 @@ #include "vpx_config.h" #include "idct.h" #include "blockd.h" -extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch); -extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); -extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); -extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); +#include "onyxc_int.h" +#if CONFIG_MULTITHREAD +#include "vpx_mem/vpx_mem.h" +#endif + 
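/* Context for the two helpers that follow: the inverse WHT variants now
 * scatter their 16 results directly into the DC slot of each luma block
 * (elements 0, 16, 32, ..., 240 of the macroblock's coefficient array)
 * instead of filling a contiguous 16-element buffer. A block whose eob
 * was recorded as 0 can therefore acquire a non-zero DC, which is why
 * eob_adjust() bumps such eobs before the block IDCTs run. */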
+static void eob_adjust(char *eobs, short *diff) +{ + /* eob adjust.... the idct can only skip if both the dc and eob are zero */ + int js; + for(js = 0; js < 16; js++) + { + if((eobs[js] == 0) && (diff[0] != 0)) + eobs[js]++; + diff+=16; + } +} + +static void vp8_inverse_transform_mby(MACROBLOCKD *xd, + const VP8_COMMON_RTCD *rtcd) +{ + short *DQC = xd->block[0].dequant; + /* save the dc dequant constant in case it is overridden */ + short dc_dequant_temp = DQC[0]; + +#if CONFIG_MULTITHREAD + DECLARE_ALIGNED(16, short, local_dequant[16]); +#endif + + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + IDCT_INVOKE(&rtcd->idct, iwalsh16) + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + else + { + IDCT_INVOKE(&rtcd->idct, iwalsh1) + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + eob_adjust(xd->eobs, xd->qcoeff); + +#if CONFIG_MULTITHREAD + DQC = local_dequant; + + vpx_memcpy(DQC, xd->block[0].dequant, + sizeof(local_dequant)); +#endif + + /* override the dc dequant constant */ + DQC[0] = 1; + } + DEQUANT_INVOKE (&rtcd->dequant, idct_add_y_block) + (xd->qcoeff, DQC, + xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); + + /* restore the dc dequant constant */ + DQC[0] = dc_dequant_temp; +} #endif diff --git a/vp8/common/littlend.h b/vp8/common/littlend.h deleted file mode 100644 index 99df1164c..000000000 --- a/vp8/common/littlend.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef _littlend_h -#define _littlend_h - -#if defined(__cplusplus) -extern "C" { -#endif - -#define invert2(x) (x) -#define invert4(x) (x) - -#define low_byte(x) (unsigned char)x -#define mid1Byte(x) (unsigned char)(x >> 8) -#define mid2Byte(x) (unsigned char)(x >> 16) -#define high_byte(x) (unsigned char)(x >> 24) - -#define SWAPENDS 0 - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 28cbaed98..d17a32b82 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -17,13 +17,14 @@ extern "C" { #endif +#include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #include "vpx_scale/yv12config.h" -#include "type_aliases.h" #include "ppflags.h" - typedef int *VP8_PTR; + + struct VP8_COMP; /* Create/destroy static data structures. 
*/ @@ -104,7 +105,7 @@ extern "C" int Version; // 4 versions of bitstream defined 0 best quality/slowest decode, 3 lowest quality/fastest decode int Width; // width of data passed to the compressor int Height; // height of data passed to the compressor - double frame_rate; // set to passed in framerate + struct vpx_rational timebase; int target_bandwidth; // bandwidth to be used in kilobits per second int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 @@ -207,32 +208,45 @@ extern "C" unsigned int periodicity; unsigned int layer_id[MAX_PERIODICITY]; +#if CONFIG_MULTI_RES_ENCODING + /* Number of total resolutions encoded */ + unsigned int mr_total_resolutions; + + /* Current encoder ID */ + unsigned int mr_encoder_id; + + /* Down-sampling factor */ + vpx_rational_t mr_down_sampling_factor; + + /* Memory location to store low-resolution encoder's mode info */ + void* mr_low_res_mode_info; +#endif } VP8_CONFIG; void vp8_initialize(); - VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf); - void vp8_remove_compressor(VP8_PTR *comp); + struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf); + void vp8_remove_compressor(struct VP8_COMP* *comp); - void vp8_init_config(VP8_PTR onyx, VP8_CONFIG *oxcf); - void vp8_change_config(VP8_PTR onyx, VP8_CONFIG *oxcf); + void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf); + void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf); // receive a frames worth of data caller can assume that a copy of this frame is made // and not just a copy of the pointer.. - int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); - int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); - int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); - - int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags); - int vp8_update_reference(VP8_PTR comp, int ref_frame_flags); - int vp8_get_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); - int vp8_set_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); - int vp8_update_entropy(VP8_PTR comp, int update); - int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); - int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols); - int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); - int vp8_get_quantizer(VP8_PTR c); + int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); + int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); + int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); + + int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags); + int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags); + int vp8_get_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + int vp8_set_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + int 
vp8_update_entropy(struct VP8_COMP* comp, int update); + int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); + int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols); + int vp8_set_internal_size(struct VP8_COMP* comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); + int vp8_get_quantizer(struct VP8_COMP* c); #ifdef __cplusplus } diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index 936fa9f23..f733ff774 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -22,6 +22,7 @@ #if CONFIG_POSTPROC #include "postproc.h" #endif +#include "dequantize.h" /*#ifdef PACKET_TESTING*/ #include "header.h" @@ -73,6 +74,7 @@ typedef enum typedef struct VP8_COMMON_RTCD { #if CONFIG_RUNTIME_CPU_DETECT + vp8_dequant_rtcd_vtable_t dequant; vp8_idct_rtcd_vtable_t idct; vp8_recon_rtcd_vtable_t recon; vp8_subpix_rtcd_vtable_t subpix; diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h index 43fa00bd3..35a8b6e55 100644 --- a/vp8/common/onyxd.h +++ b/vp8/common/onyxd.h @@ -18,13 +18,13 @@ extern "C" { #endif -#include "type_aliases.h" #include "vpx_scale/yv12config.h" #include "ppflags.h" #include "vpx_ports/mem.h" #include "vpx/vpx_codec.h" - typedef void *VP8D_PTR; + struct VP8D_COMP; + typedef struct { int Width; @@ -49,19 +49,19 @@ extern "C" void vp8dx_initialize(void); - void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x); + void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x); - int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst); + int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst); - int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, int64_t time_stamp); - int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); + int vp8dx_receive_compressed_data(struct VP8D_COMP* comp, unsigned long size, const unsigned char *dest, int64_t time_stamp); + int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); - vpx_codec_err_t vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); - vpx_codec_err_t vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); - VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf); + struct VP8D_COMP* vp8dx_create_decompressor(VP8D_CONFIG *oxcf); - void vp8dx_remove_decompressor(VP8D_PTR comp); + void vp8dx_remove_decompressor(struct VP8D_COMP* comp); #ifdef __cplusplus } diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c index 1f5d79068..7046a63e8 100644 --- a/vp8/common/ppc/systemdependent.c +++ b/vp8/common/ppc/systemdependent.c @@ -9,7 +9,6 @@ */ -#include "g_common.h" #include "subpixel.h" #include "loopfilter.h" #include "recon.h" diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c index 24c09a353..6c7af41d4 100644 --- a/vp8/common/reconinter.c +++ b/vp8/common/reconinter.c @@ -334,11 +334,12 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x) /*encoder only*/ -void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x) 
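/* The replacement below threads the destination through the call instead
 * of writing into the fixed x->predictor buffer; a caller would now look
 * something like this (hypothetical, not part of this patch):
 *
 *   vp8_build_inter16x16_predictors_mby(xd, xd->dst.y_buffer,
 *                                       xd->dst.y_stride);
 */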
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, + unsigned char *dst_y, + int dst_ystride) { unsigned char *ptr_base; unsigned char *ptr; - unsigned char *pred_ptr = x->predictor; int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; int pre_stride = x->block[0].pre_stride; @@ -348,11 +349,13 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x) if ((mv_row | mv_col) & 7) { - x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16); + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, + dst_y, dst_ystride); } else { - RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16); + RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y, + dst_ystride); } } @@ -596,69 +599,3 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *xd) build_inter4x4_predictors_mb(xd); } } -/* encoder only*/ -static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x) -{ - int i; - - if (x->mode_info_context->mbmi.partitioning < 3) - { - x->block[ 0].bmi = x->mode_info_context->bmi[ 0]; - x->block[ 2].bmi = x->mode_info_context->bmi[ 2]; - x->block[ 8].bmi = x->mode_info_context->bmi[ 8]; - x->block[10].bmi = x->mode_info_context->bmi[10]; - - build_inter_predictors4b(x, &x->block[ 0], x->block[ 0].predictor, 16); - build_inter_predictors4b(x, &x->block[ 2], x->block[ 2].predictor, 16); - build_inter_predictors4b(x, &x->block[ 8], x->block[ 8].predictor, 16); - build_inter_predictors4b(x, &x->block[10], x->block[10].predictor, 16); - } - else - { - for (i = 0; i < 16; i += 2) - { - BLOCKD *d0 = &x->block[i]; - BLOCKD *d1 = &x->block[i+1]; - - x->block[i+0].bmi = x->mode_info_context->bmi[i+0]; - x->block[i+1].bmi = x->mode_info_context->bmi[i+1]; - - if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, d0->predictor, 16); - else - { - build_inter_predictors_b(d0, d0->predictor, 16, x->subpixel_predict); - build_inter_predictors_b(d1, d1->predictor, 16, x->subpixel_predict); - } - - } - - } - - for (i = 16; i < 24; i += 2) - { - BLOCKD *d0 = &x->block[i]; - BLOCKD *d1 = &x->block[i+1]; - - if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, d0->predictor, 8); - else - { - build_inter_predictors_b(d0, d0->predictor, 8, x->subpixel_predict); - build_inter_predictors_b(d1, d1->predictor, 8, x->subpixel_predict); - } - } -} -void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd) -{ - if (xd->mode_info_context->mbmi.mode != SPLITMV) - { - vp8_build_inter16x16_predictors_mb(xd, xd->predictor, &xd->predictor[256], - &xd->predictor[320], 16, 8); - } - else - { - build_4x4uvmvs(xd); - build_inter4x4_predictors_mb_e(xd); - } -} diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h index 86f9d5ae3..f57ff73c5 100644 --- a/vp8/common/reconinter.h +++ b/vp8/common/reconinter.h @@ -21,11 +21,13 @@ extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, int dst_uvstride); -extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x); -extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf); +extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, + unsigned char *dst_y, + int dst_ystride); +extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, + vp8_subpix_fn_t sppf); extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); -extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd); #endif diff --git 
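The reconinter.c/reconinter.h hunks around this point change vp8_build_inter16x16_predictors_mby() to take an explicit destination pointer and stride instead of always writing into the fixed 16-wide x->predictor scratch buffer, so a caller that wants the prediction in the frame itself can skip the intermediate copy. A hedged sketch of the two call shapes this enables:

    /* decoder-style call: predict straight into the frame buffer */
    vp8_build_inter16x16_predictors_mby(xd, xd->dst.y_buffer,
                                        xd->dst.y_stride);
    /* legacy-style call: the old scratch buffer, stride 16 */
    vp8_build_inter16x16_predictors_mby(xd, xd->predictor, 16);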
a/vp8/common/type_aliases.h b/vp8/common/type_aliases.h deleted file mode 100644 index 22b531a76..000000000 --- a/vp8/common/type_aliases.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -/**************************************************************************** -* -* Module Title : type_aliases.h -* -* Description : Standard type aliases -* -****************************************************************************/ -#ifndef __INC_TYPE_ALIASES_H -#define __INC_TYPE_ALIASES_H - -/**************************************************************************** -* Macros -****************************************************************************/ -#define EXPORT -#define IMPORT extern /* Used to declare imported data & routines */ -#define PRIVATE static /* Used to declare & define module-local data */ -#define LOCAL static /* Used to define all persistent routine-local data */ -#define STD_IN_PATH 0 /* Standard input path */ -#define STD_OUT_PATH 1 /* Standard output path */ -#define STD_ERR_PATH 2 /* Standard error path */ -#define STD_IN_FILE stdin /* Standard input file pointer */ -#define STD_OUT_FILE stdout /* Standard output file pointer */ -#define STD_ERR_FILE stderr /* Standard error file pointer */ -#define max_int 0x7FFFFFFF - -#define __export -#define _export - -#define CCONV - -#ifndef NULL -#ifdef __cplusplus -#define NULL 0 -#else -#define NULL ((void *)0) -#endif -#endif - -#ifndef FALSE -#define FALSE 0 -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - -/**************************************************************************** -* Typedefs -****************************************************************************/ -#ifndef TYPE_INT8 -#define TYPE_INT8 -typedef signed char INT8; -#endif - -#ifndef TYPE_INT16 -/*#define TYPE_INT16*/ -typedef signed short INT16; -#endif - -#ifndef TYPE_INT32 -/*#define TYPE_INT32*/ -typedef signed int INT32; -#endif - -#ifndef TYPE_UINT8 -/*#define TYPE_UINT8*/ -typedef unsigned char UINT8; -#endif - -#ifndef TYPE_UINT32 -/*#define TYPE_UINT32*/ -typedef unsigned int UINT32; -#endif - -#ifndef TYPE_UINT16 -/*#define TYPE_UINT16*/ -typedef unsigned short UINT16; -#endif - -#ifndef TYPE_BOOL -/*#define TYPE_BOOL*/ -typedef int BOOL; -#endif - -typedef unsigned char BOOLEAN; - -#ifdef _MSC_VER -typedef __int64 INT64; -#else - -#ifndef TYPE_INT64 -#ifdef _TMS320C6X -/* for now we only have 40bits */ -typedef long INT64; -#else -typedef long long INT64; -#endif -#endif - -#endif - -/* Floating point */ -typedef double FLOAT64; -typedef float FLOAT32; - -#endif diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/common/x86/dequantize_mmx.asm index 648bde4c5..de9eba89f 100644 --- a/vp8/decoder/x86/dequantize_mmx.asm +++ b/vp8/common/x86/dequantize_mmx.asm @@ -246,207 +246,6 @@ sym(vp8_dequant_idct_add_mmx): pop rbp ret - -;void dequant_dc_idct_add_mmx( -;short *input, 0 -;short *dq, 1 -;unsigned char *dest, 2 -;int stride, 3 -;int Dc) 4 -global sym(vp8_dequant_dc_idct_add_mmx) -sym(vp8_dequant_dc_idct_add_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - ; end prolog - - mov rax, arg(0) ;input - mov rdx, arg(1) ;dq - - 
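With type_aliases.h deleted in full above, its homegrown aliases map onto the C99 fixed-width types this tree already uses (onyxd.h's int64_t timestamps, for example). Assuming conventional ILP32/LP64 targets, the correspondence is roughly:

    #include <stdint.h>
    /* equivalents of the deleted aliases, for readers of older code;
     * not something this change itself adds anywhere */
    typedef int8_t   INT8;     /* was: signed char          */
    typedef int16_t  INT16;    /* was: signed short         */
    typedef int32_t  INT32;    /* was: signed int           */
    typedef uint8_t  UINT8;    /* was: unsigned char        */
    typedef uint16_t UINT16;   /* was: unsigned short       */
    typedef uint32_t UINT32;   /* was: unsigned int         */
    typedef int64_t  INT64;    /* was: long long / __int64  */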
movq mm0, [rax ] - pmullw mm0, [rdx] - - movq mm1, [rax +8] - pmullw mm1, [rdx +8] - - movq mm2, [rax+16] - pmullw mm2, [rdx+16] - - movq mm3, [rax+24] - pmullw mm3, [rdx+24] - - mov rdx, arg(2) ;pred - pxor mm7, mm7 - - - movq [rax], mm7 - movq [rax+8], mm7 - - movq [rax+16],mm7 - movq [rax+24],mm7 - - ; move lower word of Dc to lower word of mm0 - psrlq mm0, 16 - movzx rcx, word ptr arg(4) ;Dc - psllq mm0, 16 - movq mm7, rcx - por mm0, mm7 - - movsxd rax, dword ptr arg(3) ;stride - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - movq mm3, mm5 ; 33 23 13 03 - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - paddw mm0, [GLOBAL(fours)] - - paddw mm2, [GLOBAL(fours)] - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - psraw mm2, 3 - - psraw mm0, 3 - psraw mm4, 3 - - psraw mm6, 3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - pxor mm7, mm7 - - movd mm4, [rdx] - punpcklbw mm4, mm7 - paddsw mm0, mm4 - packuswb mm0, mm7 - movd [rdx], mm0 - - movd mm4, [rdx+rax] - punpcklbw mm4, mm7 - paddsw mm1, mm4 - packuswb mm1, mm7 - movd [rdx+rax], mm1 - - movd mm4, [rdx+2*rax] - punpcklbw mm4, mm7 - paddsw mm2, mm4 - packuswb mm2, mm7 - movd [rdx+rax*2], mm2 - - add rdx, rax - - movd mm4, [rdx+2*rax] - punpcklbw mm4, mm7 - paddsw mm5, mm4 - packuswb mm5, mm7 - movd [rdx+rax*2], mm5 - - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - SECTION_RODATA align 16 x_s1sqr2: diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/common/x86/dequantize_x86.h index dc68daab3..49bcb7f19 100644 --- a/vp8/decoder/x86/dequantize_x86.h +++ b/vp8/common/x86/dequantize_x86.h @@ -22,8 +22,6 @@ #if HAVE_MMX extern prototype_dequant_block(vp8_dequantize_b_mmx); extern 
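The MMX routine deleted above fused dequantization, a 4x4 inverse DCT, and the add-to-prediction, taking the DC coefficient as a separate argument. The dedicated DC variants can go because the reworked inverse WHT later in this diff deposits each DC directly into its coefficient block, leaving the plain idct_add path to do the rest. The dequantization step that the pmullw pairs implement, in scalar form:

    /* per-coefficient dequantize, as in the deleted kernel */
    for (i = 0; i < 16; i++)
        dqcoeff[i] = qcoeff[i] * dequant[i];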
prototype_dequant_idct_add(vp8_dequant_idct_add_mmx); -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx); -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); @@ -34,12 +32,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); #undef vp8_dequant_idct_add #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx -#undef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx - -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx - #undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx @@ -50,14 +42,10 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); #endif #if HAVE_SSE2 -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2 - #undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c new file mode 100644 index 000000000..ebab814f4 --- /dev/null +++ b/vp8/common/x86/filter_x86.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = +{ + { 128, 128, 128, 128, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 112, 112, 112, 112 } +}; + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = +{ + { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } +}; diff --git a/vp8/common/x86/filter_x86.h b/vp8/common/x86/filter_x86.h new file mode 100644 index 000000000..efcc4dc2a --- /dev/null +++ b/vp8/common/x86/filter_x86.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. 
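The tables in the new filter_x86.c are the scalar bilinear coefficients duplicated across SIMD lanes: for eighth-pel offset o the two taps are 128 - 16*o and 16*o, which always sum to the filter weight of 128. A sketch of a generator that would produce the 8-wide table (the file keeps the values written out so they can sit in const storage):

    short table8[8][16];
    for (int o = 0; o < 8; o++)
        for (int lane = 0; lane < 8; lane++) {
            table8[o][lane]     = (short)(128 - 16 * o); /* tap 0 per lane */
            table8[o][8 + lane] = (short)(16 * o);       /* tap 1 per lane */
        }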
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef FILTER_X86_H +#define FILTER_X86_H + +/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with + * duplicated values */ +extern const short vp8_bilinear_filters_x86_4[8][8]; /* duplicated 4x */ +extern const short vp8_bilinear_filters_x86_8[8][16]; /* duplicated 8x */ + +#endif /* FILTER_X86_H */ diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/common/x86/idct_blk_mmx.c index 37de5b9fd..49cebd6f5 100644 --- a/vp8/decoder/x86/idct_blk_mmx.c +++ b/vp8/common/x86/idct_blk_mmx.c @@ -10,41 +10,16 @@ #include "vpx_config.h" #include "vp8/common/idct.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" -void vp8_dequant_dc_idct_add_y_block_mmx - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (eobs[0] > 1) - vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]); - else if (eobs[0] == 1) - vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride); - - if (eobs[1] > 1) - vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]); - else if (eobs[1] == 1) - vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride); +extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); - if (eobs[2] > 1) - vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]); - else if (eobs[2] == 1) - vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride); - - if (eobs[3] > 1) - vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]); - else if (eobs[3] == 1) - vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride); - - q += 64; - dc += 4; - dst += 4*stride; - eobs += 4; - } +void vp8_dequantize_b_mmx(BLOCKD *d) +{ + short *sq = (short *) d->qcoeff; + short *dq = (short *) d->dqcoeff; + short *q = (short *) d->dequant; + vp8_dequantize_b_impl_mmx(sq, dq, q); } void vp8_dequant_idct_add_y_block_mmx diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/common/x86/idct_blk_sse2.c index 0495b0610..44e440c0c 100644 --- a/vp8/decoder/x86/idct_blk_sse2.c +++ b/vp8/common/x86/idct_blk_sse2.c @@ -10,14 +10,7 @@ #include "vpx_config.h" #include "vp8/common/idct.h" -#include "vp8/decoder/dequantize.h" - -void vp8_idct_dequant_dc_0_2x_sse2 - (short *q, short *dq, - unsigned char *dst, int dst_stride, short *dc); -void vp8_idct_dequant_dc_full_2x_sse2 - (short *q, short *dq, - unsigned char *dst, int dst_stride, short *dc); +#include "vp8/common/dequantize.h" void vp8_idct_dequant_0_2x_sse2 (short *q, short *dq , @@ -26,36 +19,6 @@ void vp8_idct_dequant_full_2x_sse2 (short *q, short *dq , unsigned char *dst, int dst_stride); -void vp8_dequant_dc_idct_add_y_block_sse2 - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (((short *)(eobs))[0]) - { - if (((short *)(eobs))[0] & 0xfefe) - vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc); - else - vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc); - } - - if (((short *)(eobs))[1]) - { - if (((short *)(eobs))[1] & 0xfefe) - vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2); - else - vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2); - } - q += 64; - dc += 4; - dst 
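The loop removed from idct_blk_mmx.c above shows the per-block dispatch that this change keeps using elsewhere (decodframe.c later in the diff): an end-of-block count above one means real AC coefficients and a full dequant+IDCT, while exactly one means a DC-only block whose inverse transform is flat and needs only a cheap scalar add.

    /* eob-driven dispatch, in scalar form */
    if (eobs[i] > 1)
        dequant_idct_add(q, dq, dst, stride);        /* full 4x4 path */
    else if (eobs[i] == 1)
        dc_only_idct_add(q[0] * dq[0], dst, stride,
                         dst, stride);               /* flat DC add   */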
+= stride*4; - eobs += 4; - } -} - void vp8_dequant_idct_add_y_block_sse2 (short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h index f9e3a794d..06e3ea4b5 100644 --- a/vp8/common/x86/idct_x86.h +++ b/vp8/common/x86/idct_x86.h @@ -24,7 +24,6 @@ extern prototype_idct(vp8_short_idct4x4llm_mmx); extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx); extern prototype_second_order(vp8_short_inv_walsh4x4_mmx); -extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_idct_idct16 @@ -36,9 +35,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx); #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx - #endif #endif diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm index 10b5274dc..6582687e1 100644 --- a/vp8/common/x86/iwalsh_mmx.asm +++ b/vp8/common/x86/iwalsh_mmx.asm @@ -11,162 +11,129 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output) -global sym(vp8_short_inv_walsh4x4_1_mmx) -sym(vp8_short_inv_walsh4x4_1_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rax, 3 - - mov rdi, arg(1) - add rax, [rsi] ;input[0] + 3 - - movd mm0, eax - - punpcklwd mm0, mm0 ;x x val val - - punpckldq mm0, mm0 ;val val val val - - psraw mm0, 3 ;(input[0] + 3) >> 3 - - movq [rdi + 0], mm0 - movq [rdi + 8], mm0 - movq [rdi + 16], mm0 - movq [rdi + 24], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - ;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) global sym(vp8_short_inv_walsh4x4_mmx) sym(vp8_short_inv_walsh4x4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi ; end prolog - mov rax, 3 - mov rsi, arg(0) - mov rdi, arg(1) - shl rax, 16 - - movq mm0, [rsi + 0] ;ip[0] - movq mm1, [rsi + 8] ;ip[4] - or rax, 3 ;00030003h - - movq mm2, [rsi + 16] ;ip[8] - movq mm3, [rsi + 24] ;ip[12] - - movq mm7, rax - movq mm4, mm0 + mov rdx, arg(0) + mov rax, 30003h - punpcklwd mm7, mm7 ;0003000300030003h - movq mm5, mm1 + movq mm0, [rdx + 0] ;ip[0] + movq mm1, [rdx + 8] ;ip[4] + movd mm7, rax - paddw mm4, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl + movq mm2, [rdx + 16] ;ip[8] + movq mm3, [rdx + 24] ;ip[12] + punpcklwd mm7, mm7 ;0003000300030003h + mov rdx, arg(1) - movq mm6, mm4 ;temp al + movq mm4, mm0 + movq mm5, mm1 - paddw mm4, mm5 ;al + bl - psubw mm6, mm5 ;al - bl + paddw mm4, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm1, mm2 ;ip[4] - ip[8] aka c1 + movq mm6, mm4 ;temp al + paddw mm4, mm5 ;al + bl + psubw mm6, mm5 ;al - bl - movq mm5, mm0 ;temp dl + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - paddw mm0, mm1 ;dl + cl - psubw mm5, mm1 ;dl - cl + movq mm5, mm0 ;temp dl + paddw mm0, mm1 ;dl + cl + psubw mm5, mm1 ;dl - cl ; 03 02 01 00 ; 13 12 11 10 ; 23 22 21 20 ; 33 32 31 30 - movq mm3, mm4 ; 03 02 01 00 - punpcklwd mm4, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 + movq mm3, mm4 ; 03 02 01 00 + punpcklwd mm4, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 - movq mm1, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm1, mm5 ; 33 23 32 22 + movq mm1, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm1, 
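The iwalsh_mmx.asm rewrite in progress here vectorizes the 4x4 inverse Walsh-Hadamard transform applied to the second-order (Y2) block. Following the a1/b1/c1/d1 names in the register comments, one butterfly pass looks like the sketch below; it runs once across rows and once across columns, with the 30003h constant supplying the +3 rounding bias ahead of the final shift:

    a1 = ip[0] + ip[12];    b1 = ip[4] + ip[8];
    c1 = ip[4] - ip[8];     d1 = ip[0] - ip[12];
    op[0]  = a1 + b1;       op[4]  = d1 + c1;
    op[8]  = a1 - b1;       op[12] = d1 - c1;
    /* second pass additionally does: op[n] = (op[n] + 3) >> 3 */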
mm5 ; 33 23 32 22 - movq mm0, mm4 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 + movq mm0, mm4 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] - punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] + punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] ;~~~~~~~~~~~~~~~~~~~~~ - movq mm1, mm0 - movq mm5, mm4 - - paddw mm1, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm1 ;temp al - - paddw mm1, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm4, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm4 ;dl + cl - psubw mm5, mm4 ;dl - cl + movq mm1, mm0 + movq mm5, mm4 + paddw mm1, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm1 ;temp al + paddw mm1, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + paddw mm1, mm7 + paddw mm6, mm7 + psraw mm1, 3 + psraw mm6, 3 + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm4, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + paddw mm0, mm4 ;dl + cl + psubw mm5, mm4 ;dl - cl + paddw mm0, mm7 + paddw mm5, mm7 + psraw mm0, 3 + psraw mm5, 3 ;~~~~~~~~~~~~~~~~~~~~~ - movq mm3, mm1 ; 03 02 01 00 - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm4, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm4, mm5 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12] - - paddw mm0, mm7 - paddw mm1, mm7 - paddw mm2, mm7 - paddw mm3, mm7 - - psraw mm0, 3 - psraw mm1, 3 - psraw mm2, 3 - psraw mm3, 3 - movq [rdi + 0], mm0 - movq [rdi + 8], mm1 - movq [rdi + 16], mm2 - movq [rdi + 24], mm3 + movd eax, mm1 + movd ecx, mm0 + psrlq mm0, 32 + psrlq mm1, 32 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*1], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*5], cx + movd eax, mm1 + movd ecx, mm0 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*9], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*13], cx + + movd eax, mm6 + movd ecx, mm5 + psrlq mm5, 32 + psrlq mm6, 32 + mov word ptr[rdx+32*2], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*6], ax + mov word ptr[rdx+32*7], cx + movd eax, mm6 + movd ecx, mm5 + mov word ptr[rdx+32*10], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*14], ax + mov word ptr[rdx+32*15], cx ; begin epilog - pop rdi - pop rsi UNSHADOW_ARGS pop rbp ret diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm index 1da4fd8da..51cb5e21c 100644 --- a/vp8/common/x86/iwalsh_sse2.asm +++ b/vp8/common/x86/iwalsh_sse2.asm @@ -17,103 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 - SAVE_XMM 6 - push rsi - push rdi ; end prolog - mov rsi, arg(0) - mov rdi, arg(1) - mov rax, 3 + mov rcx, arg(0) + mov rdx, arg(1) + mov rax, 30003h - movdqa xmm0, [rsi + 0] ;ip[4] ip[0] - movdqa xmm1, [rsi + 16] ;ip[12] ip[8] + movdqa xmm0, [rcx + 0] ;ip[4] ip[0] + movdqa xmm1, [rcx + 16] ;ip[12] ip[8] - shl rax, 16 - or rax, 3 ;00030003h - pshufd xmm2, xmm1, 4eh ;ip[8] 
ip[12] - movdqa xmm3, xmm0 ;ip[4] ip[0] + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] - paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - movdqa xmm4, xmm0 + movdqa xmm4, xmm0 punpcklqdq xmm0, xmm3 ;d1 a1 punpckhqdq xmm4, xmm3 ;c1 b1 - movd xmm6, eax - movdqa xmm1, xmm4 ;c1 b1 - paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] -;;;temp output -;; movdqu [rdi + 0], xmm4 -;; movdqu [rdi + 16], xmm3 - -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; 13 12 11 10 03 02 01 00 ; ; 33 32 31 30 23 22 21 20 ; - movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm4 ;ip[4] ip[0] + movd xmm0, eax + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] - pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 + pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03 - paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - movdqa xmm5, xmm4 + movdqa xmm5, xmm4 punpcklqdq xmm4, xmm3 ;d1 a1 punpckhqdq xmm5, xmm3 ;c1 b1 - movdqa xmm1, xmm5 ;c1 b1 - paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - paddw xmm5, xmm6 - paddw xmm1, xmm6 - - psraw xmm5, 3 - psraw xmm1, 3 - - movdqa [rdi + 0], xmm5 - movdqa [rdi + 16], xmm1 + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + paddw xmm5, xmm0 + paddw xmm4, xmm0 + psraw xmm5, 3 + psraw xmm4, 3 + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*2], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*6], cx + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*10], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*14], cx + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*1], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word 
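Both the MMX and SSE2 versions now finish with the word-store runs seen around this point: result n is written at byte offset 32*n from the output pointer rather than into a contiguous 4x4 block. Since each coefficient block is 16 shorts (32 bytes), that lands every WHT output in the DC slot of one of the 16 luma blocks, which is what makes the separate dequant_dc_* kernels deleted elsewhere in this diff unnecessary. In scalar terms:

    short *blocks = output;             /* first block's coeff array       */
    for (n = 0; n < 16; n++)
        blocks[16 * n] = wht_out[n];    /* byte offset 32*n: DC of block n */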
ptr[rdx+32*5], ax + mov word ptr[rdx+32*7], cx + movd eax, xmm5 + movd ecx, xmm4 + mov word ptr[rdx+32*9], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*13], ax + mov word ptr[rdx+32*15], cx ; begin epilog - pop rdi - pop rsi - RESTORE_XMM UNSHADOW_ARGS pop rbp ret - -SECTION_RODATA -align 16 -x_s1sqr2: - times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 4 dw 0x4E7B -align 16 -fours: - times 4 dw 0x0004 diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index 86927d9f1..2ad010adb 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -1385,52 +1385,54 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): SHADOW_ARGS_TO_STACK 3 SAVE_XMM 7 GET_GOT rbx - push rsi - push rdi ; end prolog - mov rsi, arg(0) ;src_ptr + mov rcx, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - mov rdx, arg(2) ;blimit - movdqa xmm3, XMMWORD PTR [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax + lea rdx, [rcx + rax] neg rax ; calculate mask - movdqa xmm1, [rsi+2*rax] ; p1 - movdqa xmm0, [rdi] ; q1 + movdqa xmm0, [rdx] ; q1 + mov rdx, arg(2) ;blimit + movdqa xmm1, [rcx+2*rax] ; p1 + movdqa xmm2, xmm1 movdqa xmm7, xmm0 - movdqa xmm4, xmm0 + psubusb xmm0, xmm1 ; q1-=p1 - psubusb xmm1, xmm4 ; p1-=q1 + psubusb xmm1, xmm7 ; p1-=q1 por xmm1, xmm0 ; abs(p1-q1) pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw xmm1, 1 ; abs(p1-q1)/2 - movdqa xmm5, [rsi+rax] ; p0 - movdqa xmm4, [rsi] ; q0 + movdqa xmm3, XMMWORD PTR [rdx] + + movdqa xmm5, [rcx+rax] ; p0 + movdqa xmm4, [rcx] ; q0 movdqa xmm0, xmm4 ; q0 movdqa xmm6, xmm5 ; p0 psubusb xmm5, xmm4 ; p0-=q0 psubusb xmm4, xmm6 ; q0-=p0 por xmm5, xmm4 ; abs(p0 - q0) + + movdqa xmm4, [GLOBAL(t80)] + paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm3, xmm3 pcmpeqb xmm5, xmm3 + ; start work on filters - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + pxor xmm2, xmm4 ; p1 offset to convert to signed values + pxor xmm7, xmm4 ; q1 offset to convert to signed values psubsb xmm2, xmm7 ; p1 - q1 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values + pxor xmm6, xmm4 ; offset to convert to signed values + pxor xmm0, xmm4 ; offset to convert to signed values movdqa xmm3, xmm0 ; q0 psubsb xmm0, xmm6 ; q0 - p0 paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) @@ -1438,42 +1440,36 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) pand xmm5, xmm2 ; mask filter values we don't care about - ; do + 4 side - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - movdqa xmm1, xmm5 ; get a copy of filters - psraw xmm1, 11 ; arithmetic shift right 11 - psllw xmm1, 8 ; shift left 8 to put it back - - por xmm0, xmm1 ; put the two together to get result + paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + movdqa xmm0, xmm5 + psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - psubsb xmm3, xmm0 ; q0-= q0 add - pxor xmm3, [GLOBAL(t80)] ; unoffset - movdqa [rsi], xmm3 ; write back + movdqa xmm1, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] - ; now do +3 side - psubsb xmm5, 
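The simple-loopfilter hunk continuing below reschedules instructions and reassigns registers without changing the edge test itself; per the register comments, the per-pixel mask is:

    /* filter only where the step across the edge is small enough */
    int filter_me = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit);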
[GLOBAL(t1s)] ; +3 instead of +4 + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm6, xmm5 ; p0+= p0 add + pxor xmm3, xmm4 ; unoffset + movdqa [rcx], xmm3 ; write back - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset - movdqa [rsi+rax], xmm6 ; write back + pxor xmm6, xmm4 ; unoffset + movdqa [rcx+rax], xmm6 ; write back ; begin epilog - pop rdi - pop rsi RESTORE_GOT RESTORE_XMM UNSHADOW_ARGS @@ -1536,9 +1532,6 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - movdqa t0, xmm0 ; save to t0 - movdqa t1, xmm2 ; save to t1 - lea rsi, [rsi + rax*8] lea rdi, [rsi + rax] lea rdx, [rsi + rax*4] @@ -1551,26 +1544,24 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 - movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 + movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0 movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 - movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 + movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0 movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 - punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 - punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 + punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 + punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 - punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 + punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 - movdqa xmm1, xmm4 - punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + movdqa xmm7, xmm4 + punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 movdqa xmm6, xmm4 - punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 movdqa xmm1, xmm0 movdqa xmm3, xmm2 @@ -1579,6 +1570,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + mov rdx, arg(2) ;blimit + ; calculate mask movdqa xmm6, xmm0 ; p1 movdqa xmm7, xmm3 ; q1 @@ -1588,6 +1581,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw xmm6, 1 ; abs(p1-q1)/2 + movdqa xmm7, [rdx] + movdqa xmm5, xmm1 ; p0 movdqa 
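The te0/t1f sequence above replaces the old psllw/psraw/psrlw word-lane gymnastics. SSE2 has no per-byte arithmetic shift, so the new code emulates a signed >>3 by shifting logically and patching the three sign bits back in; a scalar equivalent:

    #include <stdint.h>
    static int8_t sar3(int8_t v)
    {
        uint8_t u = (uint8_t)v >> 3;  /* psrlw, then t1f clears stray bits */
        if (v < 0)                    /* pcmpgtb saves the sign            */
            u |= 0xe0;                /* te0 restores the top 3 bits       */
        return (int8_t)u;
    }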
xmm4, xmm2 ; q0 psubusb xmm5, xmm2 ; p0-=q0 @@ -1596,8 +1591,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;blimit - movdqa xmm7, XMMWORD PTR [rdx] + movdqa xmm4, [GLOBAL(t80)] psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm7, xmm7 @@ -1607,59 +1601,48 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): movdqa t0, xmm0 movdqa t1, xmm3 - pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - + pxor xmm0, xmm4 ; p1 offset to convert to signed values + pxor xmm3, xmm4 ; q1 offset to convert to signed values psubsb xmm0, xmm3 ; p1 - q1 - movdqa xmm6, xmm1 ; p0 - - movdqa xmm7, xmm2 ; q0 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm7 ; offseted ; q0 - - psubsb xmm7, xmm6 ; q0 - p0 - paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) + movdqa xmm6, xmm1 ; p0 +; movdqa xmm7, xmm2 ; q0 - paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) + pxor xmm6, xmm4 ; offset to convert to signed values + pxor xmm2, xmm4 ; offset to convert to signed values + movdqa xmm3, xmm2 ; offseted ; q0 + psubsb xmm2, xmm6 ; q0 - p0 + paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0) pand xmm5, xmm0 ; mask filter values we don't care about - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - - movdqa xmm7, xmm5 ; get a copy of filters - psraw xmm7, 11 ; arithmetic shift right 11 - - psllw xmm7, 8 ; shift left 8 to put it back - por xmm0, xmm7 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0sz add - pxor xmm3, [GLOBAL(t80)] ; unoffset q0 - - ; now do +3 side + movdqa xmm0, xmm5 psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 + movdqa xmm1, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm6, xmm5 ; p0+= p0 add - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset p0 + pxor xmm3, xmm4 ; unoffset q0 + pxor xmm6, xmm4 ; unoffset p0 movdqa xmm0, t0 ; p1 movdqa xmm4, t1 ; q1 @@ -1763,3 +1746,9 @@ s9: align 16 s63: times 8 dw 0x003f +align 16 +te0: + times 16 db 0xe0 +align 16 +t1f: + times 16 db 0x1f diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm index e68d950ad..5528fd0e6 100644 --- a/vp8/common/x86/subpixel_mmx.asm +++ b/vp8/common/x86/subpixel_mmx.asm @@ -10,6 +10,7 @@ %include "vpx_ports/x86_abi_support.asm" +extern sym(vp8_bilinear_filters_x86_8) %define BLOCK_HEIGHT_WIDTH 4 @@ -222,14 +223,14 @@ 
sym(vp8_bilinear_predict8x8_mmx): push rdi ; end prolog - ;const short *HFilter = bilinear_filters_mmx[xoffset]; - ;const short *VFilter = bilinear_filters_mmx[yoffset]; + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; movsxd rax, dword ptr arg(2) ;xoffset mov rdi, arg(4) ;dst_ptr ; shl rax, 5 ; offset * 32 - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] add rax, rcx ; HFilter mov rsi, arg(0) ;src_ptr ; @@ -379,13 +380,13 @@ sym(vp8_bilinear_predict8x4_mmx): push rdi ; end prolog - ;const short *HFilter = bilinear_filters_mmx[xoffset]; - ;const short *VFilter = bilinear_filters_mmx[yoffset]; + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; movsxd rax, dword ptr arg(2) ;xoffset mov rdi, arg(4) ;dst_ptr ; - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] shl rax, 5 mov rsi, arg(0) ;src_ptr ; @@ -534,13 +535,13 @@ sym(vp8_bilinear_predict4x4_mmx): push rdi ; end prolog - ;const short *HFilter = bilinear_filters_mmx[xoffset]; - ;const short *VFilter = bilinear_filters_mmx[yoffset]; + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; movsxd rax, dword ptr arg(2) ;xoffset mov rdi, arg(4) ;dst_ptr ; - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] shl rax, 5 add rax, rcx ; HFilter @@ -699,29 +700,3 @@ sym(vp8_six_tap_mmx): times 8 dw 0 -align 16 -global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx)) -sym(vp8_bilinear_filters_mmx): - times 8 dw 128 - times 8 dw 0 - - times 8 dw 112 - times 8 dw 16 - - times 8 dw 96 - times 8 dw 32 - - times 8 dw 80 - times 8 dw 48 - - times 8 dw 64 - times 8 dw 64 - - times 8 dw 48 - times 8 dw 80 - - times 8 dw 32 - times 8 dw 96 - - times 8 dw 16 - times 8 dw 112 diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm index b62b5c68d..cb550af59 100644 --- a/vp8/common/x86/subpixel_sse2.asm +++ b/vp8/common/x86/subpixel_sse2.asm @@ -10,6 +10,7 @@ %include "vpx_ports/x86_abi_support.asm" +extern sym(vp8_bilinear_filters_x86_8) %define BLOCK_HEIGHT_WIDTH 4 %define VP8_FILTER_WEIGHT 128 @@ -961,7 +962,7 @@ sym(vp8_unpack_block1d16_h6_sse2): ; unsigned char *dst_ptr, ; int dst_pitch ;) -extern sym(vp8_bilinear_filters_mmx) +extern sym(vp8_bilinear_filters_x86_8) global sym(vp8_bilinear_predict16x16_sse2) sym(vp8_bilinear_predict16x16_sse2): push rbp @@ -973,10 +974,10 @@ sym(vp8_bilinear_predict16x16_sse2): push rdi ; end prolog - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] movsxd rax, dword ptr arg(2) ;xoffset cmp rax, 0 ;skip first_pass filter if xoffset=0 @@ -1230,7 +1231,6 @@ sym(vp8_bilinear_predict16x16_sse2): ; unsigned char *dst_ptr, ; int dst_pitch ;) -extern sym(vp8_bilinear_filters_mmx) global sym(vp8_bilinear_predict8x8_sse2) sym(vp8_bilinear_predict8x8_sse2): push rbp @@ -1245,9 +1245,9 @@ sym(vp8_bilinear_predict8x8_sse2): ALIGN_STACK 16, rax sub rsp, 144 ; reserve 144 bytes - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = 
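The renames in subpixel_mmx.asm point the kernels at the shared tables from the new filter_x86.c, and the asm-local copy of the data below gets deleted. With VP8_FILTER_WEIGHT defined as 128 in the surrounding context, each tap pair is applied as in this sketch (rounding term of weight/2 and shift of 7, per the codebase's usual convention; hf is the selected HFilter row):

    /* horizontal first pass of the bilinear predictor */
    pred[x] = (src[x] * hf[0] + src[x + 1] * hf[1] + 64) >> 7;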
bilinear_filters_mmx[yoffset] - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] mov rsi, arg(0) ;src_ptr movsxd rdx, dword ptr arg(1) ;src_pixels_per_line diff --git a/vp8/common/x86/subpixel_x86.h b/vp8/common/x86/subpixel_x86.h index 75991cc4f..01ec9e210 100644 --- a/vp8/common/x86/subpixel_x86.h +++ b/vp8/common/x86/subpixel_x86.h @@ -12,6 +12,8 @@ #ifndef SUBPIXEL_X86_H #define SUBPIXEL_X86_H +#include "filter_x86.h" + /* Note: * * This platform is commonly built for runtime CPU detection. If you modify diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index bce7bc38e..a623c69b4 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -12,9 +12,9 @@ #include "vpx_config.h" #include "vpx_ports/mem.h" #include "vp8/common/subpixel.h" +#include "filter_x86.h" extern const short vp8_six_tap_mmx[8][6*8]; -extern const short vp8_bilinear_filters_mmx[8][2*8]; extern void vp8_filter_block1d_h6_mmx ( diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index eb36d899d..e1e1b7987 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -11,7 +11,6 @@ #include "vpx_config.h" #include "vpx_ports/x86.h" -#include "vp8/common/g_common.h" #include "vp8/common/subpixel.h" #include "vp8/common/loopfilter.h" #include "vp8/common/recon.h" @@ -37,12 +36,14 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) if (flags & HAS_MMX) { + rtcd->dequant.block = vp8_dequantize_b_mmx; + rtcd->dequant.idct_add = vp8_dequant_idct_add_mmx; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx; + rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx; + rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx; rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx; - - rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx; rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx; @@ -89,6 +90,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->recon.build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; + rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_sse2; rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_sse2; diff --git a/vp8/decoder/arm/arm_dsystemdependent.c b/vp8/decoder/arm/arm_dsystemdependent.c index 1b0091cdb..bf0a3481a 100644 --- a/vp8/decoder/arm/arm_dsystemdependent.c +++ b/vp8/decoder/arm/arm_dsystemdependent.c @@ -11,9 +11,6 @@ #include "vpx_config.h" #include "vpx_ports/arm.h" -#include "vp8/common/blockd.h" -#include "vp8/common/pragmas.h" -#include "vp8/decoder/dequantize.h" #include "vp8/decoder/onyxd_int.h" void vp8_arch_arm_decode_init(VP8D_COMP *pbi) @@ -30,25 +27,12 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi) #if HAVE_ARMV6 if (flags & HAS_MEDIA) { - pbi->dequant.block = vp8_dequantize_b_v6; - pbi->dequant.idct_add = vp8_dequant_idct_add_v6; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_v6; - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6; } #endif #if HAVE_ARMV7 if (flags & 
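x86_systemdependent.c above illustrates the runtime-CPU-detection idiom the relocated dequant functions now plug into: probe the CPU once, then fill the RTCD vtable, with later (stronger) ISA checks overriding earlier choices. Condensed from the hunk, assuming the SSE2 assignments sit under the tree's matching HAS_SSE2 check:

    if (flags & HAS_MMX) {
        rtcd->dequant.block    = vp8_dequantize_b_mmx;
        rtcd->dequant.idct_add = vp8_dequant_idct_add_mmx;
    }
    if (flags & HAS_SSE2)   /* overrides the MMX picks where faster */
        rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;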
HAS_NEON) { - pbi->dequant.block = vp8_dequantize_b_neon; - pbi->dequant.idct_add = vp8_dequant_idct_add_neon; - /*This is not used: NEON always dequants two blocks at once. - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_neon;*/ - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon; } #endif #endif diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm deleted file mode 100644 index 19f94e089..000000000 --- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm +++ /dev/null @@ -1,213 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_dequant_dc_idct_add_v6| - - AREA |.text|, CODE, READONLY - -;void vp8_dequant_dc_idct_v6(short *input, short *dq, -; unsigned char *dest, int stride, int Dc) -; r0 = input -; r1 = dq -; r2 = dst -; r3 = stride -; sp + 36 = Dc - - -|vp8_dequant_dc_idct_add_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r6, [sp, #36] - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r3, [sp] - - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - mov r12, #3 - -vp8_dequant_dc_add_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne vp8_dequant_dc_add_loop - - sub r0, r0, #32 - mov r1, r0 - -; short_idct4x4llm_v6_dual - ldr r3, cospi8sqrt2minus1 - ldr r4, sinpi8sqrt2 - ldr r6, [r0, #8] - mov r5, #2 -vp8_dequant_dc_idct_loop1_v6 - ldr r12, [r0, #24] - ldr r14, [r0, #16] - smulwt r9, r3, r6 - smulwb r7, r3, r6 - smulwt r10, r4, r6 - smulwb r8, r4, r6 - pkhbt r7, r7, r9, lsl #16 - smulwt r11, r3, r12 - pkhbt r8, r8, r10, lsl #16 - uadd16 r6, r6, r7 - smulwt r7, r4, r12 - smulwb r9, r3, r12 - smulwb r10, r4, r12 - subs r5, r5, #1 - pkhbt r9, r9, r11, lsl #16 - ldr r11, [r0], #4 - pkhbt r10, r10, r7, lsl #16 - uadd16 r7, r12, r9 - usub16 r7, r8, r7 - uadd16 r6, r6, r10 - uadd16 r10, r11, r14 - usub16 r8, r11, r14 - uadd16 r9, r10, r6 - usub16 r10, r10, r6 - uadd16 r6, r8, r7 - usub16 r7, r8, r7 - str r6, [r1, #8] - ldrne r6, [r0, #8] - str r7, [r1, #16] - str r10, [r1, #24] - str r9, [r1], #4 - bne vp8_dequant_dc_idct_loop1_v6 - - mov r5, #2 - sub r0, r1, #8 -vp8_dequant_dc_idct_loop2_v6 - ldr r6, [r0], #4 - ldr r7, [r0], #4 - ldr r8, [r0], #4 - ldr r9, [r0], #4 - smulwt r1, r3, r6 - smulwt r12, r4, r6 - smulwt lr, r3, r8 - smulwt r10, r4, r8 - pkhbt r11, r8, r6, lsl #16 - pkhbt r1, lr, r1, lsl #16 - pkhbt r12, r10, r12, lsl #16 - pkhtb r6, r6, r8, asr #16 - uadd16 r6, r1, r6 - pkhbt lr, r9, r7, lsl #16 - uadd16 r10, r11, lr - usub16 lr, r11, lr - pkhtb r8, r7, r9, asr #16 - subs r5, r5, #1 - smulwt r1, r3, r8 - smulwb r7, r3, r8 - smulwt r11, r4, r8 - smulwb r9, r4, r8 - pkhbt r1, r7, r1, lsl #16 - uadd16 r8, r1, r8 - pkhbt 
r11, r9, r11, lsl #16 - usub16 r1, r12, r8 - uadd16 r8, r11, r6 - ldr r9, c0x00040004 - ldr r12, [sp] ; get stride from stack - uadd16 r6, r10, r8 - usub16 r7, r10, r8 - uadd16 r7, r7, r9 - uadd16 r6, r6, r9 - uadd16 r10, r14, r1 - usub16 r1, r14, r1 - uadd16 r10, r10, r9 - uadd16 r1, r1, r9 - ldr r11, [r2] ; load input from dst - mov r8, r7, asr #3 - pkhtb r9, r8, r10, asr #19 - mov r8, r1, asr #3 - pkhtb r8, r8, r6, asr #19 - uxtb16 lr, r11, ror #8 - qadd16 r9, r9, lr - uxtb16 lr, r11 - qadd16 r8, r8, lr - usat16 r9, #8, r9 - usat16 r8, #8, r8 - orr r9, r8, r9, lsl #8 - ldr r11, [r2, r12] ; load input from dst - mov r7, r7, lsl #16 - mov r1, r1, lsl #16 - mov r10, r10, lsl #16 - mov r6, r6, lsl #16 - mov r7, r7, asr #3 - pkhtb r7, r7, r10, asr #19 - mov r1, r1, asr #3 - pkhtb r1, r1, r6, asr #19 - uxtb16 r8, r11, ror #8 - qadd16 r7, r7, r8 - uxtb16 r8, r11 - qadd16 r1, r1, r8 - usat16 r7, #8, r7 - usat16 r1, #8, r1 - orr r1, r1, r7, lsl #8 - str r9, [r2], r12 ; store output to dst - str r1, [r2], r12 ; store output to dst - bne vp8_dequant_dc_idct_loop2_v6 - -; vpx_memset - sub r0, r0, #32 - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, #28] - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_dequant_dc_idct_add_v6| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x00004E7B -sinpi8sqrt2 DCD 0x00008A8C -c0x00040004 DCD 0x00040004 - - END diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm deleted file mode 100644 index bf8d7ddcd..000000000 --- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm +++ /dev/null @@ -1,75 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
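The NEON file being deleted here handled the "DC coefficient only" case for two 4x4 blocks per call. Its body, continuing below, leans on the fact that a DC-only inverse DCT is flat: every pixel of the block receives the same rounded value, then saturates to a byte (clip_255 below is an illustrative helper, not a library function):

    int v = (dc + 4) >> 3;              /* add r1, #4 ; asr r1, #3 */
    for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++)
            dst[r * stride + c] =
                clip_255(dst[r * stride + c] + v);   /* vaddw + vqmovun */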
-; - - - EXPORT |idct_dequant_dc_0_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void idct_dequant_dc_0_2x_neon(short *q, short *dq, -; unsigned char *dst, int stride); -; r0 *q, -; r1 *dq, -; r2 *dst -; r3 stride -; sp *dc -|idct_dequant_dc_0_2x_neon| PROC - - ; no q- or dq-coeffs, so r0 and r1 are free to use - ldr r1, [sp] ; *dc - add r12, r2, #4 - ldr r0, [r1] - - vld1.32 {d2[0]}, [r2], r3 ; lo - vld1.32 {d8[0]}, [r12], r3 ; hi - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d8[1]}, [r12], r3 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d10[0]}, [r12], r3 - vld1.32 {d4[1]}, [r2], r3 - vld1.32 {d10[1]}, [r12] - - sxth r1, r0 ; lo *dc - add r1, r1, #4 - asr r1, r1, #3 - vdup.16 q0, r1 - sxth r0, r0, ror #16 ; hi *dc - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q3, r0 - - vaddw.u8 q1, q0, d2 ; lo - vaddw.u8 q2, q0, d4 - vaddw.u8 q4, q3, d8 ; hi - vaddw.u8 q5, q3, d10 - - vqmovun.s16 d2, q1 ; lo - vqmovun.s16 d4, q2 - vqmovun.s16 d8, q4 ; hi - vqmovun.s16 d10, q5 - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r0, r2, #4 - - vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d8[0]}, [r0], r3 ; hi - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d8[1]}, [r0], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d10[0]}, [r0], r3 - vst1.32 {d4[1]}, [r2] - vst1.32 {d10[1]}, [r0] - - bx lr - - ENDP ;|idct_dequant_dc_0_2x_neon| - END diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm deleted file mode 100644 index eea41f68c..000000000 --- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm +++ /dev/null @@ -1,208 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
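The second deleted NEON file, whose body follows, carries a comment block worth keeping in mind: vqdmulh only takes signed operands and doubles the product before taking the high half, which is why sinpi8sqrt2 is stored pre-shifted (0x4546 rather than 0x8a8c). Per 16-bit lane, and ignoring the saturation corner case, the instruction computes:

    static int16_t vqdmulh_lane(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b * 2) >> 16);
    }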
-; - - - EXPORT |idct_dequant_dc_full_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_dc_full_2x_neon(short *q, short *dq, -; unsigned char *dst, int stride, short *dc); -; r0 *q, -; r1 *dq, -; r2 *dst -; r3 stride -; sp *dc -|idct_dequant_dc_full_2x_neon| PROC - push {r4} - - vld1.16 {q0, q1}, [r1] ; dq (same l/r) - vld1.16 {q2, q3}, [r0] ; l q - add r0, r0, #32 - vld1.16 {q4, q5}, [r0] ; r q - add r12, r2, #4 - - ; interleave the predictors - vld1.32 {d28[0]}, [r2], r3 ; l pre - vld1.32 {d28[1]}, [r12], r3 ; r pre - vld1.32 {d29[0]}, [r2], r3 - vld1.32 {d29[1]}, [r12], r3 - vld1.32 {d30[0]}, [r2], r3 - vld1.32 {d30[1]}, [r12], r3 - vld1.32 {d31[0]}, [r2], r3 - ldr r1, [sp, #4] ; *dc - vld1.32 {d31[1]}, [r12] - - adr r4, cospi8sqrt2minus1 ; pointer to the first constant - - ldrh r12, [r1], #2 ; lo *dc - ldrh r1, [r1] ; hi *dc - - ; dequant: q[i] = q[i] * dq[i] - vmul.i16 q2, q2, q0 - vmul.i16 q3, q3, q1 - vmul.i16 q4, q4, q0 - vmul.i16 q5, q5, q1 - - ; move dc up to neon and overwrite first element - vmov.16 d4[0], r12 - vmov.16 d8[0], r1 - - vld1.16 {d0}, [r4] - - ; q2: l0r0 q3: l8r8 - ; q4: l4r4 q5: l12r12 - vswp d5, d8 - vswp d7, d10 - - ; _CONSTANTS_ * 4,12 >> 16 - ; q6: 4 * sinpi : c1/temp1 - ; q7: 12 * sinpi : d1/temp2 - ; q8: 4 * cospi - ; q9: 12 * cospi - vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q7, q5, d0[2] - vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q9, q5, d0[0] - - vqadd.s16 q10, q2, q3 ; a1 = 0 + 8 - vqsub.s16 q11, q2, q3 ; b1 = 0 - 8 - - ; vqdmulh only accepts signed values. this was a problem because - ; our constant had the high bit set, and was treated as a negative value. - ; vqdmulh also doubles the value before it shifts by 16. we need to - ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0, - ; so we can shift the constant without losing precision. this avoids - ; shift again afterward, but also avoids the sign issue. win win! 
- ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we - ; pre-shift it - vshr.s16 q8, q8, #1 - vshr.s16 q9, q9, #1 - - ; q4: 4 + 4 * cospi : d1/temp1 - ; q5: 12 + 12 * cospi : c1/temp2 - vqadd.s16 q4, q4, q8 - vqadd.s16 q5, q5, q9 - - ; c1 = temp1 - temp2 - ; d1 = temp1 + temp2 - vqsub.s16 q2, q6, q5 - vqadd.s16 q3, q4, q7 - - ; [0]: a1+d1 - ; [1]: b1+c1 - ; [2]: b1-c1 - ; [3]: a1-d1 - vqadd.s16 q4, q10, q3 - vqadd.s16 q5, q11, q2 - vqsub.s16 q6, q11, q2 - vqsub.s16 q7, q10, q3 - - ; rotate - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - ; idct loop 2 - ; q4: l 0, 4, 8,12 r 0, 4, 8,12 - ; q5: l 1, 5, 9,13 r 1, 5, 9,13 - ; q6: l 2, 6,10,14 r 2, 6,10,14 - ; q7: l 3, 7,11,15 r 3, 7,11,15 - - ; q8: 1 * sinpi : c1/temp1 - ; q9: 3 * sinpi : d1/temp2 - ; q10: 1 * cospi - ; q11: 3 * cospi - vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q9, q7, d0[2] - vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q11, q7, d0[0] - - vqadd.s16 q2, q4, q6 ; a1 = 0 + 2 - vqsub.s16 q3, q4, q6 ; b1 = 0 - 2 - - ; see note on shifting above - vshr.s16 q10, q10, #1 - vshr.s16 q11, q11, #1 - - ; q10: 1 + 1 * cospi : d1/temp1 - ; q11: 3 + 3 * cospi : c1/temp2 - vqadd.s16 q10, q5, q10 - vqadd.s16 q11, q7, q11 - - ; q8: c1 = temp1 - temp2 - ; q9: d1 = temp1 + temp2 - vqsub.s16 q8, q8, q11 - vqadd.s16 q9, q10, q9 - - ; a1+d1 - ; b1+c1 - ; b1-c1 - ; a1-d1 - vqadd.s16 q4, q2, q9 - vqadd.s16 q5, q3, q8 - vqsub.s16 q6, q3, q8 - vqsub.s16 q7, q2, q9 - - ; +4 >> 3 (rounding) - vrshr.s16 q4, q4, #3 ; lo - vrshr.s16 q5, q5, #3 - vrshr.s16 q6, q6, #3 ; hi - vrshr.s16 q7, q7, #3 - - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; adding pre - ; input is still packed. pre was read interleaved - vaddw.u8 q4, q4, d28 - vaddw.u8 q5, q5, d29 - vaddw.u8 q6, q6, d30 - vaddw.u8 q7, q7, d31 - - vmov.i16 q14, #0 - vmov q15, q14 - vst1.16 {q14, q15}, [r0] ; write over high input - sub r0, r0, #32 - vst1.16 {q14, q15}, [r0] ; write over low input - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r1, r2, #4 ; hi - - ;saturate and narrow - vqmovun.s16 d0, q4 ; lo - vqmovun.s16 d1, q5 - vqmovun.s16 d2, q6 ; hi - vqmovun.s16 d3, q7 - - vst1.32 {d0[0]}, [r2], r3 ; lo - vst1.32 {d0[1]}, [r1], r3 ; hi - vst1.32 {d1[0]}, [r2], r3 - vst1.32 {d1[1]}, [r1], r3 - vst1.32 {d2[0]}, [r2], r3 - vst1.32 {d2[1]}, [r1], r3 - vst1.32 {d3[0]}, [r2] - vst1.32 {d3[1]}, [r1] - - pop {r4} - bx lr - - ENDP ; |idct_dequant_dc_full_2x_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b -; because the lowest bit in 0x8a8c is 0, we can pre-shift this -sinpi8sqrt2 DCD 0x4546 - - END diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index e501b9ec7..11d0e38f5 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -15,7 +15,7 @@ #include "vp8/common/reconintra4x4.h" #include "vp8/common/recon.h" #include "vp8/common/reconinter.h" -#include "dequantize.h" +#include "vp8/common/dequantize.h" #include "detokenize.h" #include "vp8/common/invtrans.h" #include "vp8/common/alloccommon.h" @@ -32,7 +32,7 @@ #endif #include "vpx_mem/vpx_mem.h" #include "vp8/common/idct.h" -#include "dequantize.h" + #include "vp8/common/threading.h" #include "decoderthreading.h" #include "dboolhuff.h" @@ -109,32 +109,12 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) #define RTCD_VTABLE(x) NULL #endif -/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it - * to dst buffer, we can write the result directly to dst buffer. 
This eliminates unnecessary copy. - */ -static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) -{ - if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) - { - RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd); - RECON_INVOKE(&pbi->common.rtcd.recon, - build_intra_predictors_mby_s)(xd); - } - else - { - vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, - xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.y_stride, xd->dst.uv_stride); - } -} - static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, unsigned int mb_idx) { - int eobtotal = 0; - int throw_residual = 0; MB_PREDICTION_MODE mode; int i; + int corruption_detected = 0; if (xd->mode_info_context->mbmi.mb_skip_coeff) { @@ -142,27 +122,51 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, } else if (!vp8dx_bool_error(xd->current_bc)) { + int eobtotal; eobtotal = vp8_decode_mb_tokens(pbi, xd); + + /* Special case: Force the loopfilter to skip when eobtotal is zero */ + xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0); } + mode = xd->mode_info_context->mbmi.mode; + + if (xd->segmentation_enabled) + mb_init_dequantizer(pbi, xd); - mode = xd->mode_info_context->mbmi.mode; +#if CONFIG_ERROR_CONCEALMENT - if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV && - !vp8dx_bool_error(xd->current_bc)) + if(pbi->ec_active) { - /* Special case: Force the loopfilter to skip when eobtotal and - * mb_skip_coeff are zero. - * */ - xd->mode_info_context->mbmi.mb_skip_coeff = 1; + int throw_residual; + /* When we have independent partitions we can apply residual even + * though other partitions within the frame are corrupt. + */ + throw_residual = (!pbi->independent_partitions && + pbi->frame_corrupt_residual); + throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc)); - skip_recon_mb(pbi, xd); - return; + if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual)) + { + /* MB with corrupt residuals or corrupt mode/motion vectors. + * Better to use the predictor as reconstruction. 
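Condensing the corruption signals that feed this branch (same logic as the code above, folded into one expression for readability; the whole check runs only when pbi->ec_active):

/* A macroblock's residual is untrustworthy when any of these hold: */
int discard_residual =
      (!pbi->independent_partitions &&
       pbi->frame_corrupt_residual)          /* earlier partition bad   */
   || vp8dx_bool_error(xd->current_bc)       /* bool decoder ran past
                                                the end of its buffer   */
   || (mb_idx >= pbi->mvs_corrupt_from_mb);  /* modes/MVs corrupt from
                                                this MB onwards         */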
+ */ + pbi->frame_corrupt_residual = 1; + vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); + vp8_conceal_corrupt_mb(xd); + + + corruption_detected = 1; + + /* force idct to be skipped for B_PRED and use the + * prediction only for reconstruction + * */ + vpx_memset(xd->eobs, 0, 25); + } } +#endif - if (xd->segmentation_enabled) - mb_init_dequantizer(pbi, xd); /* do prediction */ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) @@ -173,113 +177,117 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, { RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mby_s)(xd); - } else { + } + else + { + /* clear out residual eob info */ + if(xd->mode_info_context->mbmi.mb_skip_coeff) + vpx_memset(xd->eobs, 0, 25); + vp8_intra_prediction_down_copy(xd); + + for (i = 0; i < 16; i++) + { + BLOCKD *b = &xd->block[i]; + int b_mode = xd->mode_info_context->bmi[i].as_mode; + + RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict) + ( *(b->base_dst) + b->dst, b->dst_stride, b_mode, + *(b->base_dst) + b->dst, b->dst_stride ); + + if (xd->eobs[i]) + { + if (xd->eobs[i] > 1) + { + DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add) + (b->qcoeff, b->dequant, + *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) + (b->qcoeff[0] * b->dequant[0], + *(b->base_dst) + b->dst, b->dst_stride, + *(b->base_dst) + b->dst, b->dst_stride); + ((int *)b->qcoeff)[0] = 0; + } + } + } } } else { vp8_build_inter_predictors_mb(xd); } - /* When we have independent partitions we can apply residual even - * though other partitions within the frame are corrupt. - */ - throw_residual = (!pbi->independent_partitions && - pbi->frame_corrupt_residual); - throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc)); + #if CONFIG_ERROR_CONCEALMENT - if (pbi->ec_active && - (mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual)) + if (corruption_detected) { - /* MB with corrupt residuals or corrupt mode/motion vectors. - * Better to use the predictor as reconstruction. 
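One step in the dequant/IDCT rework below is easy to misread: after the second-order inverse Walsh-Hadamard transform, the DC coefficient of each 4x4 luma block already holds a fully dequantized value, so the per-block dequantize must pass DC through unscaled. Hence the temporary DQC[0] = 1 override (sketch of the invariant):

/* iwalsh writes dequantized DCs into qcoeff slot 0 of every 4x4 block,
 * so inside idct_add_y_block:
 *   dqcoeff[0] = qcoeff[0] * DQC[0];   -- DQC[0] == 1, DC passes through
 *   dqcoeff[k] = qcoeff[k] * DQC[k];   -- k = 1..15, AC scaled normally
 * DQC[0] is restored afterwards because the dequant table is shared. */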
- */ - pbi->frame_corrupt_residual = 1; - vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); - vp8_conceal_corrupt_mb(xd); return; } #endif - /* dequantization and idct */ - if (mode == B_PRED) + if(!xd->mode_info_context->mbmi.mb_skip_coeff) { - for (i = 0; i < 16; i++) + /* dequantization and idct */ + if (mode != B_PRED) { - BLOCKD *b = &xd->block[i]; - int b_mode = xd->mode_info_context->bmi[i].as_mode; + short *DQC = xd->block[0].dequant; - RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict) - ( *(b->base_dst) + b->dst, b->dst_stride, b_mode, - *(b->base_dst) + b->dst, b->dst_stride ); + /* save the dc dequant constant in case it is overridden */ + short dc_dequant_temp = DQC[0]; - if (xd->eobs[i] ) + if (mode != SPLITMV) { - if (xd->eobs[i] > 1) + BLOCKD *b = &xd->block[24]; + + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) { - DEQUANT_INVOKE(&pbi->dequant, idct_add) - (b->qcoeff, b->dequant, - *(b->base_dst) + b->dst, b->dst_stride); + DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b); + + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], + xd->qcoeff); + ((int *)b->qcoeff)[0] = 0; + ((int *)b->qcoeff)[1] = 0; + ((int *)b->qcoeff)[2] = 0; + ((int *)b->qcoeff)[3] = 0; + ((int *)b->qcoeff)[4] = 0; + ((int *)b->qcoeff)[5] = 0; + ((int *)b->qcoeff)[6] = 0; + ((int *)b->qcoeff)[7] = 0; } else { - IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) - (b->qcoeff[0] * b->dequant[0], - *(b->base_dst) + b->dst, b->dst_stride, - *(b->base_dst) + b->dst, b->dst_stride); + b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], + xd->qcoeff); ((int *)b->qcoeff)[0] = 0; } + + /* override the dc dequant constant in order to preserve the + * dc components + */ + DQC[0] = 1; } - } - } - else if (mode == SPLITMV) - { - DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs); - } - else - { - BLOCKD *b = &xd->block[24]; + DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block) + (xd->qcoeff, xd->block[0].dequant, + xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); - /* do 2nd order transform on the dc block */ - if (xd->eobs[24] > 1) - { - DEQUANT_INVOKE(&pbi->dequant, block)(b); - - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - } - else - { - b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; + /* restore the dc dequant constant */ + DQC[0] = dc_dequant_temp; } - DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs, xd->block[24].diff); + DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block) + (xd->qcoeff+16*16, xd->block[16].dequant, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs+16); } - - DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block) - (xd->qcoeff+16*16, xd->block[16].dequant, - xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.uv_stride, xd->eobs+16); } - static int get_delta_q(vp8_reader *bc, int prev, int *q_update) { int ret_val = 0; @@ -476,7 +484,8 @@ static void setup_token_decoder(VP8D_COMP *pbi, const unsigned char* token_part_sizes) { vp8_reader *bool_decoder = &pbi->bc2; - int 
fragment_idx, partition_idx; + unsigned int partition_idx; + int fragment_idx; int num_token_partitions; const unsigned char *first_fragment_end = pbi->fragments[0] + pbi->fragment_sizes[0]; @@ -934,16 +943,38 @@ int vp8_decode_frame(VP8D_COMP *pbi) if (!pc->refresh_golden_frame) pc->copy_buffer_to_gf = vp8_read_literal(bc, 2); +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't copy to the golden if the bit is missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) + pc->copy_buffer_to_gf = 0; +#endif + pc->copy_buffer_to_arf = 0; if (!pc->refresh_alt_ref_frame) pc->copy_buffer_to_arf = vp8_read_literal(bc, 2); +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't copy to the alt-ref if the bit is missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) + pc->copy_buffer_to_arf = 0; +#endif + + pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc); pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc); } pc->refresh_entropy_probs = vp8_read_bit(bc); +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't refresh the probabilities if the bit is + * missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) + pc->refresh_entropy_probs = 0; +#endif if (pc->refresh_entropy_probs == 0) { vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc)); diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c index 1d4568593..ba94c58bb 100644 --- a/vp8/decoder/detokenize.c +++ b/vp8/decoder/detokenize.c @@ -15,7 +15,7 @@ #include "vpx_ports/mem.h" #include "detokenize.h" -#define BOOL_DATA UINT8 +#define BOOL_DATA unsigned char #define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) = @@ -157,10 +157,10 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); DECODE_AND_APPLYSIGN(val) \ Prob = coef_probs + (ENTROPY_NODES*2); \ if(c < 15){\ - qcoeff_ptr [ scan[c] ] = (INT16) v; \ + qcoeff_ptr [ scan[c] ] = (int16_t) v; \ ++c; \ goto DO_WHILE; }\ - qcoeff_ptr [ 15 ] = (INT16) v; \ + qcoeff_ptr [ 15 ] = (int16_t) v; \ goto BLOCK_FINISHED; @@ -172,7 +172,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); {\ range = range-split;\ value = value-bigsplit;\ - val += ((UINT16)1<<bits_count);\ + val += ((uint16_t)1<<bits_count);\ }\ else\ {\ @@ -340,12 +340,12 @@ ONE_CONTEXT_NODE_0_: if (c < 15) { - qcoeff_ptr [ scan[c] ] = (INT16) v; + qcoeff_ptr [ scan[c] ] = (int16_t) v; ++c; goto DO_WHILE; } - qcoeff_ptr [ 15 ] = (INT16) v; + qcoeff_ptr [ 15 ] = (int16_t) v; BLOCK_FINISHED: eobs[i] = c; eobtotal += c; diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c index 86fa191d3..b77d743f7 100644 --- a/vp8/decoder/error_concealment.c +++ b/vp8/decoder/error_concealment.c @@ -491,33 +491,6 @@ static void find_neighboring_blocks(MODE_INFO *mi, assert(i == 20); } -/* Calculates which reference frame type is dominating among the neighbors */ -static MV_REFERENCE_FRAME dominant_ref_frame(EC_BLOCK *neighbors) -{ - /* Default to referring to "skip" */ - MV_REFERENCE_FRAME dom_ref_frame = LAST_FRAME; - int max_ref_frame_cnt = 0; - int ref_frame_cnt[MAX_REF_FRAMES] = {0}; - int i; - /* Count neighboring reference frames */ - for (i = 0; i < NUM_NEIGHBORS; ++i) - { - if (neighbors[i].ref_frame < MAX_REF_FRAMES && - neighbors[i].ref_frame != INTRA_FRAME) - ++ref_frame_cnt[neighbors[i].ref_frame]; - } - /* Find maximum */ - for (i = 0; i < MAX_REF_FRAMES; ++i) - { - if (ref_frame_cnt[i] > max_ref_frame_cnt) - { - 
dom_ref_frame = i; - max_ref_frame_cnt = ref_frame_cnt[i]; - } - } - return dom_ref_frame; -} - /* Interpolates all motion vectors for a macroblock from the neighboring blocks' * motion vectors. */ @@ -591,7 +564,6 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, { /* Find relevant neighboring blocks */ EC_BLOCK neighbors[NUM_NEIGHBORS]; - MV_REFERENCE_FRAME dom_ref_frame; int i; /* Initialize the array. MAX_REF_FRAMES is interpreted as "doesn't exist" */ for (i = 0; i < NUM_NEIGHBORS; ++i) @@ -604,13 +576,11 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, mb_row, mb_col, mb_rows, mb_cols, mb->mode_info_stride); - /* Determine the dominant block type */ - dom_ref_frame = dominant_ref_frame(neighbors); - /* Interpolate MVs for the missing blocks - * from the dominating MVs */ - interpolate_mvs(mb, neighbors, dom_ref_frame); + /* Interpolate MVs for the missing blocks from the surrounding + * blocks which refer to the last frame. */ + interpolate_mvs(mb, neighbors, LAST_FRAME); - mb->mode_info_context->mbmi.ref_frame = dom_ref_frame; + mb->mode_info_context->mbmi.ref_frame = LAST_FRAME; mb->mode_info_context->mbmi.mode = SPLITMV; mb->mode_info_context->mbmi.uv_mode = DC_PRED; mb->mode_info_context->mbmi.partitioning = 3; diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c index 9c42bc62d..8a84e566a 100644 --- a/vp8/decoder/generic/dsystemdependent.c +++ b/vp8/decoder/generic/dsystemdependent.c @@ -10,7 +10,7 @@ #include "vpx_config.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" #include "vp8/decoder/onyxd_int.h" extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi); @@ -20,13 +20,7 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi) { /* Pure C: */ #if CONFIG_RUNTIME_CPU_DETECT - pbi->mb.rtcd = &pbi->common.rtcd; - pbi->dequant.block = vp8_dequantize_b_c; - pbi->dequant.idct_add = vp8_dequant_idct_add_c; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c; - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + pbi->mb.rtcd = &pbi->common.rtcd; #endif #if ARCH_X86 || ARCH_X86_64 diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 077954948..80648d39f 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -20,7 +20,6 @@ #include "vpx_scale/yv12extend.h" #include "vp8/common/loopfilter.h" #include "vp8/common/swapyv12buffer.h" -#include "vp8/common/g_common.h" #include "vp8/common/threading.h" #include "decoderthreading.h" #include <stdio.h> @@ -57,7 +56,7 @@ void vp8dx_initialize() } -VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) +struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf) { VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP)); @@ -117,14 +116,12 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) */ pbi->independent_partitions = 0; - return (VP8D_PTR) pbi; + return pbi; } -void vp8dx_remove_decompressor(VP8D_PTR ptr) +void vp8dx_remove_decompressor(VP8D_COMP *pbi) { - VP8D_COMP *pbi = (VP8D_COMP *) ptr; - if (!pbi) return; @@ -142,9 +139,8 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr) } -vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { - VP8D_COMP *pbi = (VP8D_COMP *) ptr; VP8_COMMON *cm = &pbi->common; int ref_fb_idx; @@ -174,9 +170,8 @@ 
vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, Y } -vpx_codec_err_t vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { - VP8D_COMP *pbi = (VP8D_COMP *) ptr; VP8_COMMON *cm = &pbi->common; int *ref_fb_ptr = NULL; int free_fb; @@ -301,19 +296,18 @@ static int swap_frame_buffers (VP8_COMMON *cm) return err; } -int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, int64_t time_stamp) +int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsigned char *source, int64_t time_stamp) { #if HAVE_ARMV7 int64_t dx_store_reg[8]; #endif - VP8D_COMP *pbi = (VP8D_COMP *) ptr; VP8_COMMON *cm = &pbi->common; int retcode = 0; /*if(pbi->ready_for_new_data == 0) return -1;*/ - if (ptr == 0) + if (pbi == 0) { return -1; } @@ -359,28 +353,38 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign pbi->fragment_sizes[0] = 0; } - if (pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0) + if (!pbi->ec_active && + pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0) { - /* This is used to signal that we are missing frames. - * We do not know if the missing frame(s) was supposed to update - * any of the reference buffers, but we act conservative and - * mark only the last buffer as corrupted. - */ - cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; - /* If error concealment is disabled we won't signal missing frames * to the decoder. */ - if (!pbi->ec_active) + if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1) { - /* Signal that we have no frame to show. */ - cm->show_frame = 0; + /* The last reference shares buffer with another reference + * buffer. Move it to its own buffer before setting it as + * corrupt, otherwise we will make multiple buffers corrupt. + */ + const int prev_idx = cm->lst_fb_idx; + cm->fb_idx_ref_cnt[prev_idx]--; + cm->lst_fb_idx = get_free_fb(cm); + vp8_yv12_copy_frame_ptr(&cm->yv12_fb[prev_idx], + &cm->yv12_fb[cm->lst_fb_idx]); + } + /* This is used to signal that we are missing frames. + * We do not know if the missing frame(s) was supposed to update + * any of the reference buffers, but we act conservative and + * mark only the last buffer as corrupted. + */ + cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; - pbi->num_fragments = 0; + /* Signal that we have no frame to show. */ + cm->show_frame = 0; - /* Nothing more to do. */ - return 0; - } + pbi->num_fragments = 0; + + /* Nothing more to do. 
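The buffer juggling in this hunk is copy-on-write in miniature: frame buffers are reference counted, and the last/golden/alt-ref indices may all point at one physical buffer, so marking a shared buffer corrupt would poison every alias. Annotated (same code as above, comments added):

/* fb_idx_ref_cnt[i] counts how many references point at buffer i */
if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1)
{
    const int prev_idx = cm->lst_fb_idx;
    cm->fb_idx_ref_cnt[prev_idx]--;         /* detach our reference     */
    cm->lst_fb_idx = get_free_fb(cm);       /* private copy, refcount 1 */
    vp8_yv12_copy_frame_ptr(&cm->yv12_fb[prev_idx],
                            &cm->yv12_fb[cm->lst_fb_idx]);
}
cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;  /* only our copy is marked  */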
*/ + return 0; } #if HAVE_ARMV7 @@ -565,10 +569,9 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign pbi->common.error.setjmp = 0; return retcode; } -int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags) +int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags) { int ret = -1; - VP8D_COMP *pbi = (VP8D_COMP *) ptr; if (pbi->ready_for_new_data == 1) return ret; diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index 519a7f2b9..cb2593b2c 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -16,7 +16,8 @@ #include "treereader.h" #include "vp8/common/onyxc_int.h" #include "vp8/common/threading.h" -#include "dequantize.h" + + #if CONFIG_ERROR_CONCEALMENT #include "ec_types.h" #endif @@ -43,7 +44,7 @@ typedef struct } DATARATE; -typedef struct VP8Decompressor +typedef struct VP8D_COMP { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -93,11 +94,6 @@ typedef struct VP8Decompressor DATARATE dr[16]; -#if CONFIG_RUNTIME_CPU_DETECT - vp8_dequant_rtcd_vtable_t dequant; -#endif - - vp8_prob prob_intra; vp8_prob prob_last; vp8_prob prob_gf; diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index eba5830d5..947b3a1c6 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -175,36 +175,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m #endif /* dequantization and idct */ - if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV) - { - BLOCKD *b = &xd->block[24]; - DEQUANT_INVOKE(&pbi->dequant, block)(b); - - /* do 2nd order transform on the dc block */ - if (xd->eobs[24] > 1) - { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - } - else - { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - } - - DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs, xd->block[24].diff); - } - else if (xd->mode_info_context->mbmi.mode == B_PRED) + if (xd->mode_info_context->mbmi.mode == B_PRED) { for (i = 0; i < 16; i++) { @@ -214,37 +185,81 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst, b->dst_stride, mb_row, mb_col, i); - if (xd->eobs[i] > 1) + if (xd->eobs[i] ) { - DEQUANT_INVOKE(&pbi->dequant, idct_add) - (b->qcoeff, b->dequant, - *(b->base_dst) + b->dst, b->dst_stride); + if (xd->eobs[i] > 1) + { + DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add) + (b->qcoeff, b->dequant, + *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) + (b->qcoeff[0] * b->dequant[0], + *(b->base_dst) + b->dst, b->dst_stride, + *(b->base_dst) + b->dst, b->dst_stride); + ((int *)b->qcoeff)[0] = 0; + } + } + } + } + else + { + short *DQC = xd->block[0].dequant; + + DECLARE_ALIGNED(16, short, local_dequant[16]); + + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + BLOCKD *b = &xd->block[24]; + + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + 
DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b); + + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], + xd->qcoeff); + ((int *)b->qcoeff)[0] = 0; + ((int *)b->qcoeff)[1] = 0; + ((int *)b->qcoeff)[2] = 0; + ((int *)b->qcoeff)[3] = 0; + ((int *)b->qcoeff)[4] = 0; + ((int *)b->qcoeff)[5] = 0; + ((int *)b->qcoeff)[6] = 0; + ((int *)b->qcoeff)[7] = 0; } else { - IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) - (b->qcoeff[0] * b->dequant[0], - *(b->base_dst) + b->dst, b->dst_stride, - *(b->base_dst) + b->dst, b->dst_stride); + b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff); ((int *)b->qcoeff)[0] = 0; } + + /* make a local copy of the dequant constants */ + vpx_memcpy(local_dequant, xd->block[0].dequant, + sizeof(local_dequant)); + + /* override the dc dequant constant */ + local_dequant[0] = 1; + + /* use the new dequant constants */ + DQC = local_dequant; } - } - else - { - DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, + + DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block) + (xd->qcoeff, DQC, xd->dst.y_buffer, xd->dst.y_stride, xd->eobs); } - DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block) + DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block) (xd->qcoeff+16*16, xd->block[16].dequant, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.uv_stride, xd->eobs+16); } - static THREAD_FUNCTION thread_decoding_proc(void *p_data) { int ithread = ((DECODETHREAD_DATA *)p_data)->ithread; diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c index 443150483..27bf5ddbd 100644 --- a/vp8/decoder/x86/x86_dsystemdependent.c +++ b/vp8/decoder/x86/x86_dsystemdependent.c @@ -13,50 +13,7 @@ #include "vpx_ports/x86.h" #include "vp8/decoder/onyxd_int.h" - -#if HAVE_MMX -void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); - -void vp8_dequantize_b_mmx(BLOCKD *d) -{ - short *sq = (short *) d->qcoeff; - short *dq = (short *) d->dqcoeff; - short *q = (short *) d->dequant; - vp8_dequantize_b_impl_mmx(sq, dq, q); -} -#endif - void vp8_arch_x86_decode_init(VP8D_COMP *pbi) { -#if CONFIG_RUNTIME_CPU_DETECT - int flags = x86_simd_caps(); - - /* Note: - * - * This platform can be built without runtime CPU detection as well. If - * you modify any of the function mappings present in this file, be sure - * to also update them in static mapings (<arch>/filename_<arch>.h) - */ - /* Override default functions with fastest ones for this CPU. 
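For context, the pattern being deleted here (and re-homed under vp8/common now that dequantization is shared code) is libvpx's usual runtime CPU dispatch. A minimal sketch with a simplified vtable; the real one lives in the common rtcd structures:

/* Fill a vtable with portable defaults, then override with SIMD
 * versions the running CPU supports. */
typedef void (*idct_add_fn)(short *q, short *dq,
                            unsigned char *dst, int stride);

typedef struct { idct_add_fn idct_add; } dequant_vtable_t;

static void dequant_rtcd_init(dequant_vtable_t *t, int caps)
{
    t->idct_add = vp8_dequant_idct_add_c;         /* C default    */
#if HAVE_MMX
    if (caps & HAS_MMX)
        t->idct_add = vp8_dequant_idct_add_mmx;   /* MMX override */
#endif
}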
*/ -#if HAVE_MMX - if (flags & HAS_MMX) - { - pbi->dequant.block = vp8_dequantize_b_mmx; - pbi->dequant.idct_add = vp8_dequant_idct_add_mmx; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx; - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx; - } -#endif -#if HAVE_SSE2 - if (flags & HAS_SSE2) - { - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; - } -#endif -#endif } diff --git a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm index 30513f912..5b7e8f66f 100644 --- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm +++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm @@ -13,6 +13,7 @@ EXPORT |vp8_encode_bool| EXPORT |vp8_stop_encode| EXPORT |vp8_encode_value| + IMPORT |vp8_validate_buffer_arm| INCLUDE asm_enc_offsets.asm @@ -22,6 +23,20 @@ AREA |.text|, CODE, READONLY + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + ; r0 BOOL_CODER *br ; r1 unsigned char *source ; r2 unsigned char *source_end @@ -43,7 +58,7 @@ ; r1 int bit ; r2 int probability |vp8_encode_bool| PROC - push {r4-r9, lr} + push {r4-r10, lr} mov r4, r2 @@ -106,6 +121,9 @@ token_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r1, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r9, r1 ; validate_buffer at pos + strb r7, [r9, r4] ; w->buffer[w->pos++] token_count_lt_zero @@ -114,7 +132,7 @@ token_count_lt_zero str r2, [r0, #vp8_writer_lowvalue] str r5, [r0, #vp8_writer_range] str r3, [r0, #vp8_writer_count] - pop {r4-r9, pc} + pop {r4-r10, pc} ENDP ; r0 BOOL_CODER *br @@ -179,6 +197,9 @@ token_high_bit_not_set_se bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r1, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r9, r1 ; validate_buffer at pos + strb r7, [r9, r4] ; w->buffer[w->pos++] token_count_lt_zero_se @@ -198,7 +219,7 @@ token_count_lt_zero_se ; r1 int data ; r2 int bits |vp8_encode_value| PROC - push {r4-r11, lr} + push {r4-r12, lr} mov r10, r2 @@ -270,6 +291,9 @@ token_high_bit_not_set_ev bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r9, r11 ; validate_buffer at pos + strb r7, [r9, r4] ; w->buffer[w->pos++] token_count_lt_zero_ev @@ -281,7 +305,7 @@ token_count_lt_zero_ev str r2, [r0, #vp8_writer_lowvalue] str r5, [r0, #vp8_writer_range] str r3, [r0, #vp8_writer_count] - pop {r4-r11, pc} + pop {r4-r12, pc} ENDP END diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm index 933717c63..a1cd46704 100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm @@ -10,6 +10,7 @@ EXPORT |vp8cx_pack_tokens_armv5| + IMPORT |vp8_validate_buffer_arm| INCLUDE asm_enc_offsets.asm @@ -19,6 +20,22 @@ AREA |.text|, CODE, READONLY + + ; macro for validating write buffer position + ; needs vp8_writer in r0 + 
; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + + ; r0 vp8_writer *w ; r1 const TOKENEXTRA *p ; r2 int xcount @@ -26,11 +43,11 @@ ; s0 vp8_extra_bits ; s1 vp8_coef_tree |vp8cx_pack_tokens_armv5| PROC - push {r4-r11, lr} + push {r4-r12, lr} + sub sp, sp, #16 ; Add size of xcount * sizeof (TOKENEXTRA) to get stop ; sizeof (TOKENEXTRA) is 8 - sub sp, sp, #12 add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA) str r2, [sp, #0] str r3, [sp, #8] ; save vp8_coef_encodings @@ -57,7 +74,7 @@ while_p_lt_stop subne r8, r8, #1 ; --n rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #52] ; vp8_coef_tree + ldr r10, [sp, #60] ; vp8_coef_tree ; v is kept in r12 during the token pack loop lsl r12, r6, r4 ; r12 = v << 32 - n @@ -128,12 +145,15 @@ token_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++] ; r10 is used earlier in the loop, but r10 is used as ; temp variable here. So after r10 is used, reload ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #52] ; vp8_coef_tree + ldr r10, [sp, #60] ; vp8_coef_tree token_count_lt_zero lsl r2, r2, r6 ; lowvalue <<= shift @@ -142,7 +162,7 @@ token_count_lt_zero bne token_loop ldrb r6, [r1, #tokenextra_token] ; t - ldr r7, [sp, #48] ; vp8_extra_bits + ldr r7, [sp, #56] ; vp8_extra_bits ; Add t * sizeof (vp8_extra_bit_struct) to get the desired ; element. Here vp8_extra_bit_struct == 16 add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t @@ -223,6 +243,9 @@ extra_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) ldr r10, [sp, #4] ; b->tree extra_count_lt_zero @@ -271,7 +294,10 @@ end_high_bit_not_set lsr r6, r2, #24 ; lowvalue >> 24 add r12, r4, #1 ; w->pos++ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] + str r12, [r0, #vp8_writer_pos] + + VALIDATE_POS r7, r12 ; validate_buffer at pos + strb r6, [r7, r4] end_count_zero skip_extra_bits @@ -284,8 +310,8 @@ check_p_lt_stop str r2, [r0, #vp8_writer_lowvalue] str r5, [r0, #vp8_writer_range] str r3, [r0, #vp8_writer_count] - add sp, sp, #12 - pop {r4-r11, pc} + add sp, sp, #16 + pop {r4-r12, pc} ENDP END diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm index 82bf71f35..1fa5e6c22 100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm @@ -10,6 +10,7 @@ EXPORT |vp8cx_pack_mb_row_tokens_armv5| + IMPORT |vp8_validate_buffer_arm| INCLUDE asm_enc_offsets.asm @@ -19,6 +20,21 @@ AREA |.text|, CODE, READONLY + + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + ; r0 VP8_COMP *cpi ; r1 vp8_writer *w ; r2 vp8_coef_encodings @@ -26,7 +42,7 @@ ; s0 
vp8_coef_tree |vp8cx_pack_mb_row_tokens_armv5| PROC - push {r4-r11, lr} + push {r4-r12, lr} sub sp, sp, #24 ; Compute address of cpi->common.mb_rows @@ -79,7 +95,7 @@ while_p_lt_stop subne r8, r8, #1 ; --n rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #60] ; vp8_coef_tree + ldr r10, [sp, #64] ; vp8_coef_tree ; v is kept in r12 during the token pack loop lsl r12, r6, r4 ; r12 = v << 32 - n @@ -150,12 +166,15 @@ token_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++] ; r10 is used earlier in the loop, but r10 is used as ; temp variable here. So after r10 is used, reload ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #60] ; vp8_coef_tree + ldr r10, [sp, #64] ; vp8_coef_tree token_count_lt_zero lsl r2, r2, r6 ; lowvalue <<= shift @@ -245,6 +264,9 @@ extra_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) ldr r10, [sp, #4] ; b->tree extra_count_lt_zero @@ -293,7 +315,10 @@ end_high_bit_not_set lsr r6, r2, #24 ; lowvalue >> 24 add r12, r4, #1 ; w->pos++ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] + str r12, [r0, #vp8_writer_pos] + + VALIDATE_POS r7, r12 ; validate_buffer at pos + strb r6, [r7, r4] end_count_zero skip_extra_bits @@ -314,7 +339,7 @@ check_p_lt_stop str r5, [r0, #vp8_writer_range] str r3, [r0, #vp8_writer_count] add sp, sp, #24 - pop {r4-r11, pc} + pop {r4-r12, pc} ENDP _VP8_COMP_common_ diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm index c061b2fab..3a183aa2f 100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm @@ -10,6 +10,7 @@ EXPORT |vp8cx_pack_tokens_into_partitions_armv5| + IMPORT |vp8_validate_buffer_arm| INCLUDE asm_enc_offsets.asm @@ -19,17 +20,31 @@ AREA |.text|, CODE, READONLY + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + ; r0 VP8_COMP *cpi ; r1 unsigned char *cx_data -; r2 int num_part -; r3 *size +; r2 const unsigned char *cx_data_end +; r3 int num_part ; s0 vp8_coef_encodings ; s1 vp8_extra_bits, -; s2 const vp8_tree_index *, +; s2 const vp8_tree_index * |vp8cx_pack_tokens_into_partitions_armv5| PROC - push {r4-r11, lr} - sub sp, sp, #44 + push {r4-r12, lr} + sub sp, sp, #40 ; Compute address of cpi->common.mb_rows ldr r4, _VP8_COMP_common_ @@ -39,31 +54,26 @@ ldr r5, [r4, r6] ; load up mb_rows str r5, [sp, #36] ; save mb_rows - str r1, [sp, #24] ; save cx_data - str r2, [sp, #20] ; save num_part - str r3, [sp, #8] ; save *size - - ; *size = 3*(num_part -1 ); - sub r2, r2, #1 ; num_part - 1 - add r2, r2, r2, lsl #1 ; 3*(num_part - 1) - str r2, [r3] - - add r2, r2, r1 ; cx_data + *size - str r2, [sp, #40] ; ptr + str r1, [sp, #24] ; save ptr = cx_data + str r3, [sp, #20] ; save num_part + str r2, [sp, #8] ; save cx_data_end ldr r4, _VP8_COMP_tplist_ add r4, r0, r4 ldr r7, [r4, #0] ; dereference 
cpi->tp_list str r7, [sp, #32] ; store start of cpi->tp_list - ldr r11, _VP8_COMP_bc2_ ; load up vp8_writer out of cpi + ldr r11, _VP8_COMP_bc_ ; load up vp8_writer out of cpi add r0, r0, r11 mov r11, #0 str r11, [sp, #28] ; i numparts_loop - ldr r10, [sp, #40] ; ptr + ldr r2, _vp8_writer_sz_ ; load up sizeof(vp8_writer) + add r0, r2 ; bc[i + 1] + + ldr r10, [sp, #24] ; ptr ldr r5, [sp, #36] ; move mb_rows to the counting section subs r5, r5, r11 ; move start point with each partition ; mb_rows starts at i @@ -72,6 +82,10 @@ numparts_loop ; Reset all of the VP8 Writer data for each partition that ; is processed. ; start_encode + + ldr r3, [sp, #8] + str r3, [r0, #vp8_writer_buffer_end] + mov r2, #0 ; vp8_writer_lowvalue mov r5, #255 ; vp8_writer_range mvn r3, #23 ; vp8_writer_count @@ -182,6 +196,9 @@ token_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++] ; r10 is used earlier in the loop, but r10 is used as @@ -277,6 +294,9 @@ extra_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) ldr r10, [sp, #4] ; b->tree extra_count_lt_zero @@ -320,12 +340,15 @@ end_high_bit_not_set bne end_count_zero ldr r4, [r0, #vp8_writer_pos] - mvn r3, #7 + mvn r3, #7 ; count = -8 ldr r7, [r0, #vp8_writer_buffer] lsr r6, r2, #24 ; lowvalue >> 24 add r12, r4, #1 ; w->pos++ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] + str r12, [r0, #vp8_writer_pos] + + VALIDATE_POS r7, r12 ; validate_buffer at pos + strb r6, [r7, r4] end_count_zero skip_extra_bits @@ -401,6 +424,9 @@ token_high_bit_not_set_se bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++] token_count_lt_zero_se @@ -409,33 +435,10 @@ token_count_lt_zero_se subs r12, r12, #1 bne stop_encode_loop - ldr r10, [sp, #8] ; *size - ldr r11, [r10] ldr r4, [r0, #vp8_writer_pos] ; w->pos - add r11, r11, r4 ; *size += w->pos - str r11, [r10] - - ldr r9, [sp, #20] ; num_parts - sub r9, r9, #1 - ldr r10, [sp, #28] ; i - cmp r10, r9 ; if(i<(num_part - 1)) - bge skip_write_partition - - ldr r12, [sp, #40] ; ptr + ldr r12, [sp, #24] ; ptr add r12, r12, r4 ; ptr += w->pos - str r12, [sp, #40] - - ldr r9, [sp, #24] ; cx_data - mov r8, r4, asr #8 - strb r4, [r9, #0] - strb r8, [r9, #1] - mov r4, r4, asr #16 - strb r4, [r9, #2] - - add r9, r9, #3 ; cx_data += 3 - str r9, [sp, #24] - -skip_write_partition + str r12, [sp, #24] ldr r11, [sp, #28] ; i ldr r10, [sp, #20] ; num_parts @@ -451,9 +454,8 @@ skip_write_partition cmp r10, r11 bgt numparts_loop - - add sp, sp, #44 - pop {r4-r11, pc} + add sp, sp, #40 + pop {r4-r12, pc} ENDP _VP8_COMP_common_ @@ -462,7 +464,9 @@ _VP8_COMMON_MBrows_ DCD vp8_common_mb_rows _VP8_COMP_tplist_ DCD vp8_comp_tplist -_VP8_COMP_bc2_ - DCD vp8_comp_bc2 +_VP8_COMP_bc_ + DCD vp8_comp_bc +_vp8_writer_sz_ + DCD vp8_writer_sz END diff --git a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm index 0ca74387b..f329f8f73 100644 --- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm +++ b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm @@ -72,22 +72,23 @@ loop_block ; r0 short *diff ; r1 unsigned char 
*usrc ; r2 unsigned char *vsrc -; r3 unsigned char *pred -; stack int stride +; r3 int src_stride +; sp unsigned char *upred +; sp unsigned char *vpred +; sp int pred_stride |vp8_subtract_mbuv_armv6| PROC - stmfd sp!, {r4-r12, lr} + stmfd sp!, {r4-r11} add r0, r0, #512 ; set *diff point to Cb - add r3, r3, #256 ; set *pred point to Cb - mov r4, #8 ; loop count - ldr r5, [sp, #40] ; stride + ldr r5, [sp, #32] ; upred + ldr r12, [sp, #40] ; pred_stride ; Subtract U block loop_u - ldr r6, [r1] ; src (A) - ldr r7, [r3], #4 ; pred (A) + ldr r6, [r1] ; usrc (A) + ldr r7, [r5] ; upred (A) uxtb16 r8, r6 ; [s2 | s0] (A) uxtb16 r9, r7 ; [p2 | p0] (A) @@ -97,8 +98,8 @@ loop_u usub16 r6, r8, r9 ; [d2 | d0] (A) usub16 r7, r10, r11 ; [d3 | d1] (A) - ldr r10, [r1, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) + ldr r10, [r1, #4] ; usrc (B) + ldr r11, [r5, #4] ; upred (B) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) @@ -114,7 +115,8 @@ loop_u usub16 r6, r8, r9 ; [d2 | d0] (B) usub16 r7, r10, r11 ; [d3 | d1] (B) - add r1, r1, r5 ; update usrc pointer + add r1, r1, r3 ; update usrc pointer + add r5, r5, r12 ; update upred pointer pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) @@ -125,12 +127,13 @@ loop_u bne loop_u + ldr r5, [sp, #36] ; vpred mov r4, #8 ; loop count ; Subtract V block loop_v - ldr r6, [r2] ; src (A) - ldr r7, [r3], #4 ; pred (A) + ldr r6, [r2] ; vsrc (A) + ldr r7, [r5] ; vpred (A) uxtb16 r8, r6 ; [s2 | s0] (A) uxtb16 r9, r7 ; [p2 | p0] (A) @@ -140,8 +143,8 @@ loop_v usub16 r6, r8, r9 ; [d2 | d0] (A) usub16 r7, r10, r11 ; [d3 | d1] (A) - ldr r10, [r2, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) + ldr r10, [r2, #4] ; vsrc (B) + ldr r11, [r5, #4] ; vpred (B) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) @@ -157,7 +160,8 @@ loop_v usub16 r6, r8, r9 ; [d2 | d0] (B) usub16 r7, r10, r11 ; [d3 | d1] (B) - add r2, r2, r5 ; update vsrc pointer + add r2, r2, r3 ; update vsrc pointer + add r5, r5, r12 ; update vpred pointer pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) @@ -168,23 +172,25 @@ loop_v bne loop_v - ldmfd sp!, {r4-r12, pc} + ldmfd sp!, {r4-r11} + bx lr ENDP ; r0 short *diff ; r1 unsigned char *src -; r2 unsigned char *pred -; r3 int stride +; r2 int src_stride +; r3 unsigned char *pred +; sp int pred_stride |vp8_subtract_mby_armv6| PROC stmfd sp!, {r4-r11} - + ldr r12, [sp, #32] ; pred_stride mov r4, #16 loop ldr r6, [r1] ; src (A) - ldr r7, [r2], #4 ; pred (A) + ldr r7, [r3] ; pred (A) uxtb16 r8, r6 ; [s2 | s0] (A) uxtb16 r9, r7 ; [p2 | p0] (A) @@ -195,7 +201,7 @@ loop usub16 r7, r10, r11 ; [d3 | d1] (A) ldr r10, [r1, #4] ; src (B) - ldr r11, [r2], #4 ; pred (B) + ldr r11, [r3, #4] ; pred (B) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) @@ -212,7 +218,7 @@ loop usub16 r7, r10, r11 ; [d3 | d1] (B) ldr r10, [r1, #8] ; src (C) - ldr r11, [r2], #4 ; pred (C) + ldr r11, [r3, #8] ; pred (C) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) @@ -229,10 +235,10 @@ loop usub16 r7, r10, r11 ; [d3 | d1] (C) ldr r10, [r1, #12] ; src (D) - ldr r11, [r2], #4 ; pred (D) + ldr r11, [r3, #12] ; pred (D) - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) str r8, [r0], #4 ; diff (C) uxtb16 r8, r10 ; [s2 | s0] (D) @@ -245,7 +251,8 @@ loop usub16 r6, r8, r9 ; [d2 | d0] 
(D) usub16 r7, r10, r11 ; [d3 | d1] (D) - add r1, r1, r3 ; update src pointer + add r1, r1, r2 ; update src pointer + add r3, r3, r12 ; update pred pointer pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D) @@ -257,7 +264,7 @@ loop bne loop ldmfd sp!, {r4-r11} - mov pc, lr + bx lr ENDP diff --git a/vp8/encoder/arm/boolhuff_arm.c b/vp8/encoder/arm/boolhuff_arm.c index 9089663ca..17a941bfc 100644 --- a/vp8/encoder/arm/boolhuff_arm.c +++ b/vp8/encoder/arm/boolhuff_arm.c @@ -10,7 +10,7 @@ #include "vp8/encoder/boolhuff.h" -#include "vp8/common/blockd.h" +#include "vpx/internal/vpx_codec_internal.h" const unsigned int vp8_prob_cost[256] = { @@ -32,3 +32,10 @@ const unsigned int vp8_prob_cost[256] = 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1 }; +int vp8_validate_buffer_arm(const unsigned char *start, + size_t len, + const unsigned char *end, + struct vpx_internal_error_info *error) +{ + return validate_buffer(start, len, end, error); +} diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm index 68c295062..91a328c29 100644 --- a/vp8/encoder/arm/neon/subtract_neon.asm +++ b/vp8/encoder/arm/neon/subtract_neon.asm @@ -61,19 +61,24 @@ ;========================================== -;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride) +;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride +; unsigned char *pred, int pred_stride) |vp8_subtract_mby_neon| PROC + push {r4-r7} mov r12, #4 + ldr r4, [sp, #16] ; pred_stride + mov r6, #32 ; "diff" stride x2 + add r5, r0, #16 ; second diff pointer subtract_mby_loop - vld1.8 {q0}, [r1], r3 ;load src - vld1.8 {q1}, [r2]! ;load pred - vld1.8 {q2}, [r1], r3 - vld1.8 {q3}, [r2]! - vld1.8 {q4}, [r1], r3 - vld1.8 {q5}, [r2]! - vld1.8 {q6}, [r1], r3 - vld1.8 {q7}, [r2]! + vld1.8 {q0}, [r1], r2 ;load src + vld1.8 {q1}, [r3], r4 ;load pred + vld1.8 {q2}, [r1], r2 + vld1.8 {q3}, [r3], r4 + vld1.8 {q4}, [r1], r2 + vld1.8 {q5}, [r3], r4 + vld1.8 {q6}, [r1], r2 + vld1.8 {q7}, [r3], r4 vsubl.u8 q8, d0, d2 vsubl.u8 q9, d1, d3 @@ -84,46 +89,53 @@ subtract_mby_loop vsubl.u8 q14, d12, d14 vsubl.u8 q15, d13, d15 - vst1.16 {q8}, [r0]! ;store diff - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! + vst1.16 {q8}, [r0], r6 ;store diff + vst1.16 {q9}, [r5], r6 + vst1.16 {q10}, [r0], r6 + vst1.16 {q11}, [r5], r6 + vst1.16 {q12}, [r0], r6 + vst1.16 {q13}, [r5], r6 + vst1.16 {q14}, [r0], r6 + vst1.16 {q15}, [r5], r6 subs r12, r12, #1 bne subtract_mby_loop + pop {r4-r7} bx lr ENDP ;================================= -;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) + |vp8_subtract_mbuv_neon| PROC - ldr r12, [sp] + push {r4-r7} + ldr r4, [sp, #16] ; upred + ldr r5, [sp, #20] ; vpred + ldr r6, [sp, #24] ; pred_stride + add r0, r0, #512 ; short *udiff = diff + 256; + mov r12, #32 ; "diff" stride x2 + add r7, r0, #16 ; second diff pointer ;u - add r0, r0, #512 ; short *udiff = diff + 256; - add r3, r3, #256 ; unsigned char *upred = pred + 256; - - vld1.8 {d0}, [r1], r12 ;load src - vld1.8 {d1}, [r3]! ;load pred - vld1.8 {d2}, [r1], r12 - vld1.8 {d3}, [r3]! - vld1.8 {d4}, [r1], r12 - vld1.8 {d5}, [r3]! 
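Back to boolhuff_arm.c above for a moment: vp8_validate_buffer_arm exists only so the hand-written assembly can reach the inline validate_buffer() helper through a plain linker symbol. The helper itself is not shown in this diff; a plausible shape, stated as an assumption:

/* Assumed body of the inline helper wrapped above (not in this diff).
 * vpx_internal_error() longjmps to the codec's error handler when one
 * is armed, so a failed check aborts the encode. */
static int validate_buffer(const unsigned char *start, size_t len,
                           const unsigned char *end,
                           struct vpx_internal_error_info *error)
{
    if (start + len > start && start + len <= end)
        return 1;                    /* the write fits in the buffer */

    vpx_internal_error(error, VPX_CODEC_CORRUPT_FRAME,
                       "Truncated packet or corrupt partition");
    return 0;
}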
- vld1.8 {d6}, [r1], r12 - vld1.8 {d7}, [r3]! - vld1.8 {d8}, [r1], r12 - vld1.8 {d9}, [r3]! - vld1.8 {d10}, [r1], r12 - vld1.8 {d11}, [r3]! - vld1.8 {d12}, [r1], r12 - vld1.8 {d13}, [r3]! - vld1.8 {d14}, [r1], r12 - vld1.8 {d15}, [r3]! + vld1.8 {d0}, [r1], r3 ;load usrc + vld1.8 {d1}, [r4], r6 ;load upred + vld1.8 {d2}, [r1], r3 + vld1.8 {d3}, [r4], r6 + vld1.8 {d4}, [r1], r3 + vld1.8 {d5}, [r4], r6 + vld1.8 {d6}, [r1], r3 + vld1.8 {d7}, [r4], r6 + vld1.8 {d8}, [r1], r3 + vld1.8 {d9}, [r4], r6 + vld1.8 {d10}, [r1], r3 + vld1.8 {d11}, [r4], r6 + vld1.8 {d12}, [r1], r3 + vld1.8 {d13}, [r4], r6 + vld1.8 {d14}, [r1], r3 + vld1.8 {d15}, [r4], r6 vsubl.u8 q8, d0, d1 vsubl.u8 q9, d2, d3 @@ -134,32 +146,32 @@ subtract_mby_loop vsubl.u8 q14, d12, d13 vsubl.u8 q15, d14, d15 - vst1.16 {q8}, [r0]! ;store diff - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! + vst1.16 {q8}, [r0], r12 ;store diff + vst1.16 {q9}, [r7], r12 + vst1.16 {q10}, [r0], r12 + vst1.16 {q11}, [r7], r12 + vst1.16 {q12}, [r0], r12 + vst1.16 {q13}, [r7], r12 + vst1.16 {q14}, [r0], r12 + vst1.16 {q15}, [r7], r12 ;v - vld1.8 {d0}, [r2], r12 ;load src - vld1.8 {d1}, [r3]! ;load pred - vld1.8 {d2}, [r2], r12 - vld1.8 {d3}, [r3]! - vld1.8 {d4}, [r2], r12 - vld1.8 {d5}, [r3]! - vld1.8 {d6}, [r2], r12 - vld1.8 {d7}, [r3]! - vld1.8 {d8}, [r2], r12 - vld1.8 {d9}, [r3]! - vld1.8 {d10}, [r2], r12 - vld1.8 {d11}, [r3]! - vld1.8 {d12}, [r2], r12 - vld1.8 {d13}, [r3]! - vld1.8 {d14}, [r2], r12 - vld1.8 {d15}, [r3]! + vld1.8 {d0}, [r2], r3 ;load vsrc + vld1.8 {d1}, [r5], r6 ;load vpred + vld1.8 {d2}, [r2], r3 + vld1.8 {d3}, [r5], r6 + vld1.8 {d4}, [r2], r3 + vld1.8 {d5}, [r5], r6 + vld1.8 {d6}, [r2], r3 + vld1.8 {d7}, [r5], r6 + vld1.8 {d8}, [r2], r3 + vld1.8 {d9}, [r5], r6 + vld1.8 {d10}, [r2], r3 + vld1.8 {d11}, [r5], r6 + vld1.8 {d12}, [r2], r3 + vld1.8 {d13}, [r5], r6 + vld1.8 {d14}, [r2], r3 + vld1.8 {d15}, [r5], r6 vsubl.u8 q8, d0, d1 vsubl.u8 q9, d2, d3 @@ -170,16 +182,18 @@ subtract_mby_loop vsubl.u8 q14, d12, d13 vsubl.u8 q15, d14, d15 - vst1.16 {q8}, [r0]! ;store diff - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! 
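Either way the arithmetic is unchanged; only the addressing differs, since the new signatures carry separate source and prediction strides. A C reference for the luma counterpart under the new signature (a sketch mirroring the generic C implementation):

/* diff is a dense 16x16 block of shorts; src and pred are strided. */
void subtract_mby_ref(short *diff,
                      const unsigned char *src, int src_stride,
                      const unsigned char *pred, int pred_stride)
{
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
            diff[c] = (short)(src[c] - pred[c]);   /* may be negative */

        diff += 16;
        src  += src_stride;
        pred += pred_stride;
    }
}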
+ vst1.16 {q8}, [r0], r12 ;store diff + vst1.16 {q9}, [r7], r12 + vst1.16 {q10}, [r0], r12 + vst1.16 {q11}, [r7], r12 + vst1.16 {q12}, [r0], r12 + vst1.16 {q13}, [r7], r12 + vst1.16 {q14}, [r0], r12 + vst1.16 {q15}, [r7], r12 + pop {r4-r7} bx lr + ENDP END diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c index e77be9f73..7fc7473ac 100644 --- a/vp8/encoder/arm/variance_arm.c +++ b/vp8/encoder/arm/variance_arm.c @@ -11,9 +11,9 @@ #include "vpx_config.h" #include "vp8/encoder/variance.h" #include "vp8/common/filter.h" -#include "vp8/common/arm/bilinearfilter_arm.h" #if HAVE_ARMV6 +#include "vp8/common/arm/bilinearfilter_arm.h" unsigned int vp8_sub_pixel_variance8x8_armv6 ( diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c index d05dab47c..2e9ca7232 100644 --- a/vp8/encoder/asm_enc_offsets.c +++ b/vp8/encoder/asm_enc_offsets.c @@ -50,6 +50,7 @@ DEFINE(vp8_writer_count, offsetof(vp8_writer, count)); DEFINE(vp8_writer_pos, offsetof(vp8_writer, pos)); DEFINE(vp8_writer_buffer, offsetof(vp8_writer, buffer)); DEFINE(vp8_writer_buffer_end, offsetof(vp8_writer, buffer_end)); +DEFINE(vp8_writer_error, offsetof(vp8_writer, error)); DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token)); DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra)); @@ -69,7 +70,8 @@ DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, b DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist)); DEFINE(vp8_comp_common, offsetof(VP8_COMP, common)); -DEFINE(vp8_comp_bc2, offsetof(VP8_COMP, bc2)); +DEFINE(vp8_comp_bc , offsetof(VP8_COMP, bc)); +DEFINE(vp8_writer_sz , sizeof(vp8_writer)); DEFINE(tokenlist_start, offsetof(TOKENLIST, start)); DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop)); diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 748b60778..669bfad9a 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -109,7 +109,7 @@ static void update_mbintra_mode_probs(VP8_COMP *cpi) { VP8_COMMON *const x = & cpi->common; - vp8_writer *const w = & cpi->bc; + vp8_writer *const w = cpi->bc; { vp8_prob Pnew [VP8_YMODES-1]; @@ -221,6 +221,11 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) w->buffer[x] += 1; } + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -281,6 +286,11 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) w->buffer[x] += 1; } + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -329,6 +339,12 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) if (!++count) { count = -8; + + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> 24); lowvalue &= 0xffffff; } @@ -358,20 +374,21 @@ static void write_partition_size(unsigned char *cx_data, int size) } -static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, unsigned char * cx_data_end, int num_part, int *size) +static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, + unsigned char * cx_data_end, + int num_part) { int i; unsigned char *ptr = cx_data; unsigned char *ptr_end = cx_data_end; unsigned int shift; - vp8_writer *w = &cpi->bc2; - *size = 3 * (num_part - 1); - cpi->partition_sz[0] += *size; - ptr = cx_data + (*size); + vp8_writer *w; + ptr = cx_data; for (i = 0; i < 
num_part; i++) { + w = cpi->bc + i + 1; vp8_start_encode(w, ptr, ptr_end); { unsigned int split; @@ -581,17 +598,7 @@ static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, } vp8_stop_encode(w); - *size += w->pos; - - /* The first partition size is set earlier */ - cpi->partition_sz[i + 1] = w->pos; - - if (i < (num_part - 1)) - { - write_partition_size(cx_data, w->pos); - cx_data += 3; - ptr += w->pos; - } + ptr += w->pos; } } @@ -664,6 +671,11 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w) w->buffer[x] += 1; } + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -724,6 +736,11 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w) w->buffer[x] += 1; } + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -770,6 +787,12 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w) if (!++count) { count = -8; + + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> 24); lowvalue &= 0xffffff; } @@ -855,44 +878,46 @@ static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACRO } } } +void vp8_convert_rfct_to_prob(VP8_COMP *const cpi) +{ + const int *const rfct = cpi->count_mb_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + + // Calculate the probabilities used to code the ref frame based on useage + if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter))) + cpi->prob_intra_coded = 1; + + cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; + if (!cpi->prob_last_coded) + cpi->prob_last_coded = 1; + + cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; + + if (!cpi->prob_gf_coded) + cpi->prob_gf_coded = 1; + +} static void pack_inter_mode_mvs(VP8_COMP *const cpi) { VP8_COMMON *const pc = & cpi->common; - vp8_writer *const w = & cpi->bc; + vp8_writer *const w = cpi->bc; const MV_CONTEXT *mvc = pc->fc.mvc; - const int *const rfct = cpi->count_mb_ref_frame_usage; - const int rf_intra = rfct[INTRA_FRAME]; - const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; MODE_INFO *m = pc->mi, *ms; const int mis = pc->mode_info_stride; int mb_row = -1; - int prob_last_coded; - int prob_gf_coded; int prob_skip_false = 0; ms = pc->mi - 1; cpi->mb.partition_info = cpi->mb.pi; - // Calculate the probabilities to be used to code the reference frame based on actual useage this frame - if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter))) - cpi->prob_intra_coded = 1; - - prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - - if (!prob_last_coded) - prob_last_coded = 1; - - prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) - ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - - if (!prob_gf_coded) - prob_gf_coded = 1; - + vp8_convert_rfct_to_prob(cpi); #ifdef ENTROPY_STATS active_section = 1; @@ -913,8 +938,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) } vp8_write_literal(w, cpi->prob_intra_coded, 8); - vp8_write_literal(w, prob_last_coded, 8); - vp8_write_literal(w, prob_gf_coded, 8); + vp8_write_literal(w, cpi->prob_last_coded, 8); + vp8_write_literal(w, cpi->prob_gf_coded, 8); update_mbintra_mode_probs(cpi); @@ -976,11 +1001,11 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) vp8_write(w, 1, cpi->prob_intra_coded); if (rf == LAST_FRAME) - vp8_write(w, 0, prob_last_coded); + vp8_write(w, 0, cpi->prob_last_coded); else { - vp8_write(w, 1, prob_last_coded); - vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, prob_gf_coded); + vp8_write(w, 1, cpi->prob_last_coded); + vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, cpi->prob_gf_coded); } { @@ -1075,7 +1100,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) static void write_kfmodes(VP8_COMP *cpi) { - vp8_writer *const bc = & cpi->bc; + vp8_writer *const bc = cpi->bc; const VP8_COMMON *const c = & cpi->common; /* const */ MODE_INFO *m = c->mi; @@ -1181,7 +1206,7 @@ static void sum_probs_over_prev_coef_context( { for (j=0; j < PREV_COEF_CONTEXTS; ++j) { - const int tmp = out[i]; + const unsigned int tmp = out[i]; out[i] += probs[j][i]; /* check for wrap */ if (out[i] < tmp) @@ -1332,6 +1357,24 @@ static int default_coef_context_savings(VP8_COMP *cpi) return savings; } +void vp8_calc_ref_frame_costs(int *ref_frame_cost, + int prob_intra, + int prob_last, + int prob_garf + ) +{ + ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(prob_intra); + ref_frame_cost[LAST_FRAME] = vp8_cost_one(prob_intra) + + vp8_cost_zero(prob_last); + ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(prob_intra) + + vp8_cost_one(prob_last) + + vp8_cost_zero(prob_garf); + ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(prob_intra) + + vp8_cost_one(prob_last) + + vp8_cost_one(prob_garf); + +} + int vp8_estimate_entropy_savings(VP8_COMP *cpi) { int savings = 0; @@ -1339,7 +1382,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) const int *const rfct = cpi->count_mb_ref_frame_usage; const int rf_intra = rfct[INTRA_FRAME]; const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; - int new_intra, new_last, gf_last, oldtotal, newtotal; + int new_intra, new_last, new_garf, oldtotal, newtotal; int ref_frame_cost[MAX_REF_FRAMES]; vp8_clear_system_state(); //__asm emms; @@ -1351,19 +1394,11 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - gf_last = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + new_garf = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - // new costs - ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(new_intra); - ref_frame_cost[LAST_FRAME] = vp8_cost_one(new_intra) - + vp8_cost_zero(new_last); - ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(new_intra) - + vp8_cost_one(new_last) - + vp8_cost_zero(gf_last); - ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(new_intra) - + vp8_cost_one(new_last) - + vp8_cost_one(gf_last); + + vp8_calc_ref_frame_costs(ref_frame_cost,new_intra,new_last,new_garf); newtotal = rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] + @@ -1373,15 +1408,8 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) // old costs - ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded); - ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(cpi->prob_last_coded); - ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(cpi->prob_last_coded) - + vp8_cost_zero(cpi->prob_gf_coded); - ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(cpi->prob_last_coded) - + vp8_cost_one(cpi->prob_gf_coded); + vp8_calc_ref_frame_costs(ref_frame_cost,cpi->prob_intra_coded, + cpi->prob_last_coded,cpi->prob_gf_coded); oldtotal = rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] + @@ -1405,7 +1433,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) static void update_coef_probs(VP8_COMP *cpi) { int i = 0; - vp8_writer *const w = & cpi->bc; + vp8_writer *const w = cpi->bc; int savings = 0; vp8_clear_system_state(); //__asm emms; @@ -1551,7 +1579,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest int i, j; VP8_HEADER oh; VP8_COMMON *const pc = & cpi->common; - vp8_writer *const bc = & cpi->bc; + vp8_writer *const bc = cpi->bc; MACROBLOCKD *const xd = & cpi->mb.e_mbd; int extra_bytes_packed = 0; @@ -1566,6 +1594,8 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest mb_feature_data_bits = vp8_mb_feature_data_bits; + bc[0].error = &pc->error; + validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error); cx_data += 3; @@ -1614,20 +1644,20 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest // Signal whether or not Segmentation is enabled - vp8_write_bit(bc, (xd->segmentation_enabled) ? 1 : 0); + vp8_write_bit(bc, xd->segmentation_enabled); // Indicate which features are enabled if (xd->segmentation_enabled) { // Signal whether or not the segmentation map is being updated. - vp8_write_bit(bc, (xd->update_mb_segmentation_map) ? 1 : 0); - vp8_write_bit(bc, (xd->update_mb_segmentation_data) ? 1 : 0); + vp8_write_bit(bc, xd->update_mb_segmentation_map); + vp8_write_bit(bc, xd->update_mb_segmentation_data); if (xd->update_mb_segmentation_data) { signed char Data; - vp8_write_bit(bc, (xd->mb_segement_abs_delta) ? 1 : 0); + vp8_write_bit(bc, xd->mb_segement_abs_delta); // For each segmentation feature (Quant and loop filter level) for (i = 0; i < MB_LVL_MAX; i++) @@ -1684,7 +1714,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest vp8_write_literal(bc, pc->sharpness_level, 3); // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled). - vp8_write_bit(bc, (xd->mode_ref_lf_delta_enabled) ? 
1 : 0); + vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled); if (xd->mode_ref_lf_delta_enabled) { @@ -1844,7 +1874,9 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest vp8_stop_encode(bc); - oh.first_partition_length_in_bytes = cpi->bc.pos; + cx_data += bc->pos; + + oh.first_partition_length_in_bytes = cpi->bc->pos; /* update frame tag */ { @@ -1858,34 +1890,58 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest dest[2] = v >> 16; } - *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc.pos; + *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos; + cpi->partition_sz[0] = *size; if (pc->multi_token_partition != ONE_PARTITION) { - int num_part; - int asize; - num_part = 1 << pc->multi_token_partition; + int num_part = 1 << pc->multi_token_partition; + + /* partition size table at the end of first partition */ + cpi->partition_sz[0] += 3 * (num_part - 1); + *size += 3 * (num_part - 1); + + validate_buffer(cx_data, 3 * (num_part - 1), cx_data_end, + &pc->error); + + for(i = 1; i < num_part + 1; i++) + { + cpi->bc[i].error = &pc->error; + } - pack_tokens_into_partitions(cpi, cx_data + bc->pos, cx_data_end, num_part, &asize); + pack_tokens_into_partitions(cpi, cx_data + 3 * (num_part - 1), + cx_data_end, num_part); - *size += asize; + for(i = 1; i < num_part; i++) + { + cpi->partition_sz[i] = cpi->bc[i].pos; + write_partition_size(cx_data, cpi->partition_sz[i]); + cx_data += 3; + *size += cpi->partition_sz[i]; /* add to total */ + } + + /* add last partition to total size */ + cpi->partition_sz[i] = cpi->bc[i].pos; + *size += cpi->partition_sz[i]; } else { - vp8_start_encode(&cpi->bc2, cx_data + bc->pos, cx_data_end); + bc[1].error = &pc->error; + + vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end); #if CONFIG_MULTITHREAD if (cpi->b_multi_threaded) - pack_mb_row_tokens(cpi, &cpi->bc2); + pack_mb_row_tokens(cpi, &cpi->bc[1]); else #endif - pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count); + pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count); - vp8_stop_encode(&cpi->bc2); + vp8_stop_encode(&cpi->bc[1]); - *size += cpi->bc2.pos; - cpi->partition_sz[1] = cpi->bc2.pos; + *size += cpi->bc[1].pos; + cpi->partition_sz[1] = cpi->bc[1].pos; } } diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h index 8a875a5bd..9007cede0 100644 --- a/vp8/encoder/bitstream.h +++ b/vp8/encoder/bitstream.h @@ -17,23 +17,27 @@ void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount, vp8_token *, vp8_extra_bit_struct *, const vp8_tree_index *); -void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *, unsigned char *, int , int *, - vp8_token *, - vp8_extra_bit_struct *, - const vp8_tree_index *); +void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *, + unsigned char * cx_data, + const unsigned char *cx_data_end, + int num_parts, + vp8_token *, + vp8_extra_bit_struct *, + const vp8_tree_index *); void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w, vp8_token *, vp8_extra_bit_struct *, const vp8_tree_index *); # define pack_tokens(a,b,c) \ vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree) -# define pack_tokens_into_partitions(a,b,unused,c,d) \ +# define pack_tokens_into_partitions(a,b,c,d) \ vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree) # define pack_mb_row_tokens(a,b) \ vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree) #else -# define pack_tokens(a,b,c) pack_tokens_c(a,b,c) -# define 
pack_tokens_into_partitions(a,b,c,d,e) pack_tokens_into_partitions_c(a,b,c,d,e) +# define pack_tokens(a,b,c) pack_tokens_c(a,b,c) +# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d) # define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b) #endif + #endif diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index 5e5a60db7..0a74ca46d 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -45,10 +45,6 @@ typedef struct unsigned char **base_src; int src; int src_stride; - -// MV enc_mv; - int force_empty; - } BLOCK; typedef struct @@ -107,7 +103,6 @@ typedef struct int mv_row_min; int mv_row_max; - int vector_range; // Used to monitor limiting range of recent vectors to guide search. int skip; int encode_breakout; diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 8ec9e27c9..6a9ba291d 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -39,7 +39,12 @@ #define IF_RTCD(x) NULL #endif extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ; - +extern void vp8_calc_ref_frame_costs(int *ref_frame_cost, + int prob_intra, + int prob_last, + int prob_garf + ); +extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi); extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); extern void vp8_auto_select_speed(VP8_COMP *cpi); extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, @@ -49,8 +54,8 @@ extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, int count); void vp8_build_block_offsets(MACROBLOCK *x); void vp8_setup_block_ptrs(MACROBLOCK *x); -int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset); -int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); +int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset, int mb_row, int mb_col); +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col); static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x ); #ifdef MODE_STATS @@ -475,14 +480,14 @@ void encode_mb_row(VP8_COMP *cpi, if (cm->frame_type == KEY_FRAME) { - *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp); + *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp, mb_row, mb_col); #ifdef MODE_STATS y_modes[xd->mbmi.mode] ++; #endif } else { - *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset); + *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col); #ifdef MODE_STATS inter_y_modes[xd->mbmi.mode] ++; @@ -590,8 +595,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) // Activity map pointer x->mb_activity_ptr = cpi->mb_activity_map; - x->vector_range = 32; - x->act_zbin_adj = 0; x->partition_info = x->pi; @@ -636,55 +639,23 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) vpx_memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols); - xd->ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded); - // Special case treatment when GF and ARF are not sensible options for reference if (cpi->ref_frame_flags == VP8_LAST_FLAG) - { - xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(255); - xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(255) - + vp8_cost_zero(128); - xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(255) - + vp8_cost_one(128); - } + 
vp8_calc_ref_frame_costs(xd->ref_frame_cost, + cpi->prob_intra_coded,255,128); else if ((cpi->oxcf.number_of_layers > 1) && (cpi->ref_frame_flags == VP8_GOLD_FLAG)) - { - xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(1); - xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(1) - + vp8_cost_zero(255); - xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(1) - + vp8_cost_one(255); - } + vp8_calc_ref_frame_costs(xd->ref_frame_cost, + cpi->prob_intra_coded,1,255); else if ((cpi->oxcf.number_of_layers > 1) && (cpi->ref_frame_flags == VP8_ALT_FLAG)) - { - xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(1); - xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(1) - + vp8_cost_zero(1); - xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(1) - + vp8_cost_one(1); - } + vp8_calc_ref_frame_costs(xd->ref_frame_cost, + cpi->prob_intra_coded,1,1); else - { - xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(cpi->prob_last_coded); - xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(cpi->prob_last_coded) - + vp8_cost_zero(cpi->prob_gf_coded); - xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(cpi->prob_last_coded) - + vp8_cost_one(cpi->prob_gf_coded); - } + vp8_calc_ref_frame_costs(xd->ref_frame_cost, + cpi->prob_intra_coded, + cpi->prob_last_coded, + cpi->prob_gf_coded); xd->fullpixel_mask = 0xffffffff; if(cm->full_pixel) @@ -966,31 +937,7 @@ void vp8_encode_frame(VP8_COMP *cpi) if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) || (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame))) { - const int *const rfct = cpi->count_mb_ref_frame_usage; - const int rf_intra = rfct[INTRA_FRAME]; - const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; - - if ((rf_intra + rf_inter) > 0) - { - cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter); - - if (cpi->prob_intra_coded < 1) - cpi->prob_intra_coded = 1; - - if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active) - { - cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - - if (cpi->prob_last_coded < 1) - cpi->prob_last_coded = 1; - - cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) - ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - - if (cpi->prob_gf_coded < 1) - cpi->prob_gf_coded = 1; - } - } + vp8_convert_rfct_to_prob(cpi); } #if 0 @@ -1142,8 +1089,10 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x ) #endif } -int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, + int mb_row, int mb_col) { + MACROBLOCKD *xd = &x->e_mbd; int rate; if (cpi->sf.RD && cpi->compressor_speed != 2) @@ -1163,14 +1112,17 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x); vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x); + sum_intra_stats(cpi, x); vp8_tokenize_mb(cpi, &x->e_mbd, t); - if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED) - vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd); - - vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd); + if (xd->mode_info_context->mbmi.mode != B_PRED) + vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd)); + DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block) + (xd->qcoeff+16*16, xd->block[16].dequant, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs+16); return rate; } #ifdef SPEEDSTATS @@ -1182,7 +1134,8 @@ extern void vp8_fix_contexts(MACROBLOCKD *x); int vp8cx_encode_inter_macroblock ( VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset + int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col ) { MACROBLOCKD *const xd = &x->e_mbd; @@ -1230,8 +1183,10 @@ int vp8cx_encode_inter_macroblock } else + { vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, - &distortion, &intra_error); + &distortion, &intra_error, mb_row, mb_col); + } cpi->prediction_error += distortion; cpi->intra_error += intra_error; @@ -1345,12 +1300,14 @@ int vp8cx_encode_inter_macroblock if (!x->skip) { vp8_tokenize_mb(cpi, xd, t); - if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED) - { - vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct), - &x->e_mbd); - } - vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd); + + if (xd->mode_info_context->mbmi.mode != B_PRED) + vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd)); + + DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block) + (xd->qcoeff+16*16, xd->block[16].dequant, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs+16); } else { diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index d89d74e5e..16393a1ff 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -18,7 +18,6 @@ #include "vp8/common/invtrans.h" #include "vp8/common/recon.h" #include "dct.h" -#include "vp8/common/g_common.h" #include "encodeintra.h" @@ -45,7 +44,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) vp8_encode_intra16x16mby(rtcd, x); - vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(&cpi->common.rtcd)); } else { @@ -77,8 +76,17 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, x->quantize_b(be, b); - vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16); - + if (*b->eob > 1) + { + IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct16)(b->dqcoeff, + b->predictor, 16, *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct1_scalar_add) + 
(b->dqcoeff[0], b->predictor, 16, *(b->base_dst) + b->dst, + b->dst_stride); + } } void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb) @@ -96,11 +104,12 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb) void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { BLOCK *b = &x->block[0]; + MACROBLOCKD *xd = &x->e_mbd; - RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd); + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby_s)(&x->e_mbd); - ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), - x->e_mbd.predictor, b->src_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submby) (x->src_diff, *(b->base_src), + b->src_stride, xd->dst.y_buffer, xd->dst.y_stride); vp8_transform_intra_mby(x); @@ -108,14 +117,17 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) if (x->optimize) vp8_optimize_mby(x, rtcd); - } void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { - RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd); + MACROBLOCKD *xd = &x->e_mbd; - ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv_s)(&x->e_mbd); + + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, + x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.uv_stride); vp8_transform_mbuv(x); @@ -123,5 +135,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) if (x->optimize) vp8_optimize_mbuv(x, rtcd); - } diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 80c32df1b..c9f755333 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -48,12 +48,12 @@ void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) } } -void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, + int src_stride, unsigned char *upred, + unsigned char *vpred, int pred_stride) { short *udiff = diff + 256; short *vdiff = diff + 320; - unsigned char *upred = pred + 256; - unsigned char *vpred = pred + 320; int r, c; @@ -65,8 +65,8 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, } udiff += 8; - upred += 8; - usrc += stride; + upred += pred_stride; + usrc += src_stride; } for (r = 0; r < 8; r++) @@ -77,12 +77,13 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, } vdiff += 8; - vpred += 8; - vsrc += stride; + vpred += pred_stride; + vsrc += src_stride; } } -void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) +void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, + unsigned char *pred, int pred_stride) { int r, c; @@ -94,8 +95,8 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, in } diff += 16; - pred += 16; - src += stride; + pred += pred_stride; + src += src_stride; } } @@ -103,8 +104,11 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { BLOCK *b = &x->block[0]; - ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride); - ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, 
submby)(x->src_diff, *(b->base_src), + b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, + x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer, + x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride); } static void build_dcblock(MACROBLOCK *x) @@ -621,7 +625,7 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { - vp8_build_inter_predictors_mb_e(&x->e_mbd); + vp8_build_inter_predictors_mb(&x->e_mbd); vp8_subtract_mb(rtcd, x); @@ -631,7 +635,6 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) if (x->optimize) optimize_mb(x, rtcd); - } /* this funciton is used by first pass only */ @@ -639,14 +642,15 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { BLOCK *b = &x->block[0]; - vp8_build_inter16x16_predictors_mby(&x->e_mbd); + vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer, + x->e_mbd.dst.y_stride); - ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), + b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride); transform_mby(x); vp8_quantize_mby(x); - vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); - + vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(rtcd->common)); } diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h index 8fa457aa8..0fa87cf68 100644 --- a/vp8/encoder/encodemb.h +++ b/vp8/encoder/encodemb.h @@ -12,6 +12,7 @@ #ifndef __INC_ENCODEMB_H #define __INC_ENCODEMB_H + #include "vpx_config.h" #include "block.h" @@ -28,11 +29,13 @@ void (sym)(BLOCK *be,BLOCKD *bd, int pitch) #define prototype_submby(sym) \ - void (sym)(short *diff, unsigned char *src, unsigned char *pred, int stride) + void (sym)(short *diff, unsigned char *src, int src_stride, \ + unsigned char *pred, int pred_stride) #define prototype_submbuv(sym) \ void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\ - unsigned char *pred, int stride) + int src_stride, unsigned char *upred, unsigned char *vpred,\ + int pred_stride) #if ARCH_X86 || ARCH_X86_64 #include "x86/encodemb_x86.h" diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index a4849c654..c122d038d 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -395,7 +395,7 @@ static void write_component_probs( void vp8_write_mvprobs(VP8_COMP *cpi) { - vp8_writer *const w = & cpi->bc; + vp8_writer *const w = cpi->bc; MV_CONTEXT *mvc = cpi->common.fc.mvc; int flags[2] = {0, 0}; #ifdef ENTROPY_STATS diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 557080dba..69655989d 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -38,7 +38,7 @@ static THREAD_FUNCTION loopfilter_thread(void *p_data) if (sem_wait(&cpi->h_event_start_lpf) == 0) { - if (cpi->b_multi_threaded == FALSE) // we're shutting down + if (cpi->b_multi_threaded == 0) // we're shutting down break; loopfilter_frame(cpi, cm); @@ -78,7 +78,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data) int *segment_counts = mbri->segment_counts; int *totalrate = &mbri->totalrate; - if (cpi->b_multi_threaded == FALSE) // we're shutting down + if (cpi->b_multi_threaded == 0) // we're shutting down break; for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) @@ -302,7 +302,6 @@ static void setup_mbby_copy(MACROBLOCK 
*mbdst, MACROBLOCK *mbsrc) z->mv_col_max = x->mv_col_max; z->mv_row_min = x->mv_row_min; z->mv_row_max = x->mv_row_max; - z->vector_range = x->vector_range ; */ z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4; @@ -343,12 +342,13 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) z->block[i].zbin = x->block[i].zbin; z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost; z->block[i].round = x->block[i].round; + z->q_index = x->q_index; + z->act_zbin_adj = x->act_zbin_adj; + z->last_act_zbin_adj = x->last_act_zbin_adj; /* z->block[i].src = x->block[i].src; */ z->block[i].src_stride = x->block[i].src_stride; - z->block[i].force_empty = x->block[i].force_empty; - } { @@ -418,8 +418,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, #endif mb->gf_active_ptr = x->gf_active_ptr; - mb->vector_range = 32; - vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts)); mbr_ei[i].totalrate = 0; diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 23e30508a..346c06f32 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -267,8 +267,8 @@ static void avg_stats(FIRSTPASS_STATS *section) // Calculate a modified Error used in distributing bits between easier and harder frames static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { - double av_err = ( cpi->twopass.total_stats->ssim_weighted_pred_err / - cpi->twopass.total_stats->count ); + double av_err = ( cpi->twopass.total_stats.ssim_weighted_pred_err / + cpi->twopass.total_stats.count ); double this_err = this_frame->ssim_weighted_pred_err; double modified_err; @@ -373,7 +373,7 @@ static int frame_max_bits(VP8_COMP *cpi) else { // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user - max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); + max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); } // Trap case where we are out of bits @@ -385,12 +385,12 @@ static int frame_max_bits(VP8_COMP *cpi) void vp8_init_first_pass(VP8_COMP *cpi) { - zero_stats(cpi->twopass.total_stats); + zero_stats(&cpi->twopass.total_stats); } void vp8_end_first_pass(VP8_COMP *cpi) { - output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats); + output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats); } static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset ) @@ -804,17 +804,17 @@ void vp8_first_pass(VP8_COMP *cpi) - cpi->source->ts_start; // don't want to do output stats with a stack variable! 
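The bitstream.c and encodeframe.c hunks above replace four hand-expanded copies of the reference-frame cost computation with the new vp8_calc_ref_frame_costs(), fed either by fixed probabilities or by vp8_convert_rfct_to_prob(), which, judging from the inline code it displaces, derives prob_intra/last/gf_coded from the macroblock reference-usage counts with a floor of 1. The cost model itself is a three-node binary tree: intra?, then last?, then golden-vs-altref. A rough self-contained sketch follows; libvpx uses the fixed-point table behind vp8_cost_zero()/vp8_cost_one(), and this sketch substitutes -log2() so it runs anywhere (cost0/cost1 and the example probabilities are invented here, not libvpx names).

    /* Sketch of the ref-frame signalling cost tree factored out above.
     * A VP8 probability p in [1,255] means the zero branch is taken with
     * probability p/256; -log2() stands in for the fixed-point cost table. */
    #include <math.h>
    #include <stdio.h>

    static double cost0(int p) { return -log2(p / 256.0); }         /* code a 0 */
    static double cost1(int p) { return -log2((256 - p) / 256.0); } /* code a 1 */

    int main(void)
    {
        int p_intra = 63, p_last = 128, p_garf = 128;   /* example values only */
        printf("intra  %.2f bits\n", cost0(p_intra));
        printf("last   %.2f bits\n", cost1(p_intra) + cost0(p_last));
        printf("golden %.2f bits\n", cost1(p_intra) + cost1(p_last) + cost0(p_garf));
        printf("altref %.2f bits\n", cost1(p_intra) + cost1(p_last) + cost1(p_garf));
        return 0;
    }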
- memcpy(cpi->twopass.this_frame_stats, + memcpy(&cpi->twopass.this_frame_stats, &fps, sizeof(FIRSTPASS_STATS)); - output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats); - accumulate_stats(cpi->twopass.total_stats, &fps); + output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats); + accumulate_stats(&cpi->twopass.total_stats, &fps); } // Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met if ((cm->current_video_frame > 0) && - (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) && - ((cpi->twopass.this_frame_stats->intra_error / cpi->twopass.this_frame_stats->coded_error) > 2.0)) + (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) && + ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0)) { vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12); } @@ -861,7 +861,7 @@ double bitcost( double prob ) { return -(log( prob ) / log( 2.0 )); } -static long long estimate_modemvcost(VP8_COMP *cpi, +static int64_t estimate_modemvcost(VP8_COMP *cpi, FIRSTPASS_STATS * fpstats) { int mv_cost; @@ -1019,7 +1019,7 @@ static int estimate_max_q(VP8_COMP *cpi, // averaga q observed in clip for non kf/gf.arf frames // Give average a chance to settle though. if ( (cpi->ni_frames > - ((unsigned int)cpi->twopass.total_stats->count >> 8)) && + ((unsigned int)cpi->twopass.total_stats.count >> 8)) && (cpi->ni_frames > 150) ) { cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality) @@ -1075,8 +1075,8 @@ static int estimate_cq( VP8_COMP *cpi, } // II ratio correction factor for clip as a whole - clip_iiratio = cpi->twopass.total_stats->intra_error / - DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error); + clip_iiratio = cpi->twopass.total_stats.intra_error / + DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error); clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025); if (clip_iifactor < 0.80) clip_iifactor = 0.80; @@ -1260,25 +1260,25 @@ void vp8_init_second_pass(VP8_COMP *cpi) double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); - zero_stats(cpi->twopass.total_stats); - zero_stats(cpi->twopass.total_left_stats); + zero_stats(&cpi->twopass.total_stats); + zero_stats(&cpi->twopass.total_left_stats); if (!cpi->twopass.stats_in_end) return; - *cpi->twopass.total_stats = *cpi->twopass.stats_in_end; - *cpi->twopass.total_left_stats = *cpi->twopass.total_stats; + cpi->twopass.total_stats = *cpi->twopass.stats_in_end; + cpi->twopass.total_left_stats = cpi->twopass.total_stats; // each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame // encoded in the second pass is a guess. However the sum duration is not. // Its calculated based on the actual durations of all frames from the first // pass. 
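These firstpass.c hunks, together with the vpx_free() block removed from onyx_if.c later in the diff, turn twopass.total_stats, total_left_stats and this_frame_stats from heap-allocated pointers into structs embedded in the two-pass state, so every '->' becomes '.' and the allocations can no longer fail. Separately, the comment above explains how the second pass recovers the frame rate from the summed first-pass durations; with timestamps in 1/10,000,000-second units the arithmetic is just the following (made-up numbers, not from any real clip):

    #include <stdio.h>

    int main(void)
    {
        double count    = 300.0;         /* frames in the first-pass log */
        double duration = 100000000.0;   /* 10 s in 1/10,000,000 s units */
        double fps = 10000000.0 * count / duration;   /* = 30.0 */
        printf("derived frame rate: %.2f fps\n", fps);
        return 0;
    }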
- vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats->count / cpi->twopass.total_stats->duration); + vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration); - cpi->output_frame_rate = cpi->oxcf.frame_rate; - cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ; - cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration * two_pass_min_rate / 10000000.0); + cpi->output_frame_rate = cpi->frame_rate; + cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ; + cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0); // Calculate a minimum intra value to be used in determining the IIratio // scores used in the second pass. We have this minimum to make sure @@ -1301,7 +1301,7 @@ void vp8_init_second_pass(VP8_COMP *cpi) sum_iiratio += IIRatio; } - cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count); + cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count); // Reset file position reset_fpf_position(cpi, start_pos); @@ -1376,7 +1376,7 @@ static int detect_transition_to_still( double loop_decay_rate, double decay_accumulator ) { - BOOL trans_to_still = FALSE; + int trans_to_still = 0; // Break clause to detect very still sections after motion // For example a static image after a fade or other transition @@ -1406,7 +1406,7 @@ static int detect_transition_to_still( // Only if it does do we signal a transition to still if ( j == still_interval ) - trans_to_still = TRUE; + trans_to_still = 1; } return trans_to_still; @@ -1415,14 +1415,14 @@ static int detect_transition_to_still( // This function detects a flash through the high relative pcnt_second_ref // score in the frame following a flash frame. The offset passed in should // reflect this -static BOOL detect_flash( VP8_COMP *cpi, int offset ) +static int detect_flash( VP8_COMP *cpi, int offset ) { FIRSTPASS_STATS next_frame; - BOOL flash_detected = FALSE; + int flash_detected = 0; // Read the frame data. 
- // The return is FALSE (no flash detected) if not a valid frame + // The return is 0 (no flash detected) if not a valid frame if ( read_frame_stats(cpi, &next_frame, offset) != EOF ) { // What we are looking for here is a situation where there is a @@ -1433,7 +1433,7 @@ static BOOL detect_flash( VP8_COMP *cpi, int offset ) if ( (next_frame.pcnt_second_ref > next_frame.pcnt_inter) && (next_frame.pcnt_second_ref >= 0.5 ) ) { - flash_detected = TRUE; + flash_detected = 1; /*if (1) { @@ -1548,7 +1548,7 @@ static int calc_arf_boost( double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double r; - BOOL flash_detected = FALSE; + int flash_detected = 0; // Search forward from the proposed arf/next gf position for ( i = 0; i < f_frames; i++ ) @@ -1677,7 +1677,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) int alt_boost = 0; int f_boost = 0; int b_boost = 0; - BOOL flash_detected; + int flash_detected; cpi->twopass.gf_group_bits = 0; cpi->twopass.gf_decay_rate = 0; @@ -1751,7 +1751,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) loop_decay_rate, decay_accumulator ) ) { - allow_alt_ref = FALSE; + allow_alt_ref = 0; boost_score = old_boost_score; break; } @@ -1923,7 +1923,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) int frames_bwd = cpi->oxcf.arnr_max_frames - 1; int frames_fwd = cpi->oxcf.arnr_max_frames - 1; - cpi->source_alt_ref_pending = TRUE; + cpi->source_alt_ref_pending = 1; // For alt ref frames the error score for the end frame of the // group (the alt ref frame) should not contribute to the group @@ -1949,7 +1949,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) // Note: this_frame->frame has been updated in the loop // so it now points at the ARF frame. half_gf_int = cpi->baseline_gf_interval >> 1; - frames_after_arf = cpi->twopass.total_stats->count - + frames_after_arf = cpi->twopass.total_stats.count - this_frame->frame - 1; switch (cpi->oxcf.arnr_type) @@ -1989,13 +1989,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) } else { - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; cpi->baseline_gf_interval = i; } } else { - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; cpi->baseline_gf_interval = i; } @@ -2005,7 +2005,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left. // This is also important for short clips where there may only be one // key frame. 
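These hunks also retire the BOOL/TRUE/FALSE aliases in favour of plain int and 0/1 (the common_types.h header that defined them is deleted later in this diff). The detect_flash() heuristic they touch deserves a gloss: a camera flash ruins prediction from the immediately preceding frame, so the frame after the flash predicts markedly better from its older second reference. A minimal sketch of the predicate, with a trimmed stats struct and invented values:

    #include <stdio.h>

    struct stats { double pcnt_inter, pcnt_second_ref; };

    /* Mirrors the condition in detect_flash(): the second (older) reference
     * out-predicts the previous frame and carries at least half the blocks. */
    static int flash_before(const struct stats *next)
    {
        return next->pcnt_second_ref > next->pcnt_inter &&
               next->pcnt_second_ref >= 0.5;
    }

    int main(void)
    {
        struct stats next = { 0.30, 0.65 };
        printf("flash detected: %d\n", flash_before(&next));   /* prints 1 */
        return 0;
    }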
- if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count - + if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame)) { cpi->twopass.kf_group_bits = @@ -2296,7 +2296,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) void vp8_second_pass(VP8_COMP *cpi) { int tmp_q; - int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame); + int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame); FIRSTPASS_STATS this_frame = {0}; FIRSTPASS_STATS this_frame_copy; @@ -2341,7 +2341,7 @@ void vp8_second_pass(VP8_COMP *cpi) cpi->twopass.gf_group_error_left = cpi->twopass.kf_group_error_left; cpi->baseline_gf_interval = cpi->twopass.frames_to_key; cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; } } @@ -2411,7 +2411,7 @@ void vp8_second_pass(VP8_COMP *cpi) // Account for mv, mode and other overheads. overhead_bits = estimate_modemvcost( - cpi, cpi->twopass.total_left_stats ); + cpi, &cpi->twopass.total_left_stats ); // Special case code for first frame. if (cpi->common.current_video_frame == 0) @@ -2425,7 +2425,7 @@ void vp8_second_pass(VP8_COMP *cpi) est_cq = estimate_cq( cpi, - cpi->twopass.total_left_stats, + &cpi->twopass.total_left_stats, (int)(cpi->twopass.bits_left / frames_left), overhead_bits ); @@ -2440,7 +2440,7 @@ void vp8_second_pass(VP8_COMP *cpi) tmp_q = estimate_max_q( cpi, - cpi->twopass.total_left_stats, + &cpi->twopass.total_left_stats, (int)(cpi->twopass.bits_left / frames_left), overhead_bits ); @@ -2463,16 +2463,16 @@ void vp8_second_pass(VP8_COMP *cpi) // radical adjustments to the allowed quantizer range just to use up a // few surplus bits or get beneath the target rate. 
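vp8_second_pass() above follows the usual two-pass budgeting pattern: the bits still unspent are spread over the frames still to be coded and the result is handed to estimate_cq()/estimate_max_q() as a per-frame target. A toy version of that division, including the frames_left floor the source applies, with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t bits_left   = 4500000;   /* remaining budget */
        int     frames_left = 150;       /* total_stats.count - current frame */
        if (frames_left < 1)             /* same guard as the source */
            frames_left = 1;
        printf("per-frame target: %lld bits\n",
               (long long)(bits_left / frames_left));
        return 0;
    }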
else if ( (cpi->common.current_video_frame < - (((unsigned int)cpi->twopass.total_stats->count * 255)>>8)) && + (((unsigned int)cpi->twopass.total_stats.count * 255)>>8)) && ((cpi->common.current_video_frame + cpi->baseline_gf_interval) < - (unsigned int)cpi->twopass.total_stats->count) ) + (unsigned int)cpi->twopass.total_stats.count) ) { if (frames_left < 1) frames_left = 1; tmp_q = estimate_max_q( cpi, - cpi->twopass.total_left_stats, + &cpi->twopass.total_left_stats, (int)(cpi->twopass.bits_left / frames_left), overhead_bits ); @@ -2489,13 +2489,13 @@ void vp8_second_pass(VP8_COMP *cpi) cpi->twopass.frames_to_key --; // Update the total stats remaining sturcture - subtract_stats(cpi->twopass.total_left_stats, &this_frame ); + subtract_stats(&cpi->twopass.total_left_stats, &this_frame ); } -static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame) +static int test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame) { - BOOL is_viable_kf = FALSE; + int is_viable_kf = 0; // Does the frame satisfy the primary criteria of a key frame // If so, then examine how well it predicts subsequent frames @@ -2569,13 +2569,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST // If there is tolerable prediction for at least the next 3 frames then break out else discard this pottential key frame and move on if (boost_score > 5.0 && (i > 3)) - is_viable_kf = TRUE; + is_viable_kf = 1; else { // Reset the file position reset_fpf_position(cpi, start_pos); - is_viable_kf = FALSE; + is_viable_kf = 0; } } @@ -2611,7 +2611,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) cpi->this_key_frame_forced = cpi->next_key_frame_forced; // Clear the alt ref active flag as this can never be active on a key frame - cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_active = 0; // Kf is always a gf so clear frames till next gf counter cpi->frames_till_gf_update_due = 0; @@ -2727,10 +2727,10 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) // Reset to the start of the group reset_fpf_position(cpi, current_pos); - cpi->next_key_frame_forced = TRUE; + cpi->next_key_frame_forced = 1; } else - cpi->next_key_frame_forced = FALSE; + cpi->next_key_frame_forced = 0; // Special case for the last frame of the file if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) @@ -3034,8 +3034,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (cpi->oxcf.allow_spatial_resampling) { - int resample_trigger = FALSE; - int last_kf_resampled = FALSE; + int resample_trigger = 0; + int last_kf_resampled = 0; int kf_q; int scale_val = 0; int hr, hs, vr, vs; @@ -3053,15 +3053,15 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) double effective_size_ratio; if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height)) - last_kf_resampled = TRUE; + last_kf_resampled = 1; // Set back to unscaled by defaults cpi->common.horiz_scale = NORMAL; cpi->common.vert_scale = NORMAL; // Calculate Average bits per frame. 
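The spatial-resampling decision below reduces to a little arithmetic: an average per-frame budget from the target bandwidth and frame rate, a whole-clip budget, and, in the CBR branch, a trip-wire when the projected key-frame Q exceeds worst_quality while the buffer overspend passes 5% of the clip budget (the clip_bits / 20 test). Sketch with invented numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        double  frame_rate = 30.0, count = 300.0;    /* a 10 s clip */
        int64_t target_bandwidth = 500000;           /* bits per second */
        int64_t over_spend = 300000;                 /* starting - current buffer level */

        double  av_bits_per_frame = target_bandwidth / frame_rate;
        int64_t clip_bits = (int64_t)(count * target_bandwidth / frame_rate);

        printf("avg %.0f bits/frame, clip %lld bits, overspend > 5%%: %d\n",
               av_bits_per_frame, (long long)clip_bits,
               over_spend > clip_bits / 20);          /* 300000 > 250000 -> 1 */
        return 0;
    }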
- //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats->count - cpi->common.current_video_frame); - av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate); + //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats.count - cpi->common.current_video_frame); + av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate); //if ( av_bits_per_frame < 0.0 ) // av_bits_per_frame = 0.0 @@ -3117,21 +3117,21 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)))) //( ((cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))) && // ((projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))) )) - resample_trigger = TRUE; + resample_trigger = 1; else - resample_trigger = FALSE; + resample_trigger = 0; } else { - int64_t clip_bits = (int64_t)(cpi->twopass.total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate)); + int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate)); int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level; if ((last_kf_resampled && (kf_q > cpi->worst_quality)) || // If triggered last time the threshold for triggering again is reduced ((kf_q > cpi->worst_quality) && // Projected Q higher than allowed and ... (over_spend > clip_bits / 20))) // ... Overspend > 5% of total bits - resample_trigger = TRUE; + resample_trigger = 1; else - resample_trigger = FALSE; + resample_trigger = 0; } diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c index b92e82bdf..3e582e369 100644 --- a/vp8/encoder/lookahead.c +++ b/vp8/encoder/lookahead.c @@ -48,7 +48,7 @@ vp8_lookahead_destroy(struct lookahead_ctx *ctx) { if(ctx->buf) { - int i; + unsigned int i; for(i = 0; i < ctx->max_sz; i++) vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img); @@ -65,7 +65,7 @@ vp8_lookahead_init(unsigned int width, unsigned int depth) { struct lookahead_ctx *ctx = NULL; - int i; + unsigned int i; /* Clamp the lookahead queue depth */ if(depth < 1) @@ -188,7 +188,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx, struct lookahead_entry* vp8_lookahead_peek(struct lookahead_ctx *ctx, - int index) + unsigned int index) { struct lookahead_entry* buf = NULL; diff --git a/vp8/encoder/lookahead.h b/vp8/encoder/lookahead.h index afb3fd4a9..32bafcd63 100644 --- a/vp8/encoder/lookahead.h +++ b/vp8/encoder/lookahead.h @@ -92,7 +92,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx, */ struct lookahead_entry* vp8_lookahead_peek(struct lookahead_ctx *ctx, - int index); + unsigned int index); /**\brief Get the number of frames currently in the lookahead queue diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index c1a0ea7bf..735af95ca 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -9,6 +9,7 @@ */ +#include "onyx_int.h" #include "mcomp.h" #include "vpx_mem/vpx_mem.h" #include "vpx_config.h" @@ -182,8 +183,6 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost #define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) 
< besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best -#define MIN(x,y) (((x)<(y))?(x):(y)) -#define MAX(x,y) (((x)>(y))?(x):(y)) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -331,8 +330,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #undef IFMVCV #undef ERR #undef CHECK_BETTER -#undef MIN -#undef MAX + int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, @@ -854,6 +852,8 @@ int vp8_hex_search int k = -1; int all_in; int best_site = -1; + int hex_range = 127; + int dia_range = 8; int_mv fcenter_mv; fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; @@ -873,6 +873,18 @@ int vp8_hex_search in_what_stride, 0x7fffffff) + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); +#if CONFIG_MULTI_RES_ENCODING + /* Lower search range based on prediction info */ + if (search_param >= 6) goto cal_neighbors; + else if (search_param >= 5) hex_range = 4; + else if (search_param >= 4) hex_range = 6; + else if (search_param >= 3) hex_range = 15; + else if (search_param >= 2) hex_range = 31; + else if (search_param >= 1) hex_range = 63; + + dia_range = 8; +#endif + // hex search //j=0 CHECK_BOUNDS(2) @@ -909,7 +921,7 @@ int vp8_hex_search k = best_site; } - for (j = 1; j < 127; j++) + for (j = 1; j < hex_range; j++) { best_site = -1; CHECK_BOUNDS(2) @@ -951,7 +963,7 @@ int vp8_hex_search // check 4 1-away neighbors cal_neighbors: - for (j = 0; j < 32; j++) + for (j = 0; j < dia_range; j++) { best_site = -1; CHECK_BOUNDS(1) @@ -1144,7 +1156,7 @@ int vp8_diamond_search_sadx4 int tot_steps; int_mv this_mv; - int bestsad = INT_MAX; + unsigned int bestsad = UINT_MAX; int best_site = 0; int last_site = 0; @@ -1385,7 +1397,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, unsigned char *bestaddress; int_mv *best_mv = &d->bmi.mv; int_mv this_mv; - int bestsad = INT_MAX; + unsigned int bestsad = UINT_MAX; int r, c; unsigned char *check_here; @@ -1515,7 +1527,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, unsigned char *bestaddress; int_mv *best_mv = &d->bmi.mv; int_mv this_mv; - int bestsad = INT_MAX; + unsigned int bestsad = UINT_MAX; int r, c; unsigned char *check_here; diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c new file mode 100644 index 000000000..7a62a06ec --- /dev/null +++ b/vp8/encoder/mr_dissim.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include <limits.h> +#include "vpx_config.h" +#include "onyx_int.h" +#include "mr_dissim.h" +#include "vpx_mem/vpx_mem.h" +#include "rdopt.h" + +void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) +{ + int low_res_w; + + /* Support arbitrary down-sampling factor */ + unsigned int iw = cpi->oxcf.Width*cpi->oxcf.mr_down_sampling_factor.den + + cpi->oxcf.mr_down_sampling_factor.num - 1; + + low_res_w = iw/cpi->oxcf.mr_down_sampling_factor.num; + cpi->mr_low_res_mb_cols = ((low_res_w + 15) >> 4); +} + +#define GET_MV(x) \ +if(x->mbmi.ref_frame !=INTRA_FRAME) \ +{ \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + cnt++; \ +} + +#define GET_MV_SIGN(x) \ +if(x->mbmi.ref_frame !=INTRA_FRAME) \ +{ \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + if (cm->ref_frame_sign_bias[x->mbmi.ref_frame] \ + != cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]) \ + { \ + mvx[cnt] *= -1; \ + mvy[cnt] *= -1; \ + } \ + cnt++; \ +} + +void vp8_cal_dissimilarity(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + /* Note: The first row & first column in mip are outside the frame, which + * were initialized to all 0.(ref_frame, mode, mv...) + * Their ref_frame = 0 means they won't be counted in the following + * calculation. + */ + if (cpi->oxcf.mr_total_resolutions >1 + && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1)) + { + /* Store info for show/no-show frames for supporting alt_ref. + * If parent frame is alt_ref, child has one too. + */ + if(cm->frame_type != KEY_FRAME) + { + int mb_row; + int mb_col; + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip + cm->mode_info_stride; + LOWER_RES_INFO* store_mode_info + = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++) + { + tmp++; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++) + { + int dissim = INT_MAX; + + if(tmp->mbmi.ref_frame !=INTRA_FRAME) + { + int mvx[8]; + int mvy[8]; + int mmvx; + int mmvy; + int cnt=0; + const MODE_INFO *here = tmp; + const MODE_INFO *above = here - cm->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + const MODE_INFO *aboveright = NULL; + const MODE_INFO *right = NULL; + const MODE_INFO *belowleft = NULL; + const MODE_INFO *below = NULL; + const MODE_INFO *belowright = NULL; + + /* If alternate reference frame is used, we have to + * check sign of MV. 
*/ + if(cpi->oxcf.play_alternate) + { + /* Gather mv of neighboring MBs */ + GET_MV_SIGN(above) + GET_MV_SIGN(left) + GET_MV_SIGN(aboveleft) + + if(mb_col < (cm->mb_cols-1)) + { + right = here + 1; + aboveright = above + 1; + GET_MV_SIGN(right) + GET_MV_SIGN(aboveright) + } + + if(mb_row < (cm->mb_rows-1)) + { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV_SIGN(below) + GET_MV_SIGN(belowleft) + } + + if(mb_col < (cm->mb_cols-1) + && mb_row < (cm->mb_rows-1)) + { + belowright = below + 1; + GET_MV_SIGN(belowright) + } + }else + { + /* No alt_ref and gather mv of neighboring MBs */ + GET_MV(above) + GET_MV(left) + GET_MV(aboveleft) + + if(mb_col < (cm->mb_cols-1)) + { + right = here + 1; + aboveright = above + 1; + GET_MV(right) + GET_MV(aboveright) + } + + if(mb_row < (cm->mb_rows-1)) + { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV(below) + GET_MV(belowleft) + } + + if(mb_col < (cm->mb_cols-1) + && mb_row < (cm->mb_rows-1)) + { + belowright = below + 1; + GET_MV(belowright) + } + } + + if (cnt > 0) + { + int max_mvx = mvx[0]; + int min_mvx = mvx[0]; + int max_mvy = mvy[0]; + int min_mvy = mvy[0]; + int i; + + if (cnt > 1) + { + for (i=1; i< cnt; i++) + { + if (mvx[i] > max_mvx) max_mvx = mvx[i]; + else if (mvx[i] < min_mvx) min_mvx = mvx[i]; + if (mvy[i] > max_mvy) max_mvy = mvy[i]; + else if (mvy[i] < min_mvy) min_mvy = mvy[i]; + } + } + + mmvx = MAX(abs(min_mvx - here->mbmi.mv.as_mv.row), + abs(max_mvx - here->mbmi.mv.as_mv.row)); + mmvy = MAX(abs(min_mvy - here->mbmi.mv.as_mv.col), + abs(max_mvy - here->mbmi.mv.as_mv.col)); + dissim = MAX(mmvx, mmvy); + } + } + + /* Store mode info for next resolution encoding */ + store_mode_info->mode = tmp->mbmi.mode; + store_mode_info->ref_frame = tmp->mbmi.ref_frame; + store_mode_info->mv.as_int = tmp->mbmi.mv.as_int; + store_mode_info->dissim = dissim; + tmp++; + store_mode_info++; + } + } + } + } +} diff --git a/vp8/common/common_types.h b/vp8/encoder/mr_dissim.h index 4e6248697..3d2c2035f 100644 --- a/vp8/common/common_types.h +++ b/vp8/encoder/mr_dissim.h @@ -9,10 +9,11 @@ */ -#ifndef __INC_COMMON_TYPES -#define __INC_COMMON_TYPES +#ifndef __INC_MR_DISSIM_H +#define __INC_MR_DISSIM_H +#include "vpx_config.h" -#define TRUE 1 -#define FALSE 0 +extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi); +extern void vp8_cal_dissimilarity(VP8_COMP *cpi); #endif diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 1d00e6777..e3f951925 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -23,7 +23,6 @@ #include "ratectrl.h" #include "vp8/common/quant_common.h" #include "segmentation.h" -#include "vp8/common/g_common.h" #include "vpx_scale/yv12extend.h" #if CONFIG_POSTPROC #include "vp8/common/postproc.h" @@ -36,6 +35,9 @@ #if ARCH_ARM #include "vpx_ports/arm.h" #endif +#if CONFIG_MULTI_RES_ENCODING +#include "mr_dissim.h" +#endif #include <math.h> #include <stdio.h> @@ -67,6 +69,7 @@ extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_ #endif int vp8_estimate_entropy_savings(VP8_COMP *cpi); + int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd); extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance); @@ -355,7 +358,7 @@ static void dealloc_compressor_data(VP8_COMP *cpi) vp8_de_alloc_frame_buffers(&cpi->common); - vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf); + vp8_yv12_de_alloc_frame_buffer(&cpi->pick_lf_lvl_frame); vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source); 
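The new mr_dissim.c above gives the multi-resolution encoder a per-macroblock dissimilarity score for the next resolution to consult: gather the motion vectors of up to eight neighbouring MBs (sign-corrected when an alt-ref with opposite sign bias is in play), then take the largest per-component deviation from the current MB's vector and the maximum across the two components. A self-contained sketch of that core, with the neighbour gathering flattened into plain arrays and invented vectors:

    #include <stdio.h>
    #include <stdlib.h>

    #define MAXV(a, b) ((a) > (b) ? (a) : (b))

    /* Largest deviation of any neighbour MV from (here_x, here_y),
     * per component, then maximised, as in vp8_cal_dissimilarity(). */
    static int dissim(int here_x, int here_y,
                      const int *mvx, const int *mvy, int cnt)
    {
        int min_x = mvx[0], max_x = mvx[0];
        int min_y = mvy[0], max_y = mvy[0];
        int i;

        for (i = 1; i < cnt; i++)
        {
            if (mvx[i] > max_x) max_x = mvx[i];
            else if (mvx[i] < min_x) min_x = mvx[i];
            if (mvy[i] > max_y) max_y = mvy[i];
            else if (mvy[i] < min_y) min_y = mvy[i];
        }
        return MAXV(MAXV(abs(min_x - here_x), abs(max_x - here_x)),
                    MAXV(abs(min_y - here_y), abs(max_y - here_y)));
    }

    int main(void)
    {
        int mvx[3] = { 4, -2, 6 }, mvy[3] = { 0, 8, -4 };
        printf("dissim = %d\n", dissim(2, 1, mvx, mvy, 3));  /* max(4,7) = 7 */
        return 0;
    }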
#if VP8_TEMPORAL_ALT_REF vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer); @@ -377,42 +380,25 @@ static void dealloc_compressor_data(VP8_COMP *cpi) vpx_free(cpi->mb.pip); cpi->mb.pip = 0; - -#if !(CONFIG_REALTIME_ONLY) - vpx_free(cpi->twopass.total_stats); - cpi->twopass.total_stats = 0; - - vpx_free(cpi->twopass.total_left_stats); - cpi->twopass.total_left_stats = 0; - - vpx_free(cpi->twopass.this_frame_stats); - cpi->twopass.this_frame_stats = 0; -#endif } -static void enable_segmentation(VP8_PTR ptr) +static void enable_segmentation(VP8_COMP *cpi) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - // Set the appropriate feature bit cpi->mb.e_mbd.segmentation_enabled = 1; cpi->mb.e_mbd.update_mb_segmentation_map = 1; cpi->mb.e_mbd.update_mb_segmentation_data = 1; } -static void disable_segmentation(VP8_PTR ptr) +static void disable_segmentation(VP8_COMP *cpi) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - // Clear the appropriate feature bit cpi->mb.e_mbd.segmentation_enabled = 0; } // Valid values for a segment are 0 to 3 // Segmentation map is arrange as [Rows][Columns] -static void set_segmentation_map(VP8_PTR ptr, unsigned char *segmentation_map) +static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - // Copy in the new segmentation map vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols)); @@ -429,19 +415,15 @@ static void set_segmentation_map(VP8_PTR ptr, unsigned char *segmentation_map) // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use the absolute values given). // // -static void set_segment_data(VP8_PTR ptr, signed char *feature_data, unsigned char abs_delta) +static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta; vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data)); } -static void segmentation_test_function(VP8_PTR ptr) +static void segmentation_test_function(VP8_COMP *cpi) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - unsigned char *seg_map; signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; @@ -469,10 +451,10 @@ static void segmentation_test_function(VP8_PTR ptr) }*/ // Set the segmentation Map - set_segmentation_map(ptr, seg_map); + set_segmentation_map(cpi, seg_map); // Activate segmentation. - enable_segmentation(ptr); + enable_segmentation(cpi); // Set up the quant segment data feature_data[MB_LVL_ALT_Q][0] = 0; @@ -487,7 +469,7 @@ static void segmentation_test_function(VP8_PTR ptr) // Initialise the feature data structure // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 - set_segment_data(ptr, &feature_data[0][0], SEGMENT_DELTADATA); + set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); // Delete sementation map vpx_free(seg_map); @@ -561,10 +543,10 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) } // Set the segmentation Map - set_segmentation_map((VP8_PTR)cpi, seg_map); + set_segmentation_map(cpi, seg_map); // Activate segmentation. 
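The onyx_if.c hunks in this region also swap the opaque VP8_PTR handle for the real VP8_COMP * in the internal segmentation helpers, deleting a cast at every call site, as in the enable_segmentation() call just below. Reduced to a toy struct (all names here invented), the before/after pattern is:

    #include <stdio.h>

    typedef struct { int segmentation_enabled; } COMP;

    /* before: opaque handle, callee casts, the compiler checks nothing */
    static void enable_old(void *ptr) { ((COMP *)ptr)->segmentation_enabled = 1; }

    /* after: typed pointer, checked by the compiler */
    static void enable_new(COMP *cpi) { cpi->segmentation_enabled = 1; }

    int main(void)
    {
        COMP c = { 0 };
        enable_old((void *)&c);
        enable_new(&c);
        printf("%d\n", c.segmentation_enabled);
        return 0;
    }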
- enable_segmentation((VP8_PTR)cpi); + enable_segmentation(cpi); // Set up the quant segment data feature_data[MB_LVL_ALT_Q][0] = 0; @@ -580,7 +562,7 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) // Initialise the feature data structure // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 - set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); + set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); // Delete sementation map vpx_free(seg_map); @@ -609,6 +591,93 @@ static void set_default_lf_deltas(VP8_COMP *cpi) cpi->mb.e_mbd.mode_lf_deltas[3] = 4; // Split mv } +/* Convenience macros for mapping speed and mode into a continuous + * range + */ +#define GOOD(x) (x+1) +#define RT(x) (x+7) + +static int speed_map(int speed, int *map) +{ + int res; + + do + { + res = *map++; + } while(speed >= *map++); + return res; +} + +static int thresh_mult_map_znn[] = { + /* map common to zero, nearest, and near */ + 0, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(2), 2000, INT_MAX +}; + +static int thresh_mult_map_vhpred[] = { + 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(1), 2000, + RT(7), INT_MAX, INT_MAX +}; + +static int thresh_mult_map_bpred[] = { + 2000, GOOD(0), 2500, GOOD(2), 5000, GOOD(3), 7500, RT(0), 2500, RT(1), 5000, + RT(6), INT_MAX, INT_MAX +}; + +static int thresh_mult_map_tm[] = { + 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 0, RT(1), 1000, RT(2), 2000, + RT(7), INT_MAX, INT_MAX +}; + +static int thresh_mult_map_new1[] = { + 1000, GOOD(2), 2000, RT(0), 2000, INT_MAX +}; + +static int thresh_mult_map_new2[] = { + 1000, GOOD(2), 2000, GOOD(3), 2500, GOOD(5), 4000, RT(0), 2000, RT(2), 2500, + RT(5), 4000, INT_MAX +}; + +static int thresh_mult_map_split1[] = { + 2500, GOOD(0), 1700, GOOD(2), 10000, GOOD(3), 25000, GOOD(4), INT_MAX, + RT(0), 5000, RT(1), 10000, RT(2), 25000, RT(3), INT_MAX, INT_MAX +}; + +static int thresh_mult_map_split2[] = { + 5000, GOOD(0), 4500, GOOD(2), 20000, GOOD(3), 50000, GOOD(4), INT_MAX, + RT(0), 10000, RT(1), 20000, RT(2), 50000, RT(3), INT_MAX, INT_MAX +}; + +static int mode_check_freq_map_zn2[] = { + /* {zero,nearest}{2,3} */ + 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX +}; + +static int mode_check_freq_map_vhbpred[] = { + 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX +}; + +static int mode_check_freq_map_near2[] = { + 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(10), 1<<2, RT(11), 1<<3, RT(12), 1<<4, + INT_MAX +}; + +static int mode_check_freq_map_new1[] = { + 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX +}; + +static int mode_check_freq_map_new2[] = { + 0, GOOD(5), 4, RT(0), 0, RT(3), 4, RT(10), 1<<3, RT(11), 1<<4, RT(12), 1<<5, + INT_MAX +}; + +static int mode_check_freq_map_split1[] = { + 0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX +}; + +static int mode_check_freq_map_split2[] = { + 0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX +}; + void vp8_set_speed_features(VP8_COMP *cpi) { SPEED_FEATURES *sf = &cpi->sf; @@ -617,6 +686,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) int i; VP8_COMMON *cm = &cpi->common; int last_improved_quant = sf->improved_quant; + int ref_frames; // Initialise default mode frequency sampling variables for (i = 0; i < MAX_MODES; i ++) @@ -650,93 +720,90 @@ void vp8_set_speed_features(VP8_COMP *cpi) for (i = 0; i < MAX_MODES; i++) sf->thresh_mult[i] = 0; + /* Count enabled references */ + ref_frames = 1; + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + ref_frames++; + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + ref_frames++; + 
if (cpi->ref_frame_flags & VP8_ALT_FLAG) + ref_frames++; + + /* Convert speed to continuous range, with clamping */ + if (Mode == 0) + Speed = 0; + else if (Mode == 2) + Speed = RT(Speed); + else + { + if (Speed > 5) + Speed = 5; + Speed = GOOD(Speed); + } + + sf->thresh_mult[THR_ZERO1] = + sf->thresh_mult[THR_NEAREST1] = + sf->thresh_mult[THR_NEAR1] = + sf->thresh_mult[THR_DC] = 0; /* always */ + + sf->thresh_mult[THR_ZERO2] = + sf->thresh_mult[THR_ZERO3] = + sf->thresh_mult[THR_NEAREST2] = + sf->thresh_mult[THR_NEAREST3] = + sf->thresh_mult[THR_NEAR2] = + sf->thresh_mult[THR_NEAR3] = speed_map(Speed, thresh_mult_map_znn); + + sf->thresh_mult[THR_V_PRED] = + sf->thresh_mult[THR_H_PRED] = speed_map(Speed, thresh_mult_map_vhpred); + sf->thresh_mult[THR_B_PRED] = speed_map(Speed, thresh_mult_map_bpred); + sf->thresh_mult[THR_TM] = speed_map(Speed, thresh_mult_map_tm); + sf->thresh_mult[THR_NEW1] = speed_map(Speed, thresh_mult_map_new1); + sf->thresh_mult[THR_NEW2] = + sf->thresh_mult[THR_NEW3] = speed_map(Speed, thresh_mult_map_new2); + sf->thresh_mult[THR_SPLIT1] = speed_map(Speed, thresh_mult_map_split1); + sf->thresh_mult[THR_SPLIT2] = + sf->thresh_mult[THR_SPLIT3] = speed_map(Speed, thresh_mult_map_split2); + + cpi->mode_check_freq[THR_ZERO1] = + cpi->mode_check_freq[THR_NEAREST1] = + cpi->mode_check_freq[THR_NEAR1] = + cpi->mode_check_freq[THR_TM] = + cpi->mode_check_freq[THR_DC] = 0; /* always */ + + cpi->mode_check_freq[THR_ZERO2] = + cpi->mode_check_freq[THR_ZERO3] = + cpi->mode_check_freq[THR_NEAREST2] = + cpi->mode_check_freq[THR_NEAREST3] = speed_map(Speed, + mode_check_freq_map_zn2); + + cpi->mode_check_freq[THR_NEAR2] = + cpi->mode_check_freq[THR_NEAR3] = speed_map(Speed, + mode_check_freq_map_near2); + + cpi->mode_check_freq[THR_V_PRED] = + cpi->mode_check_freq[THR_H_PRED] = + cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed, + mode_check_freq_map_vhbpred); + cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, + mode_check_freq_map_new1); + cpi->mode_check_freq[THR_NEW2] = + cpi->mode_check_freq[THR_NEW3] = speed_map(Speed, + mode_check_freq_map_new2); + cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed, + mode_check_freq_map_split1); + cpi->mode_check_freq[THR_SPLIT2] = + cpi->mode_check_freq[THR_SPLIT3] = speed_map(Speed, + mode_check_freq_map_split2); + Speed = cpi->Speed; switch (Mode) { #if !(CONFIG_REALTIME_ONLY) case 0: // best quality mode - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEARG ] = 0; - sf->thresh_mult[THR_NEARA ] = 0; - - sf->thresh_mult[THR_DC ] = 0; - - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2000; - sf->thresh_mult[THR_TM ] = 1000; - - sf->thresh_mult[THR_NEWMV ] = 1000; - sf->thresh_mult[THR_NEWG ] = 1000; - sf->thresh_mult[THR_NEWA ] = 1000; - - sf->thresh_mult[THR_SPLITMV ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 5000; - sf->thresh_mult[THR_SPLITA ] = 5000; - - sf->first_step = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; break; case 1: case 3: - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2500; - sf->thresh_mult[THR_TM ] = 1000; - - 
sf->thresh_mult[THR_NEARESTG ] = 1000; - sf->thresh_mult[THR_NEARESTA ] = 1000; - - sf->thresh_mult[THR_ZEROG ] = 1000; - sf->thresh_mult[THR_ZEROA ] = 1000; - sf->thresh_mult[THR_NEARG ] = 1000; - sf->thresh_mult[THR_NEARA ] = 1000; - -#if 1 - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEARG ] = 0; - sf->thresh_mult[THR_NEARA ] = 0; - -// sf->thresh_mult[THR_DC ] = 0; - -// sf->thresh_mult[THR_V_PRED ] = 1000; -// sf->thresh_mult[THR_H_PRED ] = 1000; -// sf->thresh_mult[THR_B_PRED ] = 2000; -// sf->thresh_mult[THR_TM ] = 1000; - - sf->thresh_mult[THR_NEWMV ] = 1000; - sf->thresh_mult[THR_NEWG ] = 1000; - sf->thresh_mult[THR_NEWA ] = 1000; - - sf->thresh_mult[THR_SPLITMV ] = 1700; - sf->thresh_mult[THR_SPLITG ] = 4500; - sf->thresh_mult[THR_SPLITA ] = 4500; -#else - sf->thresh_mult[THR_NEWMV ] = 1500; - sf->thresh_mult[THR_NEWG ] = 1500; - sf->thresh_mult[THR_NEWA ] = 1500; - - sf->thresh_mult[THR_SPLITMV ] = 5000; - sf->thresh_mult[THR_SPLITG ] = 10000; - sf->thresh_mult[THR_SPLITA ] = 10000; -#endif - if (Speed > 0) { /* Disable coefficient optimization above speed 0 */ @@ -745,83 +812,10 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->no_skip_block4x4_search = 0; sf->first_step = 1; - - cpi->mode_check_freq[THR_SPLITG] = 2; - cpi->mode_check_freq[THR_SPLITA] = 2; - cpi->mode_check_freq[THR_SPLITMV] = 0; - } - - if (Speed > 1) - { - cpi->mode_check_freq[THR_SPLITG] = 4; - cpi->mode_check_freq[THR_SPLITA] = 4; - cpi->mode_check_freq[THR_SPLITMV] = 2; - - sf->thresh_mult[THR_TM ] = 1500; - sf->thresh_mult[THR_V_PRED ] = 1500; - sf->thresh_mult[THR_H_PRED ] = 1500; - sf->thresh_mult[THR_B_PRED ] = 5000; - - if (cpi->ref_frame_flags & VP8_LAST_FLAG) - { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 10000; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 1500; - sf->thresh_mult[THR_ZEROG ] = 1500; - sf->thresh_mult[THR_NEARG ] = 1500; - sf->thresh_mult[THR_NEWG ] = 2000; - sf->thresh_mult[THR_SPLITG ] = 20000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 1500; - sf->thresh_mult[THR_ZEROA ] = 1500; - sf->thresh_mult[THR_NEARA ] = 1500; - sf->thresh_mult[THR_NEWA ] = 2000; - sf->thresh_mult[THR_SPLITA ] = 20000; - } } if (Speed > 2) { - cpi->mode_check_freq[THR_SPLITG] = 15; - cpi->mode_check_freq[THR_SPLITA] = 15; - cpi->mode_check_freq[THR_SPLITMV] = 7; - - sf->thresh_mult[THR_TM ] = 2000; - sf->thresh_mult[THR_V_PRED ] = 2000; - sf->thresh_mult[THR_H_PRED ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 7500; - - if (cpi->ref_frame_flags & VP8_LAST_FLAG) - { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 25000; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 50000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 2500; - sf->thresh_mult[THR_SPLITA ] = 50000; - } - sf->improved_quant = 0; sf->improved_dct = 0; @@ -833,18 +827,6 @@ void vp8_set_speed_features(VP8_COMP *cpi) if (Speed > 3) { - 
sf->thresh_mult[THR_SPLITA ] = INT_MAX; - sf->thresh_mult[THR_SPLITG ] = INT_MAX; - sf->thresh_mult[THR_SPLITMV ] = INT_MAX; - - cpi->mode_check_freq[THR_V_PRED] = 0; - cpi->mode_check_freq[THR_H_PRED] = 0; - cpi->mode_check_freq[THR_B_PRED] = 0; - cpi->mode_check_freq[THR_NEARG] = 0; - cpi->mode_check_freq[THR_NEWG] = 0; - cpi->mode_check_freq[THR_NEARA] = 0; - cpi->mode_check_freq[THR_NEWA] = 0; - sf->auto_filter = 1; sf->recode_loop = 0; // recode loop off sf->RD = 0; // Turn rd off @@ -854,38 +836,6 @@ void vp8_set_speed_features(VP8_COMP *cpi) if (Speed > 4) { sf->auto_filter = 0; // Faster selection of loop filter - - cpi->mode_check_freq[THR_V_PRED] = 2; - cpi->mode_check_freq[THR_H_PRED] = 2; - cpi->mode_check_freq[THR_B_PRED] = 2; - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - cpi->mode_check_freq[THR_NEARG] = 2; - cpi->mode_check_freq[THR_NEWG] = 4; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - cpi->mode_check_freq[THR_NEARA] = 2; - cpi->mode_check_freq[THR_NEWA] = 4; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 4000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 4000; - } } break; @@ -895,67 +845,10 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->recode_loop = 0; sf->auto_filter = 1; sf->iterative_sub_pixel = 1; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_TM ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2500; - sf->thresh_mult[THR_NEARESTG ] = 1000; - sf->thresh_mult[THR_ZEROG ] = 1000; - sf->thresh_mult[THR_NEARG ] = 1000; - sf->thresh_mult[THR_NEARESTA ] = 1000; - sf->thresh_mult[THR_ZEROA ] = 1000; - sf->thresh_mult[THR_NEARA ] = 1000; - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_NEWG ] = 2000; - sf->thresh_mult[THR_NEWA ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 5000; - sf->thresh_mult[THR_SPLITG ] = 10000; - sf->thresh_mult[THR_SPLITA ] = 10000; sf->search_method = NSTEP; if (Speed > 0) { - cpi->mode_check_freq[THR_SPLITG] = 4; - cpi->mode_check_freq[THR_SPLITA] = 4; - cpi->mode_check_freq[THR_SPLITMV] = 2; - - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_TM ] = 1000; - sf->thresh_mult[THR_V_PRED ] = 2000; - sf->thresh_mult[THR_H_PRED ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 5000; - - if (cpi->ref_frame_flags & VP8_LAST_FLAG) - { - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 10000; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 1000; - sf->thresh_mult[THR_ZEROG ] = 1000; - sf->thresh_mult[THR_NEARG ] = 1000; - sf->thresh_mult[THR_NEWG ] = 2000; - sf->thresh_mult[THR_SPLITG ] = 20000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 1000; - sf->thresh_mult[THR_ZEROA ] = 1000; - sf->thresh_mult[THR_NEARA ] = 1000; - sf->thresh_mult[THR_NEWA ] = 2000; - sf->thresh_mult[THR_SPLITA ] = 20000; - } - sf->improved_quant = 0; sf->improved_dct = 0; @@ -964,133 +857,28 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->first_step = 1; } - 
if (Speed > 1) - { - cpi->mode_check_freq[THR_SPLITMV] = 7; - cpi->mode_check_freq[THR_SPLITG] = 15; - cpi->mode_check_freq[THR_SPLITA] = 15; - - sf->thresh_mult[THR_TM ] = 2000; - sf->thresh_mult[THR_V_PRED ] = 2000; - sf->thresh_mult[THR_H_PRED ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 5000; - - if (cpi->ref_frame_flags & VP8_LAST_FLAG) - { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 25000; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 50000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 2500; - sf->thresh_mult[THR_SPLITA ] = 50000; - } - - } - if (Speed > 2) - { sf->auto_filter = 0; // Faster selection of loop filter - cpi->mode_check_freq[THR_V_PRED] = 2; - cpi->mode_check_freq[THR_H_PRED] = 2; - cpi->mode_check_freq[THR_B_PRED] = 2; - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - cpi->mode_check_freq[THR_NEARG] = 2; - cpi->mode_check_freq[THR_NEWG] = 4; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - cpi->mode_check_freq[THR_NEARA] = 2; - cpi->mode_check_freq[THR_NEWA] = 4; - } - - sf->thresh_mult[THR_SPLITMV ] = INT_MAX; - sf->thresh_mult[THR_SPLITG ] = INT_MAX; - sf->thresh_mult[THR_SPLITA ] = INT_MAX; - - } - if (Speed > 3) { sf->RD = 0; - sf->auto_filter = 1; } if (Speed > 4) { sf->auto_filter = 0; // Faster selection of loop filter - sf->search_method = HEX; - //sf->search_method = DIAMOND; - sf->iterative_sub_pixel = 0; - - cpi->mode_check_freq[THR_V_PRED] = 4; - cpi->mode_check_freq[THR_H_PRED] = 4; - cpi->mode_check_freq[THR_B_PRED] = 4; - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - cpi->mode_check_freq[THR_NEARG] = 2; - cpi->mode_check_freq[THR_NEWG] = 4; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - cpi->mode_check_freq[THR_NEARA] = 2; - cpi->mode_check_freq[THR_NEWA] = 4; - } - - sf->thresh_mult[THR_TM ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 5000; - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 4000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 4000; - } - } - - if (Speed > 5) - { - // Disable split MB intra prediction mode - sf->thresh_mult[THR_B_PRED] = INT_MAX; } if (Speed > 6) { - unsigned int i, sum = 0; + unsigned int sum = 0; unsigned int total_mbs = cm->MBs; - int thresh; - int total_skip; + int i, thresh; + unsigned int total_skip; int min = 2000; @@ -1122,109 +910,53 @@ void vp8_set_speed_features(VP8_COMP *cpi) if (thresh < 2000) thresh = 2000; - if (cpi->ref_frame_flags & VP8_LAST_FLAG) + if (ref_frames > 1) { - sf->thresh_mult[THR_NEWMV] = thresh; - sf->thresh_mult[THR_NEARESTMV ] = thresh >> 1; - sf->thresh_mult[THR_NEARMV ] = thresh >> 1; + sf->thresh_mult[THR_NEW1 ] = thresh; + sf->thresh_mult[THR_NEAREST1 ] = thresh >> 1; + sf->thresh_mult[THR_NEAR1 ] = thresh >> 1; } - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + if (ref_frames > 2) { - sf->thresh_mult[THR_NEWG] = thresh << 1; - sf->thresh_mult[THR_NEARESTG ] = thresh; - sf->thresh_mult[THR_NEARG ] = 
thresh; + sf->thresh_mult[THR_NEW2] = thresh << 1; + sf->thresh_mult[THR_NEAREST2 ] = thresh; + sf->thresh_mult[THR_NEAR2 ] = thresh; } - if (cpi->ref_frame_flags & VP8_ALT_FLAG) + if (ref_frames > 3) { - sf->thresh_mult[THR_NEWA] = thresh << 1; - sf->thresh_mult[THR_NEARESTA ] = thresh; - sf->thresh_mult[THR_NEARA ] = thresh; + sf->thresh_mult[THR_NEW3] = thresh << 1; + sf->thresh_mult[THR_NEAREST3 ] = thresh; + sf->thresh_mult[THR_NEAR3 ] = thresh; } - // Disable other intra prediction modes - sf->thresh_mult[THR_TM] = INT_MAX; - sf->thresh_mult[THR_V_PRED] = INT_MAX; - sf->thresh_mult[THR_H_PRED] = INT_MAX; - sf->improved_mv_pred = 0; } if (Speed > 8) - { sf->quarter_pixel_search = 0; - } - if (Speed > 9) + if(cm->version == 0) { - int Tmp = cpi->Speed - 8; - - if (Tmp > 4) - Tmp = 4; + cm->filter_type = NORMAL_LOOPFILTER; - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - cpi->mode_check_freq[THR_ZEROG] = 1 << (Tmp - 1); - cpi->mode_check_freq[THR_NEARESTG] = 1 << (Tmp - 1); - cpi->mode_check_freq[THR_NEARG] = 1 << Tmp; - cpi->mode_check_freq[THR_NEWG] = 1 << (Tmp + 1); - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - cpi->mode_check_freq[THR_ZEROA] = 1 << (Tmp - 1); - cpi->mode_check_freq[THR_NEARESTA] = 1 << (Tmp - 1); - cpi->mode_check_freq[THR_NEARA] = 1 << Tmp; - cpi->mode_check_freq[THR_NEWA] = 1 << (Tmp + 1); - } - - cpi->mode_check_freq[THR_NEWMV] = 1 << (Tmp - 1); + if (Speed >= 14) + cm->filter_type = SIMPLE_LOOPFILTER; } - - cm->filter_type = NORMAL_LOOPFILTER; - - if (Speed >= 14) + else + { cm->filter_type = SIMPLE_LOOPFILTER; + } + // This has a big hit on quality. Last resort if (Speed >= 15) - { - sf->half_pixel_search = 0; // This has a big hit on quality. Last resort - } + sf->half_pixel_search = 0; vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins)); }; /* switch */ - /* disable frame modes if flags not set */ - if (!(cpi->ref_frame_flags & VP8_LAST_FLAG)) - { - sf->thresh_mult[THR_NEWMV ] = INT_MAX; - sf->thresh_mult[THR_NEARESTMV] = INT_MAX; - sf->thresh_mult[THR_ZEROMV ] = INT_MAX; - sf->thresh_mult[THR_NEARMV ] = INT_MAX; - sf->thresh_mult[THR_SPLITMV ] = INT_MAX; - } - - if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG)) - { - sf->thresh_mult[THR_NEARESTG ] = INT_MAX; - sf->thresh_mult[THR_ZEROG ] = INT_MAX; - sf->thresh_mult[THR_NEARG ] = INT_MAX; - sf->thresh_mult[THR_NEWG ] = INT_MAX; - sf->thresh_mult[THR_SPLITG ] = INT_MAX; - } - - if (!(cpi->ref_frame_flags & VP8_ALT_FLAG)) - { - sf->thresh_mult[THR_NEARESTA ] = INT_MAX; - sf->thresh_mult[THR_ZEROA ] = INT_MAX; - sf->thresh_mult[THR_NEARA ] = INT_MAX; - sf->thresh_mult[THR_NEWA ] = INT_MAX; - sf->thresh_mult[THR_SPLITA ] = INT_MAX; - } - - // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. 
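
Just above, the Speed > 6 path now derives its thresholds from the error-bin census and the new ref_frames count instead of per-flag blocks. With the floor value thresh = 2000 that gives THR_NEW1 = 2000 and THR_NEAREST1 = THR_NEAR1 = 1000 on the first enabled reference, but THR_NEW2/3 = 4000 and THR_NEAREST2/3 = THR_NEAR2/3 = 2000 on the second and third, so candidate modes on the extra references are pruned roughly twice as hard. The long per-flag INT_MAX blocks deleted at the end of the function become unnecessary for the same reason: a disabled reference never enters ref_frame_map in pickinter.c, so its modes are skipped before any threshold is consulted.
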
if ( cpi->pass == 1 ) @@ -1306,6 +1038,9 @@ void vp8_set_speed_features(VP8_COMP *cpi) frames_at_speed[cpi->Speed]++; #endif } +#undef GOOD +#undef RT + static void alloc_raw_frame_buffers(VP8_COMP *cpi) { int width = (cpi->oxcf.Width + 15) & ~15; @@ -1365,7 +1100,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) height += 16 - (height & 0xf); - if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf, + if (vp8_yv12_alloc_frame_buffer(&cpi->pick_lf_lvl_frame, width, height, VP8BORDERINPIXELS)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); @@ -1406,25 +1141,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) vpx_calloc(sizeof(unsigned int), cm->mb_rows * cm->mb_cols)); -#if !(CONFIG_REALTIME_ONLY) - vpx_free(cpi->twopass.total_stats); - - cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); - - vpx_free(cpi->twopass.total_left_stats); - cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); - - vpx_free(cpi->twopass.this_frame_stats); - - cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); - - if( !cpi->twopass.total_stats || - !cpi->twopass.total_left_stats || - !cpi->twopass.this_frame_stats) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to allocate firstpass stats"); -#endif - #if CONFIG_MULTITHREAD if (width < 640) cpi->mt_sync_range = 1; @@ -1436,7 +1152,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) cpi->mt_sync_range = 16; #endif - vpx_free(cpi->tplist); + vpx_free(cpi->tplist); CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows)); } @@ -1470,8 +1186,8 @@ void vp8_new_frame_rate(VP8_COMP *cpi, double framerate) if(framerate < .1) framerate = 30; - cpi->oxcf.frame_rate = framerate; - cpi->output_frame_rate = cpi->oxcf.frame_rate; + cpi->frame_rate = framerate; + cpi->output_frame_rate = framerate; cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth; @@ -1513,9 +1229,8 @@ rescale(int val, int num, int denom) } -static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) +static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; cpi->oxcf = *oxcf; @@ -1527,8 +1242,20 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->version = oxcf->Version; vp8_setup_version(cm); + /* frame rate is not available on the first frame, as it's derived from + * the observed timestamps. The actual value used here doesn't matter + * too much, as it will adapt quickly. If the reciprocal of the timebase + * seems like a reasonable framerate, then use that as a guess, otherwise + * use 30. + */ + cpi->frame_rate = (double)(oxcf->timebase.den) / + (double)(oxcf->timebase.num); + + if (cpi->frame_rate > 180) + cpi->frame_rate = 30; + // change includes all joint functionality - vp8_change_config(ptr, oxcf); + vp8_change_config(cpi, oxcf); // Initialize active best and worst q and average q values. 
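
The timebase-reciprocal guess added above is worth a worked example: a timebase of 1/30 gives an initial frame_rate of 30, which looks plausible and is kept, while a 90 kHz media clock (timebase 1/90000) gives 90000, trips the > 180 cap, and falls back to 30 until observed timestamps take over. A hedged sketch of the heuristic (the function name is illustrative, not part of the patch):

    /* Guess an initial frame rate from the configured timebase; the
     * estimate is replaced once real timestamp deltas are observed. */
    static double guess_frame_rate(int num, int den)
    {
        double fps = (double)den / (double)num;
        return (fps > 180) ? 30 : fps;
    }
    /* guess_frame_rate(1, 30)    -> 30.0 (reciprocal looks like a rate)
     * guess_frame_rate(1, 90000) -> 30.0 (implausible, use the default) */
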
cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; @@ -1550,8 +1277,8 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) // Temporal scalabilty if (cpi->oxcf.number_of_layers > 1) { - int i; - int prev_layer_frame_rate=0; + unsigned int i; + double prev_layer_frame_rate=0; for (i=0; i<cpi->oxcf.number_of_layers; i++) { @@ -1619,9 +1346,8 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) } -void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) +void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; if (!cpi) @@ -1787,7 +1513,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->oxcf.target_bandwidth, 1000); // Set up frame rate and related parameters rate control values. - vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); + vp8_new_frame_rate(cpi, cpi->frame_rate); // Set absolute upper and lower quality limits cpi->worst_quality = cpi->oxcf.worst_allowed_q; @@ -1813,7 +1539,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->active_best_quality = cpi->oxcf.worst_allowed_q; } - cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE; + cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0; cpi->cq_target_quality = cpi->oxcf.cq_level; @@ -1912,19 +1638,14 @@ static void cal_mvsadcosts(int *mvsadcost[2]) while (++i <= mvfp_max); } -VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) +struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) { int i; - volatile union - { - VP8_COMP *cpi; - VP8_PTR ptr; - } ctx; VP8_COMP *cpi; VP8_COMMON *cm; - cpi = ctx.cpi = vpx_memalign(32, sizeof(VP8_COMP)); + cpi = vpx_memalign(32, sizeof(VP8_COMP)); // Check that the CPI instance is valid if (!cpi) return 0; @@ -1935,10 +1656,8 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) if (setjmp(cm->error.jmp)) { - VP8_PTR ptr = ctx.ptr; - - ctx.cpi->common.error.setjmp = 0; - vp8_remove_compressor(&ptr); + cpi->common.error.setjmp = 0; + vp8_remove_compressor(&cpi); return 0; } @@ -1949,7 +1668,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) vp8_create_common(&cpi->common); vp8_cmachine_specific_config(cpi); - init_config((VP8_PTR)cpi, oxcf); + init_config(cpi, oxcf); memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob)); cpi->common.current_video_frame = 0; @@ -2028,7 +1747,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->cyclic_refresh_map = (signed char *) NULL; // Test function for segmentation - //segmentation_test_function((VP8_PTR) cpi); + //segmentation_test_function( cpi); #ifdef ENTROPY_STATS init_context_counters(); @@ -2039,11 +1758,11 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->frames_since_key = 8; // Give a sensible default for the first frame. cpi->key_frame_frequency = cpi->oxcf.key_freq; - cpi->this_key_frame_forced = FALSE; - cpi->next_key_frame_forced = FALSE; + cpi->this_key_frame_forced = 0; + cpi->next_key_frame_forced = 0; - cpi->source_alt_ref_pending = FALSE; - cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_pending = 0; + cpi->source_alt_ref_active = 0; cpi->common.refresh_alt_ref_frame = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; @@ -2241,14 +1960,21 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) vp8_loop_filter_init(cm); cpi->common.error.setjmp = 0; - return (VP8_PTR) cpi; + +#if CONFIG_MULTI_RES_ENCODING + /* Calculate # of MBs in a row in lower-resolution level image. 
*/ + if (cpi->oxcf.mr_encoder_id > 0) + vp8_cal_low_res_mb_cols(cpi); +#endif + + return cpi; } -void vp8_remove_compressor(VP8_PTR *ptr) +void vp8_remove_compressor(VP8_COMP **ptr) { - VP8_COMP *cpi = (VP8_COMP *)(*ptr); + VP8_COMP *cpi = *ptr; if (!cpi) return; @@ -2408,7 +2134,7 @@ void vp8_remove_compressor(VP8_PTR *ptr) { extern int count_mb_seg[4]; FILE *f = fopen("modes.stt", "a"); - double dr = (double)cpi->oxcf.frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ; + double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ; fprintf(f, "intra_mode in Intra Frames:\n"); fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]); fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]); @@ -2659,20 +2385,16 @@ static void generate_psnr_packet(VP8_COMP *cpi) } -int vp8_use_as_reference(VP8_PTR ptr, int ref_frame_flags) +int vp8_use_as_reference(VP8_COMP *cpi, int ref_frame_flags) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - if (ref_frame_flags > 7) return -1 ; cpi->ref_frame_flags = ref_frame_flags; return 0; } -int vp8_update_reference(VP8_PTR ptr, int ref_frame_flags) +int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - if (ref_frame_flags > 7) return -1 ; @@ -2692,9 +2414,8 @@ int vp8_update_reference(VP8_PTR ptr, int ref_frame_flags) return 0; } -int vp8_get_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +int vp8_get_reference(VP8_COMP *cpi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; int ref_fb_idx; @@ -2711,9 +2432,8 @@ int vp8_get_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONF return 0; } -int vp8_set_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +int vp8_set_reference(VP8_COMP *cpi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; int ref_fb_idx; @@ -2731,9 +2451,8 @@ int vp8_set_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONF return 0; } -int vp8_update_entropy(VP8_PTR comp, int update) +int vp8_update_entropy(VP8_COMP *cpi, int update) { - VP8_COMP *cpi = (VP8_COMP *) comp; VP8_COMMON *cm = &cpi->common; cm->refresh_entropy_probs = update; @@ -2889,10 +2608,10 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi) cpi->common.frames_since_golden = 0; // Clear the alternate reference update pending flag. 
- cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; // Set the alternate refernce frame active flag - cpi->source_alt_ref_active = TRUE; + cpi->source_alt_ref_active = 1; } @@ -2955,12 +2674,12 @@ static void update_golden_frame_stats(VP8_COMP *cpi) if (cpi->oxcf.fixed_q >= 0 && cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) { - cpi->source_alt_ref_pending = TRUE; + cpi->source_alt_ref_pending = 1; cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; } if (!cpi->source_alt_ref_pending) - cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_active = 0; // Decrement count down till next gf if (cpi->frames_till_gf_update_due > 0) @@ -2994,8 +2713,7 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; -#if 0 - const int *const rfct = cpi->recent_ref_frame_usage; + const int *const rfct = cpi->count_mb_ref_frame_usage; const int rf_intra = rfct[INTRA_FRAME]; const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; @@ -3007,100 +2725,10 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) } else if (!(rf_intra + rf_inter)) { - // This is a trap in case this function is called with cpi->recent_ref_frame_usage[] blank. cpi->prob_intra_coded = 63; cpi->prob_last_coded = 128; cpi->prob_gf_coded = 128; } - else - { - cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter); - - if (cpi->prob_intra_coded < 1) - cpi->prob_intra_coded = 1; - - if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active) - { - cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - - if (cpi->prob_last_coded < 1) - cpi->prob_last_coded = 1; - - cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) - ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - - if (cpi->prob_gf_coded < 1) - cpi->prob_gf_coded = 1; - } - } - -#else - const int *const rfct = cpi->count_mb_ref_frame_usage; - const int rf_intra = rfct[INTRA_FRAME]; - const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; - - if (cm->frame_type == KEY_FRAME) - { - cpi->prob_intra_coded = 255; - cpi->prob_last_coded = 128; - cpi->prob_gf_coded = 128; - } - else if (!(rf_intra + rf_inter)) - { - if (cpi->oxcf.number_of_layers > 1) - { - if (cpi->ref_frame_flags == VP8_LAST_FLAG) - { - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 255; - cpi->prob_gf_coded = 128; - } - else if (cpi->ref_frame_flags == VP8_GOLD_FLAG) - { - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 1; - cpi->prob_gf_coded = 255; - } - else if (cpi->ref_frame_flags == VP8_ALT_FLAG) - { - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 1; - cpi->prob_gf_coded = 1; - } - else - { - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 128; - cpi->prob_gf_coded = 128; - } - } - else - { - // This is a trap in case this function is called with - // cpi->recent_ref_frame_usage[] blank. - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 128; - cpi->prob_gf_coded = 128; - } - } - else - { - cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter); - - if (cpi->prob_intra_coded < 1) - cpi->prob_intra_coded = 1; - - cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - - if (cpi->prob_last_coded < 1) - cpi->prob_last_coded = 1; - - cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) - ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - - if (cpi->prob_gf_coded < 1) - cpi->prob_gf_coded = 1; - } // update reference frame costs since we can do better than what we got last frame. if (cpi->oxcf.number_of_layers == 1) @@ -3114,7 +2742,6 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) else if (cpi->common.frames_since_golden == 0) { cpi->prob_last_coded = 214; - cpi->prob_gf_coded = 1; } else if (cpi->common.frames_since_golden == 1) { @@ -3123,14 +2750,14 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) } else if (cpi->source_alt_ref_active) { - //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden; cpi->prob_gf_coded -= 20; if (cpi->prob_gf_coded < 10) cpi->prob_gf_coded = 10; } + if (!cpi->source_alt_ref_active) + cpi->prob_gf_coded = 255; } -#endif } @@ -3139,12 +2766,12 @@ static int decide_key_frame(VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; - int code_key_frame = FALSE; + int code_key_frame = 0; cpi->kf_boost = 0; if (cpi->Speed > 11) - return FALSE; + return 0; // Clear down mmx registers vp8_clear_system_state(); //__asm emms; @@ -3186,10 +2813,10 @@ static int decide_key_frame(VP8_COMP *cpi) && (change > .25 || change2 > .25)) { /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/ - return TRUE; + return 1; } - return FALSE; + return 0; } @@ -3199,7 +2826,7 @@ static int decide_key_frame(VP8_COMP *cpi) ((cpi->this_frame_percent_intra > 95) && (cpi->this_frame_percent_intra >= (cpi->last_frame_percent_intra + 5)))) { - code_key_frame = TRUE; + code_key_frame = 1; } // in addition if the following are true and this is not a golden frame then code a key frame // Note that on golden frames there often seems to be a pop in intra useage anyway hence this @@ -3212,7 +2839,7 @@ static int decide_key_frame(VP8_COMP *cpi) (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 10)))) { if (!cm->refresh_golden_frame) - code_key_frame = TRUE; + code_key_frame = 1; } return code_key_frame; @@ -3268,11 +2895,11 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) // Function to test for conditions that indeicate we should loop // back and recode a frame. 
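
Net effect of the update_rd_ref_frame_probs cleanup above: the dead #if 0 copy and the duplicated live copy collapse into one, the usage-count normalization and the temporal-layers special cases are dropped, and the probabilities now carry over between frames, adjusted only by the fixed post-golden heuristics plus the new rule that prob_gf_coded saturates to 255 whenever no alt ref is active (which is also why the old prob_gf_coded = 1 assignment goes away). For scale, these are 8-bit bool-coder probabilities out of 256, so the fixed prob_last_coded = 214 makes the likely branch cost roughly a quarter of a bit. A small illustration of that cost (interpretation of which branch is "likely" is hedged, not taken from the patch):

    #include <math.h>
    #include <stdio.h>

    /* Approximate per-flag bit cost for an 8-bit probability p/256;
     * p = 214 is the fixed prob_last_coded used right after a golden
     * frame update in the surviving code. */
    int main(void)
    {
        double p = 214.0 / 256.0;
        printf("likely branch: %.2f bits, unlikely: %.2f bits\n",
               -log2(p), -log2(1.0 - p));   /* ~0.26 and ~2.61 bits */
        return 0;
    }
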
-static BOOL recode_loop_test( VP8_COMP *cpi, +static int recode_loop_test( VP8_COMP *cpi, int high_limit, int low_limit, int q, int maxq, int minq ) { - BOOL force_recode = FALSE; + int force_recode = 0; VP8_COMMON *cm = &cpi->common; // Is frame recode allowed at all @@ -3288,7 +2915,7 @@ static BOOL recode_loop_test( VP8_COMP *cpi, if ( ((cpi->projected_frame_size > high_limit) && (q < maxq)) || ((cpi->projected_frame_size < low_limit) && (q > minq)) ) { - force_recode = TRUE; + force_recode = 1; } // Special Constrained quality tests else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) @@ -3298,14 +2925,14 @@ static BOOL recode_loop_test( VP8_COMP *cpi, (cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3))) { - force_recode = TRUE; + force_recode = 1; } // Severe undershoot and between auto and user cq level else if ( (q > cpi->oxcf.cq_level) && (cpi->projected_frame_size < cpi->min_frame_bandwidth) && (cpi->active_best_quality > cpi->oxcf.cq_level)) { - force_recode = TRUE; + force_recode = 1; cpi->active_best_quality = cpi->oxcf.cq_level; } } @@ -3456,7 +3083,7 @@ static void encode_frame_to_data_rate int frame_over_shoot_limit; int frame_under_shoot_limit; - int Loop = FALSE; + int Loop = 0; int loop_count; int this_q; int last_zbin_oq; @@ -3468,10 +3095,10 @@ static void encode_frame_to_data_rate int top_index; int bottom_index; VP8_COMMON *cm = &cpi->common; - int active_worst_qchanged = FALSE; + int active_worst_qchanged = 0; - int overshoot_seen = FALSE; - int undershoot_seen = FALSE; + int overshoot_seen = 0; + int undershoot_seen = 0; int drop_mark = cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100; int drop_mark75 = drop_mark * 2 / 3; int drop_mark50 = drop_mark / 4; @@ -3482,7 +3109,7 @@ static void encode_frame_to_data_rate vp8_clear_system_state(); // Test code for segmentation of gf/arf (0,0) - //segmentation_test_function((VP8_PTR) cpi); + //segmentation_test_function( cpi); if (cpi->compressor_speed == 2) { @@ -3522,12 +3149,12 @@ static void encode_frame_to_data_rate // Enable or disable mode based tweaking of the zbin // For 2 Pass Only used where GF/ARF prediction quality // is above a threshold - cpi->zbin_mode_boost_enabled = TRUE; + cpi->zbin_mode_boost_enabled = 1; if (cpi->pass == 2) { if ( cpi->gfu_boost <= 400 ) { - cpi->zbin_mode_boost_enabled = FALSE; + cpi->zbin_mode_boost_enabled = 0; } } @@ -3568,7 +3195,7 @@ static void encode_frame_to_data_rate } // The alternate reference frame cannot be active for a key frame - cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_active = 0; // Reset the RD threshold multipliers to default of * 1 (128) for (i = 0; i < MAX_MODES; i++) @@ -3580,9 +3207,9 @@ static void encode_frame_to_data_rate // Test code for segmentation //if ( (cm->frame_type == KEY_FRAME) || ((cm->current_video_frame % 2) == 0)) //if ( (cm->current_video_frame % 2) == 0 ) - // enable_segmentation((VP8_PTR)cpi); + // enable_segmentation(cpi); //else - // disable_segmentation((VP8_PTR)cpi); + // disable_segmentation(cpi); #if 0 // Experimental code for lagged compress and one pass @@ -3676,7 +3303,7 @@ static void encode_frame_to_data_rate if (cpi->oxcf.number_of_layers > 1) { - int i; + unsigned int i; // Propagate bits saved by dropping the frame to higher layers for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++) @@ -4080,7 +3707,7 @@ static void encode_frame_to_data_rate vp8_pick_frame_size(cpi); // Clear the Alt reference frame active flag when we have a key frame - 
cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_active = 0; // Reset the loop filter deltas and segmentation map setup_features(cpi); @@ -4105,7 +3732,7 @@ static void encode_frame_to_data_rate q_high = cpi->active_worst_quality; loop_count++; - Loop = TRUE; + Loop = 1; continue; } @@ -4133,10 +3760,10 @@ static void encode_frame_to_data_rate } // If we have updated the active max Q do not call vp8_update_rate_correction_factors() this loop. - active_worst_qchanged = TRUE; + active_worst_qchanged = 1; } else - active_worst_qchanged = FALSE; + active_worst_qchanged = 0; #if !(CONFIG_REALTIME_ONLY) // Special case handling for forced key frames @@ -4172,7 +3799,7 @@ static void encode_frame_to_data_rate else if (Q < q_low) Q = q_low; - Loop = ((Q != last_q)) ? TRUE : FALSE; + Loop = Q != last_q; } // Is the projected frame size out of range and are we allowed to attempt to recode. @@ -4229,7 +3856,7 @@ static void encode_frame_to_data_rate } } - overshoot_seen = TRUE; + overshoot_seen = 1; } // Frame is too small else @@ -4279,7 +3906,7 @@ static void encode_frame_to_data_rate } } - undershoot_seen = TRUE; + undershoot_seen = 1; } // Clamp Q to upper and lower limits: @@ -4291,18 +3918,18 @@ static void encode_frame_to_data_rate // Clamp cpi->zbin_over_quant cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? zbin_oq_high : cpi->zbin_over_quant; - //Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE; - Loop = ((Q != last_q)) ? TRUE : FALSE; + //Loop = (Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant); + Loop = Q != last_q; last_zbin_oq = cpi->zbin_over_quant; } else #endif - Loop = FALSE; + Loop = 0; if (cpi->is_src_frame_alt_ref) - Loop = FALSE; + Loop = 0; - if (Loop == TRUE) + if (Loop == 1) { vp8_restore_coding_context(cpi); loop_count++; @@ -4311,7 +3938,7 @@ static void encode_frame_to_data_rate #endif } } - while (Loop == TRUE); + while (Loop == 1); #if 0 // Experimental code for lagged and one pass @@ -4345,13 +3972,20 @@ static void encode_frame_to_data_rate IF_RTCD(&cpi->rtcd.variance)); } - // This frame's MVs are saved and will be used in next frame's MV prediction. - // Last frame has one more line(add to bottom) and one more column(add to right) than cm->mip. The edge elements are initialized to 0. - if(cm->show_frame) //do not save for altref frame + /* This frame's MVs are saved and will be used in next frame's MV predictor. + * Last frame has one more line(add to bottom) and one more column(add to + * right) than cm->mip. The edge elements are initialized to 0. + */ +#if CONFIG_MULTI_RES_ENCODING + if(!cpi->oxcf.mr_encoder_id && cm->show_frame) +#else + if(cm->show_frame) /* do not save for altref frame */ +#endif { int mb_row; int mb_col; - MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays. + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip; if(cm->frame_type != KEY_FRAME) { @@ -4370,6 +4004,10 @@ static void encode_frame_to_data_rate } } +#if CONFIG_MULTI_RES_ENCODING + vp8_cal_dissimilarity(cpi); +#endif + // Update the GF useage maps. // This is done after completing the compression of a frame when all // modes etc. 
are finalized but before loop filter @@ -4442,7 +4080,7 @@ static void encode_frame_to_data_rate if (cpi->oxcf.number_of_layers > 1) { - int i; + unsigned int i; for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++) cpi->layer_context[i].total_byte_count += (*size); } @@ -4509,7 +4147,7 @@ static void encode_frame_to_data_rate (cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) && (cpi->projected_frame_size > (4 * cpi->this_frame_target))) { - cpi->drop_frame = TRUE; + cpi->drop_frame = 1; } #endif @@ -4553,7 +4191,7 @@ static void encode_frame_to_data_rate // Propagate values to higher temporal layers if (cpi->oxcf.number_of_layers > 1) { - int i; + unsigned int i; for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++) { @@ -4856,7 +4494,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, { double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *cpi->oxcf.two_pass_vbrmin_section / 100); - cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate); + cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate); } } #endif @@ -4868,12 +4506,11 @@ extern void vp8_pop_neon(int64_t *store); #endif -int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) +int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { #if HAVE_ARMV7 int64_t store_reg[8]; #endif - VP8_COMP *cpi = (VP8_COMP *) ptr; VP8_COMMON *cm = &cpi->common; struct vpx_usec_timer timer; int res = 0; @@ -4922,13 +4559,12 @@ static int frame_is_reference(const VP8_COMP *cpi) } -int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush) +int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush) { #if HAVE_ARMV7 int64_t store_reg[8]; #endif - VP8_COMP *cpi = (VP8_COMP *) ptr; - VP8_COMMON *cm = &cpi->common; + VP8_COMMON *cm; struct vpx_usec_timer tsctimer; struct vpx_usec_timer ticktimer; struct vpx_usec_timer cmptimer; @@ -4937,12 +4573,14 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if (!cpi) return -1; - if (setjmp(cpi->common.error.jmp)){ + cm = &cpi->common; + + if (setjmp(cpi->common.error.jmp)) + { cpi->common.error.setjmp = 0; return VPX_CODEC_CORRUPT_FRAME; } - cpi->bc.error = &cpi->common.error; cpi->common.error.setjmp = 1; #if HAVE_ARMV7 @@ -4979,7 +4617,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cm->refresh_golden_frame = 0; cm->refresh_last_frame = 0; cm->show_frame = 0; - cpi->source_alt_ref_pending = FALSE; // Clear Pending alt Ref flag. + cpi->source_alt_ref_pending = 0; // Clear Pending alt Ref flag. 
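
The multi-res gating above divides the work between resolution levels: only the lowest-resolution encoder (mr_encoder_id 0) keeps the temporal MV store for next-frame prediction, while vp8_cal_dissimilarity() contributes the dissim field of the per-MB record that higher-resolution encoders read back in pickinter.c. A sketch of that record, with field names following the LOWER_RES_INFO accesses later in this patch and types assumed by the sketch:

    /* Per-MB result a lower-resolution encode leaves for the next level
     * up; exact layout lives outside this diff. */
    typedef struct
    {
        int ref_frame;                 /* reference chosen for the parent MB */
        int mode;                      /* prediction mode of the parent MB */
        int dissim;                    /* parent-MB dissimilarity score */
        struct { short row, col; } mv; /* parent MV, stored in 1/8-pel units
                                        * (note the >>3 full-pel conversions) */
    } lower_res_info_sketch;
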
cpi->is_src_frame_alt_ref = 0; } } @@ -5092,7 +4730,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if(interval > 10000000.0) interval = 10000000; - avg_duration = 10000000.0 / cpi->oxcf.frame_rate; + avg_duration = 10000000.0 / cpi->frame_rate; avg_duration *= (interval - avg_duration + this_duration); avg_duration /= interval; @@ -5200,6 +4838,17 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc)); } + // Save the contexts separately for alt ref, gold and last. + // (TODO jbb -> Optimize this with pointers to avoid extra copies. ) + if(cm->refresh_alt_ref_frame) + vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc)); + + if(cm->refresh_golden_frame) + vpx_memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc)); + + if(cm->refresh_last_frame) + vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc)); + // if its a dropped frame honor the requests on subsequent frames if (*size > 0) { @@ -5400,10 +5049,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon return 0; } -int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags) +int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags) { - VP8_COMP *cpi = (VP8_COMP *) comp; - if (cpi->common.refresh_alt_ref_frame) return -1; else @@ -5432,9 +5079,8 @@ int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflag } } -int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]) +int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]) { - VP8_COMP *cpi = (VP8_COMP *) comp; signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols) @@ -5442,15 +5088,15 @@ int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned if (!map) { - disable_segmentation((VP8_PTR)cpi); + disable_segmentation(cpi); return 0; } // Set the segmentation Map - set_segmentation_map((VP8_PTR)cpi, map); + set_segmentation_map(cpi, map); // Activate segmentation. 
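
Two additions in this region reward a closer look. The rolling duration average in vp8_get_compressed_data is a proportional-gain update: avg *= (interval - avg + this_duration); avg /= interval is algebraically avg += avg * (this_duration - avg) / interval, so the estimate drifts toward each observed frame duration with gain avg/interval. And the new lfc_n/lfc_a/lfc_g copies snapshot the running entropy context into whichever per-reference slot the current frame refreshes (the in-tree TODO already flags the memcpy cost). A sketch of the average with concrete numbers (function name illustrative; units are the code's own 10,000,000-ticks-per-second scale):

    /* Rolling average of observed frame durations, as in the hunk above. */
    static double update_avg_duration(double avg, double this_duration,
                                      double interval)
    {
        if (interval > 10000000.0)
            interval = 10000000.0;
        avg *= (interval - avg + this_duration);
        avg /= interval;
        return avg;
    }
    /* At 30 fps, avg starts at 10000000/30 ~= 333333 ticks. One slow frame:
     * update_avg_duration(333333, 400000, 10000000) ~= 335555, i.e. the
     * estimate moves toward the new duration with gain avg/interval, ~1/30. */
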
- enable_segmentation((VP8_PTR)cpi); + enable_segmentation(cpi); // Set up the quant segment data feature_data[MB_LVL_ALT_Q][0] = delta_q[0]; @@ -5471,15 +5117,13 @@ int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned // Initialise the feature data structure // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 - set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); + set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); return 0; } -int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols) +int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols) { - VP8_COMP *cpi = (VP8_COMP *) comp; - if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { if (map) @@ -5499,10 +5143,8 @@ int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsi } } -int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode) +int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode) { - VP8_COMP *cpi = (VP8_COMP *) comp; - if (horiz_mode <= ONETWO) cpi->common.horiz_scale = horiz_mode; else @@ -5544,8 +5186,7 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const } -int vp8_get_quantizer(VP8_PTR c) +int vp8_get_quantizer(VP8_COMP *cpi) { - VP8_COMP *cpi = (VP8_COMP *) c; return cpi->common.base_qindex; } diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index a0828a479..46951e3b9 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -58,6 +58,9 @@ #define MAX_PERIODICITY 16 +#define MAX(x,y) (((x)>(y))?(x):(y)) +#define MIN(x,y) (((x)<(y))?(x):(y)) + typedef struct { int kf_indicated; @@ -133,32 +136,32 @@ typedef struct typedef enum { - THR_ZEROMV = 0, + THR_ZERO1 = 0, THR_DC = 1, - THR_NEARESTMV = 2, - THR_NEARMV = 3, + THR_NEAREST1 = 2, + THR_NEAR1 = 3, - THR_ZEROG = 4, - THR_NEARESTG = 5, + THR_ZERO2 = 4, + THR_NEAREST2 = 5, - THR_ZEROA = 6, - THR_NEARESTA = 7, + THR_ZERO3 = 6, + THR_NEAREST3 = 7, - THR_NEARG = 8, - THR_NEARA = 9, + THR_NEAR2 = 8, + THR_NEAR3 = 9, THR_V_PRED = 10, THR_H_PRED = 11, THR_TM = 12, - THR_NEWMV = 13, - THR_NEWG = 14, - THR_NEWA = 15, + THR_NEW1 = 13, + THR_NEW2 = 14, + THR_NEW3 = 15, - THR_SPLITMV = 16, - THR_SPLITG = 17, - THR_SPLITA = 18, + THR_SPLIT1 = 16, + THR_SPLIT2 = 17, + THR_SPLIT3 = 18, THR_B_PRED = 19, } @@ -256,7 +259,7 @@ typedef struct int buffer_level; int bits_off_target; - long long total_actual_bits; + int64_t total_actual_bits; int total_target_vs_actual; int worst_quality; @@ -276,7 +279,7 @@ typedef struct int zbin_over_quant; int inter_frame_target; - INT64 total_byte_count; + int64_t total_byte_count; int filter_level; @@ -314,8 +317,7 @@ typedef struct VP8_COMP MACROBLOCK mb; VP8_COMMON common; - vp8_writer bc, bc2; - // bool_writer *bc2; + vp8_writer bc[9]; // one boolcoder for each partition VP8_CONFIG oxcf; @@ -337,7 +339,7 @@ typedef struct VP8_COMP int gold_is_alt; // don't do both alt and gold search ( just do gold). 
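
The THR_* renaming just below pairs with the ref_frame_map indirection added in pickinter.c later in this patch: thresholds numbered 1/2/3 now belong to the first/second/third enabled reference rather than to LAST/GOLDEN/ALTREF specifically. (The bc[9] array generalizes in the same spirit: one bool coder for the header plus up to eight token partitions.) A self-contained illustration of the slot assignment, with flag names and values local to the sketch:

    #include <stdio.h>

    enum { INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };
    #define LAST_FLAG 1
    #define GOLD_FLAG 2
    #define ALT_FLAG  4

    /* Fill the search-priority map the way vp8_pick_inter_mode now does:
     * slot 0 is intra, slots 1..3 are the enabled references in order. */
    static void fill_ref_frame_map(int flags, int map[4])
    {
        int i = 0;
        map[i++] = INTRA_FRAME;
        if (flags & LAST_FLAG) map[i++] = LAST_FRAME;
        if (flags & GOLD_FLAG) map[i++] = GOLDEN_FRAME;
        if (flags & ALT_FLAG)  map[i++] = ALTREF_FRAME;
        while (i < 4) map[i++] = -1;
    }

    int main(void)
    {
        int map[4];
        fill_ref_frame_map(GOLD_FLAG, map);   /* only golden enabled */
        printf("%d %d %d %d\n", map[0], map[1], map[2], map[3]); /* 0 2 -1 -1 */
        /* So THR_ZERO1/THR_NEAREST1/THR_NEAR1 govern GOLDEN_FRAME here. */
        return 0;
    }
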
//int refresh_alt_ref_frame; - YV12_BUFFER_CONFIG last_frame_uf; + YV12_BUFFER_CONFIG pick_lf_lvl_frame; TOKENEXTRA *tok; unsigned int tok_count; @@ -418,6 +420,7 @@ typedef struct VP8_COMP int buffered_mode; + double frame_rate; int64_t buffer_level; int bits_off_target; @@ -565,16 +568,21 @@ typedef struct VP8_COMP int base_skip_false_prob[128]; + FRAME_CONTEXT lfc_n; /* last frame entropy */ + FRAME_CONTEXT lfc_a; /* last alt ref entropy */ + FRAME_CONTEXT lfc_g; /* last gold ref entropy */ + + struct twopass_rc { unsigned int section_intra_rating; double section_max_qfactor; unsigned int next_iiratio; unsigned int this_iiratio; - FIRSTPASS_STATS *total_stats; - FIRSTPASS_STATS *this_frame_stats; + FIRSTPASS_STATS total_stats; + FIRSTPASS_STATS this_frame_stats; FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start; - FIRSTPASS_STATS *total_left_stats; + FIRSTPASS_STATS total_left_stats; int first_pass_done; int64_t bits_left; int64_t clip_bits_total; @@ -665,8 +673,8 @@ typedef struct VP8_COMP unsigned int current_layer; LAYER_CONTEXT layer_context[MAX_LAYERS]; - long long frames_in_layer[MAX_LAYERS]; - long long bytes_in_layer[MAX_LAYERS]; + int64_t frames_in_layer[MAX_LAYERS]; + int64_t bytes_in_layer[MAX_LAYERS]; double sum_psnr[MAX_LAYERS]; double sum_psnr_p[MAX_LAYERS]; double total_error2[MAX_LAYERS]; @@ -679,6 +687,11 @@ typedef struct VP8_COMP double total_ssimg_v_in_layer[MAX_LAYERS]; double total_ssimg_all_in_layer[MAX_LAYERS]; +#if CONFIG_MULTI_RES_ENCODING + /* Number of MBs per row at lower-resolution level */ + int mr_low_res_mb_cols; +#endif + } VP8_COMP; void control_data_rate(VP8_COMP *cpi); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 4d8734137..46f53a18d 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -21,7 +21,6 @@ #include "vp8/common/reconinter.h" #include "vp8/common/reconintra.h" #include "vp8/common/reconintra4x4.h" -#include "vp8/common/g_common.h" #include "variance.h" #include "mcomp.h" #include "rdopt.h" @@ -39,7 +38,7 @@ extern int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd); extern unsigned int cnt_pm; #endif -extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES]; +extern const int vp8_ref_frame_order[MAX_MODES]; extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); @@ -402,9 +401,68 @@ static void update_mvcount(VP8_COMP *cpi, MACROBLOCKD *xd, int_mv *best_ref_mv) } } + +#if CONFIG_MULTI_RES_ENCODING +static +void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, int *dissim, + int *parent_ref_frame, + MB_PREDICTION_MODE *parent_mode, + int_mv *parent_ref_mv, int mb_row, int mb_col) +{ + LOWER_RES_INFO* store_mode_info + = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info; + unsigned int parent_mb_index; + //unsigned int parent_mb_index = map_640x480_to_320x240[mb_row][mb_col]; + + /* Consider different down_sampling_factor. */ + { + /* TODO: Removed the loop that supports special down_sampling_factor + * such as 2, 4, 8. Will revisit it if needed. + * Should also try using a look-up table to see if it helps + * performance. 
*/ + int round = cpi->oxcf.mr_down_sampling_factor.num/2; + int parent_mb_row, parent_mb_col; + + parent_mb_row = (mb_row*cpi->oxcf.mr_down_sampling_factor.den+round) + /cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_col = (mb_col*cpi->oxcf.mr_down_sampling_factor.den+round) + /cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_index = parent_mb_row*cpi->mr_low_res_mb_cols + parent_mb_col; + } + + /* Read lower-resolution mode & motion result from memory.*/ + *parent_ref_frame = store_mode_info[parent_mb_index].ref_frame; + *parent_mode = store_mode_info[parent_mb_index].mode; + *dissim = store_mode_info[parent_mb_index].dissim; + + /* For highest-resolution encoder, adjust dissim value. Lower its quality + * for good performance. */ + if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1)) + *dissim>>=1; + + if(*parent_ref_frame != INTRA_FRAME) + { + /* Consider different down_sampling_factor. + * The result can be rounded to be more precise, but it takes more time. + */ + //int round = cpi->oxcf.mr_down_sampling_factor.den/2; + (*parent_ref_mv).as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row + *cpi->oxcf.mr_down_sampling_factor.num + /cpi->oxcf.mr_down_sampling_factor.den; + (*parent_ref_mv).as_mv.col = store_mode_info[parent_mb_index].mv.as_mv.col + *cpi->oxcf.mr_down_sampling_factor.num + /cpi->oxcf.mr_down_sampling_factor.den; + + vp8_clamp_mv2(parent_ref_mv, xd); + } +} +#endif + + void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, - int *returndistortion, int *returnintra) + int *returndistortion, int *returnintra, int mb_row, + int mb_col) { BLOCK *b = &x->block[0]; BLOCKD *d = &x->e_mbd.block[0]; @@ -422,34 +480,67 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rate; int rate2; int distortion2; - int bestsme; - //int all_rds[MAX_MODES]; // Experimental debug code. + int bestsme = INT_MAX; int best_mode_index = 0; unsigned int sse = INT_MAX, best_sse = INT_MAX; int_mv mvp; + int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int saddone=0; int sr=0; //search range got from mv_pred(). It uses step_param levels. (0-7) - int_mv nearest_mv[4]; - int_mv near_mv[4]; - int_mv frame_best_ref_mv[4]; - int MDCounts[4][4]; unsigned char *y_buffer[4]; unsigned char *u_buffer[4]; unsigned char *v_buffer[4]; + int i; + int ref_frame_map[4]; + int sign_bias = 0; - int skip_mode[4] = {0, 0, 0, 0}; - int found_near_mvs[4] = {0, 0, 0, 0}; + int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode, + when Speed >= 15, no sub-pixel search. */ - int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode, when Speed >= 15, no sub-pixel search. 
*/ +#if CONFIG_MULTI_RES_ENCODING + int dissim = INT_MAX; + int parent_ref_frame = 0; + int_mv parent_ref_mv; + MB_PREDICTION_MODE parent_mode = 0; + + if (cpi->oxcf.mr_encoder_id) + get_lower_res_motion_info(cpi, xd, &dissim, &parent_ref_frame, + &parent_mode, &parent_ref_mv, mb_row, mb_col); +#endif vpx_memset(mode_mv, 0, sizeof(mode_mv)); - vpx_memset(nearest_mv, 0, sizeof(nearest_mv)); - vpx_memset(near_mv, 0, sizeof(near_mv)); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); + /* Setup search priorities */ + i=0; + ref_frame_map[i++] = INTRA_FRAME; + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + ref_frame_map[i++] = LAST_FRAME; + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + ref_frame_map[i++] = GOLDEN_FRAME; + if (cpi->ref_frame_flags & VP8_ALT_FLAG) // &&(cpi->source_alt_ref_active || cpi->oxcf.number_of_layers > 1) + ref_frame_map[i++] = ALTREF_FRAME; + for(; i<4; i++) + ref_frame_map[i] = -1; + + /* Check to see if there is at least 1 valid reference frame that we need + * to calculate near_mvs. + */ + if (ref_frame_map[1] > 0) + { + vp8_find_near_mvs(&x->e_mbd, + x->e_mbd.mode_info_context, + &mode_mv[NEARESTMV], &mode_mv[NEARMV], + &best_ref_mv, + mdcounts, + ref_frame_map[1], + cpi->common.ref_frame_sign_bias); + + sign_bias = cpi->common.ref_frame_sign_bias[ref_frame_map[1]]; + } // set up all the refframe dependent pointers. if (cpi->ref_frame_flags & VP8_LAST_FLAG) @@ -459,8 +550,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset; v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset; } - else - skip_mode[LAST_FRAME] = 1; if (cpi->ref_frame_flags & VP8_GOLD_FLAG) { @@ -469,21 +558,16 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset; v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset; } - else - skip_mode[GOLDEN_FRAME] = 1; - if ((cpi->ref_frame_flags & VP8_ALT_FLAG) && - (cpi->source_alt_ref_active || cpi->oxcf.number_of_layers > 1)) + if (cpi->ref_frame_flags & VP8_ALT_FLAG) { YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx]; y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset; u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset; v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset; } - else - skip_mode[ALTREF_FRAME] = 1; - cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame + cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame *returnintra = INT_MAX; x->skip = 0; @@ -496,50 +580,98 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, { int frame_cost; int this_rd = INT_MAX; + int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]]; if (best_rd <= cpi->rd_threshes[mode_index]) continue; - x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index]; - - if (skip_mode[x->e_mbd.mode_info_context->mbmi.ref_frame]) + if (this_ref_frame < 0) continue; - // Check to see if the testing frequency for this mode is at its max - // If so then prevent it from being tested and increase the threshold for its testing - if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1)) + x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame; + +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id) + { + /* If parent MB is intra, child MB is intra. 
*/ + if (!parent_ref_frame && this_ref_frame) + continue; + + /* If parent MB is inter, and it is unlikely there are multiple + * objects in parent MB, we use parent ref frame as child MB's + * ref frame. */ + if (parent_ref_frame && dissim < 8 + && parent_ref_frame != this_ref_frame) + continue; + } +#endif + + // everything but intra + if (x->e_mbd.mode_info_context->mbmi.ref_frame) + { + x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + + if (sign_bias != + cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame]) + { + mode_mv[NEARESTMV].as_mv.row *= -1; + mode_mv[NEARESTMV].as_mv.col *= -1; + mode_mv[NEARMV].as_mv.row *= -1; + mode_mv[NEARMV].as_mv.col *= -1; + best_ref_mv.as_mv.row *= -1; + best_ref_mv.as_mv.col *= -1; + sign_bias + = cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame]; + } + +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id) + { + if (vp8_mode_order[mode_index] == NEARESTMV && + mode_mv[NEARESTMV].as_int ==0) + continue; + if (vp8_mode_order[mode_index] == NEARMV && + mode_mv[NEARMV].as_int ==0) + continue; + + if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV + && best_ref_mv.as_int==0) //&& dissim==0 + continue; + else if(vp8_mode_order[mode_index] == NEWMV && dissim==0 + && best_ref_mv.as_int==parent_ref_mv.as_int) + continue; + } +#endif + } + + /* Check to see if the testing frequency for this mode is at its max + * If so then prevent it from being tested and increase the threshold + * for its testing */ + if (cpi->mode_test_hit_counts[mode_index] && + (cpi->mode_check_freq[mode_index] > 1)) { - //if ( (cpi->mbs_tested_so_far / cpi->mode_test_hit_counts[mode_index]) <= cpi->mode_check_freq[mode_index] ) - if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])) + if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * + cpi->mode_test_hit_counts[mode_index])) { - // Increase the threshold for coding this mode to make it less likely to be chosen + /* Increase the threshold for coding this mode to make it less + * likely to be chosen */ cpi->rd_thresh_mult[mode_index] += 4; if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; - cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; - + cpi->rd_threshes[mode_index] = + (cpi->rd_baseline_thresh[mode_index] >> 7) * + cpi->rd_thresh_mult[mode_index]; continue; } } - // If nearby MVs haven't been found for this reference frame then do it now. 
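
The multi-res hooks above act as a pruning cascade: an intra parent forces the child MB to stay intra, dissim < 8 restricts the child to the parent's reference frame, and low dissimilarity raises step_param so the motion search stays tight around the inherited predictor. The parent lookup in get_lower_res_motion_info rounds to the nearest lower-resolution MB and scales the MV back up by the sampling ratio. Worked sketch, assuming a 2:1 downsample (the factor is illustrative, not fixed by the patch):

    /* Parent-MB index and MV scaling as used above. */
    struct factor { int num, den; };

    static int parent_mb_coord(int child, struct factor f)
    {
        int round = f.num / 2;               /* round to nearest parent MB */
        return (child * f.den + round) / f.num;
    }

    static int upscale_mv(int v, struct factor f)
    {
        return v * f.num / f.den;            /* truncating integer upscale */
    }
    /* parent_mb_coord(5, (struct factor){2, 1}) == 3
     * upscale_mv(-13,  (struct factor){2, 1}) == -26, before vp8_clamp_mv2() */
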
- if (x->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME && - !found_near_mvs[x->e_mbd.mode_info_context->mbmi.ref_frame]) - { - int ref_frame = x->e_mbd.mode_info_context->mbmi.ref_frame; - vp8_find_near_mvs(&x->e_mbd, - x->e_mbd.mode_info_context, - &nearest_mv[ref_frame], &near_mv[ref_frame], - &frame_best_ref_mv[ref_frame], - MDCounts[ref_frame], - ref_frame, - cpi->common.ref_frame_sign_bias); - found_near_mvs[ref_frame] = 1; - } - - // We have now reached the point where we are going to test the current mode so increment the counter for the number of times it has been tested + /* We have now reached the point where we are going to test the current + * mode so increment the counter for the number of times it has been + * tested */ cpi->mode_test_hit_counts[mode_index] ++; rate2 = 0; @@ -547,42 +679,28 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, this_mode = vp8_mode_order[mode_index]; - // Experimental debug code. - //all_rds[mode_index] = -1; - x->e_mbd.mode_info_context->mbmi.mode = this_mode; x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; - // Work out the cost assosciated with selecting the reference frame + /* Work out the cost assosciated with selecting the reference frame */ frame_cost = x->e_mbd.ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame]; rate2 += frame_cost; - // everything but intra - if (x->e_mbd.mode_info_context->mbmi.ref_frame) - { - x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; - x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; - x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; - mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts)); - } - - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, - // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative + /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + * unless ARNR filtering is enabled in which case we want + * an unfiltered alternative */ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME) + if (this_mode != ZEROMV || + x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME) continue; } switch (this_mode) { case B_PRED: - // Pass best so far to pick_intra4x4mby_modes to use as breakout + /* Pass best so far to pick_intra4x4mby_modes to use as breakout */ distortion2 = best_sse; pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2); @@ -641,10 +759,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int sadpb = x->sadperbit16; int_mv mvp_full; - int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7)?1:0); - int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7)?1:0); - int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL; - int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL; + int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL; + int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL; + int col_max = (best_ref_mv.as_mv.col>>3) + + MAX_FULL_PEL_VAL; + int row_max = (best_ref_mv.as_mv.row>>3) + + MAX_FULL_PEL_VAL; 
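
The rewritten col_min/row_min bounds above are an identity, not a behavior change: for an eighth-pel component v, ((v + 7) >> 3) equals (v >> 3) + ((v & 7) ? 1 : 0), i.e. ceil(v/8), provided right shifts of negative ints are arithmetic (which this code already assumes). Example: v = -13 gives -2 + 1 = -1 the old way and (-6 >> 3) = -1 the new way. A quick exhaustive check:

    #include <assert.h>

    int main(void)
    {
        int v;
        /* Both forms compute ceil(v/8) on two's-complement machines with
         * arithmetic right shift of negative ints. */
        for (v = -2048; v <= 2048; v++)
            assert(((v + 7) >> 3) == ((v >> 3) + ((v & 7) ? 1 : 0)));
        return 0;
    }
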
int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -656,110 +776,156 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, // Further step/diamond searches as necessary step_param = cpi->sf.first_step + speed_adjust; - if(cpi->sf.improved_mv_pred) +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id) { - if(!saddone) - { - vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] ); - saddone = 1; - } - - vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp, - x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]); - - sr += speed_adjust; - //adjust search range according to sr from mv prediction - if(sr > step_param) - step_param = sr; - - mvp_full.as_mv.col = mvp.as_mv.col>>3; - mvp_full.as_mv.row = mvp.as_mv.row>>3; - + // Use parent MV as predictor. Adjust search range accordingly. + mvp.as_int = parent_ref_mv.as_int; + mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3; + mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3; + + if(dissim <=32) step_param += 3; + else if(dissim <=128) step_param += 2; + else step_param += 1; }else +#endif { - mvp.as_int = best_ref_mv.as_int; - mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3; - mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3; - } + if(cpi->sf.improved_mv_pred) + { + if(!saddone) + { + vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] ); + saddone = 1; + } - // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. - if (x->mv_col_min < col_min ) - x->mv_col_min = col_min; - if (x->mv_col_max > col_max ) - x->mv_col_max = col_max; - if (x->mv_row_min < row_min ) - x->mv_row_min = row_min; - if (x->mv_row_max > row_max ) - x->mv_row_max = row_max; + vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, + &mvp,x->e_mbd.mode_info_context->mbmi.ref_frame, + cpi->common.ref_frame_sign_bias, &sr, + &near_sadidx[0]); - further_steps = (cpi->Speed >= 8)? 0: (cpi->sf.max_step_search_steps - 1 - step_param); + sr += speed_adjust; + //adjust search range according to sr from mv prediction + if(sr > step_param) + step_param = sr; - if (cpi->sf.search_method == HEX) - { - bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param, - sadpb, &cpi->fn_ptr[BLOCK_16X16], - x->mvsadcost, x->mvcost, &best_ref_mv); - mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + mvp_full.as_mv.col = mvp.as_mv.col>>3; + mvp_full.as_mv.row = mvp.as_mv.row>>3; + }else + { + mvp.as_int = best_ref_mv.as_int; + mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3; + mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3; + } } - else + +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id && dissim <= 2 && + MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row), + abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4) { - bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv, - step_param, sadpb, &num00, - &cpi->fn_ptr[BLOCK_16X16], - x->mvcost, &best_ref_mv); - mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + d->bmi.mv.as_int = mvp_full.as_int; + mode_mv[NEWMV].as_int = mvp_full.as_int; - // Further step/diamond searches as necessary - n = 0; - //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv, + x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], + cpi->mb.mvcost, + &distortion2,&sse); + }else +#endif + { + /* Get intersection of UMV window and valid MV window to + * reduce # of checks in diamond search. 
*/ + if (x->mv_col_min < col_min ) + x->mv_col_min = col_min; + if (x->mv_col_max > col_max ) + x->mv_col_max = col_max; + if (x->mv_row_min < row_min ) + x->mv_row_min = row_min; + if (x->mv_row_max > row_max ) + x->mv_row_max = row_max; + + further_steps = (cpi->Speed >= 8)? + 0: (cpi->sf.max_step_search_steps - 1 - step_param); + + if (cpi->sf.search_method == HEX) + { +#if CONFIG_MULTI_RES_ENCODING + /* TODO: In higher-res pick_inter_mode, step_param is used to + * modify hex search range. Here, set step_param to 0 not to + * change the behavior in lowest-resolution encoder. + * Will improve it later. + */ + if (!cpi->oxcf.mr_encoder_id) + step_param = 0; +#endif + bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, + step_param, sadpb, + &cpi->fn_ptr[BLOCK_16X16], + x->mvsadcost, x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } + else + { + bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, + &d->bmi.mv, step_param, sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], + x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; - n = num00; - num00 = 0; + // Further step/diamond searches as necessary + n = 0; + //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - while (n < further_steps) - { - n++; + n = num00; + num00 = 0; - if (num00) - num00--; - else + while (n < further_steps) { - thissme = - cpi->diamond_search_sad(x, b, d, &mvp_full, - &d->bmi.mv, - step_param + n, - sadpb, &num00, - &cpi->fn_ptr[BLOCK_16X16], - x->mvcost, &best_ref_mv); - if (thissme < bestsme) - { - bestsme = thissme; - mode_mv[NEWMV].as_int = d->bmi.mv.as_int; - } + n++; + + if (num00) + num00--; else { - d->bmi.mv.as_int = mode_mv[NEWMV].as_int; + thissme = + cpi->diamond_search_sad(x, b, d, &mvp_full, + &d->bmi.mv, + step_param + n, + sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], + x->mvcost, &best_ref_mv); + if (thissme < bestsme) + { + bestsme = thissme; + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } + else + { + d->bmi.mv.as_int = mode_mv[NEWMV].as_int; + } } } } - } - x->mv_col_min = tmp_col_min; - x->mv_col_max = tmp_col_max; - x->mv_row_min = tmp_row_min; - x->mv_row_max = tmp_row_max; + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; - if (bestsme < INT_MAX) - cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv, - x->errorperbit, + if (bestsme < INT_MAX) + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, + &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2,&sse); + } mode_mv[NEWMV].as_int = d->bmi.mv.as_int; // mv cost; - rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128); + rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, + cpi->mb.mvcost, 128); } case NEARESTMV: @@ -770,18 +936,23 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, case ZEROMV: - // Trap vectors that reach beyond the UMV borders - // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point - // because of the lack of break statements in the previous two cases. 
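Under CONFIG_MULTI_RES_ENCODING, the hunk above seeds the search from the lower-resolution encoder's motion vector (parent_ref_mv) and trades search width against how well that parent block matched. Restated as a hypothetical helper using the patch's own names (dissim, step_param); a sketch, not code from the tree:

/* A larger step_param leaves fewer remaining diamond-search steps
 * (further_steps = max_step_search_steps - 1 - step_param), i.e. a
 * narrower search around the predictor. */
static int mr_adjust_step_param(int step_param, int dissim)
{
    if (dissim <= 32)          /* parent block matched very closely */
        step_param += 3;
    else if (dissim <= 128)    /* reasonable parent match */
        step_param += 2;
    else                       /* weak match: keep the search wider */
        step_param += 1;
    return step_param;
}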
- if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || - ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) + /* Trap vectors that reach beyond the UMV borders + * Note that ALL New MV, Nearest MV Near MV and Zero MV code drops + * through to this point because of the lack of break statements + * in the previous two cases. + */ + if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || + ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || + ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) continue; rate2 += vp8_cost_mv_ref(this_mode, mdcounts); x->e_mbd.mode_info_context->mbmi.mv.as_int = mode_mv[this_mode].as_int; - /* Exit early and don't compute the distortion if this macroblock is marked inactive. */ + /* Exit early and don't compute the distortion if this macroblock + * is marked inactive. */ if (cpi->active_map_enabled && x->active_ptr[0] == 0) { sse = 0; @@ -816,9 +987,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, break; } - // Experimental debug code. - //all_rds[mode_index] = this_rd; - if (this_rd < best_rd || x->skip) { // Note index of best mode @@ -828,14 +996,23 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, *returndistortion = distortion2; best_sse = sse; best_rd = this_rd; - vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO)); - - // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time - cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; - cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, + sizeof(MB_MODE_INFO)); + + /* Testing this mode gave rise to an improvement in best error + * score. Lower threshold a bit for next time + */ + cpi->rd_thresh_mult[mode_index] = + (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? + cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; + cpi->rd_threshes[mode_index] = + (cpi->rd_baseline_thresh[mode_index] >> 7) * + cpi->rd_thresh_mult[mode_index]; } - // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around. + /* If the mode did not help improve the best error case then raise the + * threshold for testing that mode next time around. + */ else { cpi->rd_thresh_mult[mode_index] += 4; @@ -843,7 +1020,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; - cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + cpi->rd_threshes[mode_index] = + (cpi->rd_baseline_thresh[mode_index] >> 7) * + cpi->rd_thresh_mult[mode_index]; } if (x->skip) @@ -855,8 +1034,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, { int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3); - cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? 
cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-        cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+        cpi->rd_thresh_mult[best_mode_index] =
+            (cpi->rd_thresh_mult[best_mode_index]
+             >= (MIN_THRESHMULT + best_adjustment)) ?
+                cpi->rd_thresh_mult[best_mode_index] - best_adjustment :
+                MIN_THRESHMULT;
+        cpi->rd_threshes[best_mode_index] =
+            (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+            cpi->rd_thresh_mult[best_mode_index];
    }

@@ -879,15 +1064,17 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
        x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
        x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
-            (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+            (cpi->common.mb_no_coeff_skip);
        x->e_mbd.mode_info_context->mbmi.partitioning = 0;

        return;
    }

-    /* set to the best mb mode, this copy can be skip if x->skip since it already has the right content */
+    /* set to the best mb mode; this copy can be skipped if x->skip since it
+     * already has the right content */
    if (!x->skip)
-        vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+        vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
+                   sizeof(MB_MODE_INFO));

    if (best_mbmode.mode <= B_PRED)
    {
@@ -895,7 +1082,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
        pick_intra_mbuv_mode(x);
    }

-    update_mvcount(cpi, &x->e_mbd, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);
+    if (sign_bias
+        != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+    {
+        best_ref_mv.as_mv.row *= -1;
+        best_ref_mv.as_mv.col *= -1;
+    }
+
+    update_mvcount(cpi, &x->e_mbd, &best_ref_mv);
}

diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
index 1c5d6a6e6..3d83782b5 100644
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h
@@ -14,6 +14,10 @@
 #include "vpx_config.h"
 #include "vp8/common/onyxc_int.h"

-extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+                                int recon_uvoffset, int *returnrate,
+                                int *returndistortion, int *returnintra,
+                                int mb_row, int mb_col);
 extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
+
 #endif

diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index c1e5f7797..2449ae540 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -152,9 +152,10 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
     int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
     int filt_val;
     int best_filt_val = cm->filter_level;
+    YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;

-    // Make a copy of the unfiltered / processed recon buffer
-    vp8_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+    /* Replace unfiltered frame buffer with a new one */
+    cm->frame_to_show = &cpi->pick_lf_lvl_frame;

     if (cm->frame_type == KEY_FRAME)
         cm->sharpness_level = 0;
@@ -177,26 +178,26 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
         best_filt_val = filt_val;

         // Get the err using the previous frame's filter value.
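The picklpf.c changes beginning here invert the buffer roles: instead of filtering the real reconstruction in place and restoring it from a saved copy after every trial, the picker parks the reconstruction pointer in saved_frame, points cm->frame_to_show at a scratch buffer, and refreshes the scratch before each trial. In outline (fields and calls as in the patch; not a complete function):

YV12_BUFFER_CONFIG *saved_frame = cm->frame_to_show;  /* real recon, untouched */
cm->frame_to_show = &cpi->pick_lf_lvl_frame;          /* disposable scratch    */

/* ... for each candidate level filt_val: */
vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
filt_err = calc_partial_ssl_err(sd, cm->frame_to_show,
                                IF_RTCD(&cpi->rtcd.variance));

/* ... once the search finishes: */
cm->frame_to_show = saved_frame;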
- vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); - best_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + /* Copy the unfiltered / processed recon buffer to the new buffer */ + vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show); + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); - // Re-instate the unfiltered frame - vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show); + best_err = calc_partial_ssl_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); - filt_val -= (1 + ((filt_val > 10) ? 1 : 0)); + filt_val -= 1 + (filt_val > 10); // Search lower filter levels while (filt_val >= min_filter_level) { // Apply the loop filter + vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show); vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); // Get the err for filtered frame - filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); - - // Re-instate the unfiltered frame - vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show); + filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); // Update the best case record or exit loop. if (filt_err < best_err) @@ -208,11 +209,11 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) break; // Adjust filter level - filt_val -= (1 + ((filt_val > 10) ? 1 : 0)); + filt_val -= 1 + (filt_val > 10); } // Search up (note that we have already done filt_val = cm->filter_level) - filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0)); + filt_val = cm->filter_level + 1 + (filt_val > 10); if (best_filt_val == cm->filter_level) { @@ -222,13 +223,13 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) while (filt_val < max_filter_level) { // Apply the loop filter + vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show); + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); // Get the err for filtered frame - filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); - - // Re-instate the unfiltered frame - vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show); + filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); // Update the best case record or exit loop. if (filt_err < best_err) @@ -242,7 +243,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) break; // Adjust filter level - filt_val += (1 + ((filt_val > 10) ? 
1 : 0)); + filt_val += 1 + (filt_val > 10); } } @@ -253,6 +254,9 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + /* restore unfiltered frame pointer */ + cm->frame_to_show = saved_frame; } // Stub function for now Alt LF not used @@ -283,10 +287,16 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) int filt_best; int filt_direction = 0; - int Bias = 0; // Bias against raising loop filter and in favour of lowering it + int Bias = 0; // Bias against raising loop filter and in favor of lowering it - // Make a copy of the unfiltered / processed recon buffer - vp8_yv12_copy_y_ptr(cm->frame_to_show, &cpi->last_frame_uf); + int ss_err[MAX_LOOP_FILTER + 1]; + + YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show; + + vpx_memset(ss_err, 0, sizeof(ss_err)); + + /* Replace unfiltered frame buffer with a new one */ + cm->frame_to_show = &cpi->pick_lf_lvl_frame; if (cm->frame_type == KEY_FRAME) cm->sharpness_level = 0; @@ -305,14 +315,19 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) filter_step = (filt_mid < 16) ? 4 : filt_mid / 4; // Get baseline error score + + /* Copy the unfiltered / processed recon buffer to the new buffer */ + vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show); + vp8cx_set_alt_lf_level(cpi, filt_mid); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid); - best_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); - filt_best = filt_mid; + best_err = vp8_calc_ss_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); - // Re-instate the unfiltered frame - vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show); + ss_err[filt_mid] = best_err; + + filt_best = filt_mid; while (filter_step > 0) { @@ -327,14 +342,19 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) if ((filt_direction <= 0) && (filt_low != filt_mid)) { - // Get Low filter error score - vp8cx_set_alt_lf_level(cpi, filt_low); - vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low); - - filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); - - // Re-instate the unfiltered frame - vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show); + if(ss_err[filt_low] == 0) + { + // Get Low filter error score + vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show); + vp8cx_set_alt_lf_level(cpi, filt_low); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low); + + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); + ss_err[filt_low] = filt_err; + } + else + filt_err = ss_err[filt_low]; // If value is close to the best so far then bias towards a lower loop filter value. 
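vp8cx_pick_filter_level() additionally gains the ss_err[] cache used above, so no filter level is filtered and measured twice while the bisection revisits levels; zero doubles as the "not yet measured" marker, which is safe since a genuine error of zero would merely be recomputed. The cached trial could be factored as a helper along these lines (hypothetical; the patch keeps it inline):

static int cached_trial_err(VP8_COMP *cpi, VP8_COMMON *cm,
                            YV12_BUFFER_CONFIG *sd,
                            YV12_BUFFER_CONFIG *saved_frame,
                            int ss_err[], int level)
{
    if (ss_err[level] == 0)   /* not measured yet for this frame */
    {
        vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
        vp8cx_set_alt_lf_level(cpi, level);
        vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, level);
        ss_err[level] = vp8_calc_ss_err(sd, cm->frame_to_show,
                                        IF_RTCD(&cpi->rtcd.variance));
    }
    return ss_err[level];
}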
if ((filt_err - Bias) < best_err) @@ -350,13 +370,18 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) // Now look at filt_high if ((filt_direction >= 0) && (filt_high != filt_mid)) { - vp8cx_set_alt_lf_level(cpi, filt_high); - vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high); - - filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + if(ss_err[filt_high] == 0) + { + vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show); + vp8cx_set_alt_lf_level(cpi, filt_high); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high); - // Re-instate the unfiltered frame - vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show); + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); + ss_err[filt_high] = filt_err; + } + else + filt_err = ss_err[filt_high]; // Was it better than the previous best? if (filt_err < (best_err - Bias)) @@ -380,4 +405,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) } cm->filter_level = filt_best; + + /* restore unfiltered frame pointer */ + cm->frame_to_show = saved_frame; } diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index e57a26430..ce04212e6 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -436,7 +436,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) int quant_val; int Q; - int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44}; + int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, + 44, 44}; for (Q = 0; Q < QINDEX_RANGE; Q++) { @@ -469,36 +470,61 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7; // all the ac values = ; - for (i = 1; i < 16; i++) + quant_val = vp8_ac_yquant(Q); + cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1, + cpi->Y1quant_shift[Q] + 1, quant_val); + cpi->Y1zbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.Y1dequant[Q][1] = quant_val; + cpi->zrun_zbin_boost_y1[Q][1] = (quant_val * zbin_boost[1]) >> 7; + + quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q); + cpi->Y2quant_fast[Q][1] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 1, + cpi->Y2quant_shift[Q] + 1, quant_val); + cpi->Y2zbin[Q][1] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; + cpi->Y2round[Q][1] = (qrounding_factors_y2[Q] * quant_val) >> 7; + cpi->common.Y2dequant[Q][1] = quant_val; + cpi->zrun_zbin_boost_y2[Q][1] = (quant_val * zbin_boost[1]) >> 7; + + quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q); + cpi->UVquant_fast[Q][1] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 1, + cpi->UVquant_shift[Q] + 1, quant_val); + cpi->UVzbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->UVround[Q][1] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.UVdequant[Q][1] = quant_val; + cpi->zrun_zbin_boost_uv[Q][1] = (quant_val * zbin_boost[1]) >> 7; + + for (i = 2; i < 16; i++) { - int rc = vp8_default_zig_zag1d[i]; - - quant_val = vp8_ac_yquant(Q); - cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val; - invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc, - cpi->Y1quant_shift[Q] + rc, quant_val); - cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; - cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7; - cpi->common.Y1dequant[Q][rc] = quant_val; - cpi->zrun_zbin_boost_y1[Q][i] = 
(quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
-                         cpi->Y2quant_shift[Q] + rc, quant_val);
-            cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
-            cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
-            cpi->common.Y2dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
-                         cpi->UVquant_shift[Q] + rc, quant_val);
-            cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.UVdequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+            cpi->Y1quant_fast[Q][i] = cpi->Y1quant_fast[Q][1];
+            cpi->Y1quant[Q][i] = cpi->Y1quant[Q][1];
+            cpi->Y1quant_shift[Q][i] = cpi->Y1quant_shift[Q][1];
+            cpi->Y1zbin[Q][i] = cpi->Y1zbin[Q][1];
+            cpi->Y1round[Q][i] = cpi->Y1round[Q][1];
+            cpi->common.Y1dequant[Q][i] = cpi->common.Y1dequant[Q][1];
+            cpi->zrun_zbin_boost_y1[Q][i] = (cpi->common.Y1dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
+
+            cpi->Y2quant_fast[Q][i] = cpi->Y2quant_fast[Q][1];
+            cpi->Y2quant[Q][i] = cpi->Y2quant[Q][1];
+            cpi->Y2quant_shift[Q][i] = cpi->Y2quant_shift[Q][1];
+            cpi->Y2zbin[Q][i] = cpi->Y2zbin[Q][1];
+            cpi->Y2round[Q][i] = cpi->Y2round[Q][1];
+            cpi->common.Y2dequant[Q][i] = cpi->common.Y2dequant[Q][1];
+            cpi->zrun_zbin_boost_y2[Q][i] = (cpi->common.Y2dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
+
+            cpi->UVquant_fast[Q][i] = cpi->UVquant_fast[Q][1];
+            cpi->UVquant[Q][i] = cpi->UVquant[Q][1];
+            cpi->UVquant_shift[Q][i] = cpi->UVquant_shift[Q][1];
+            cpi->UVzbin[Q][i] = cpi->UVzbin[Q][1];
+            cpi->UVround[Q][i] = cpi->UVround[Q][1];
+            cpi->common.UVdequant[Q][i] = cpi->common.UVdequant[Q][1];
+            cpi->zrun_zbin_boost_uv[Q][i] = (cpi->common.UVdequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
        }
    }
}
@@ -609,6 +635,9 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)

    /* This initialization should be called at least once. Use ok_to_skip to
     * decide if it is ok to skip.
+     * Before encoding a frame, this function is always called with ok_to_skip
+     * =0, which means no skipping of calculations. The "last" values are
+     * initialized at that time.
     */
    if (!ok_to_skip || QIndex != x->q_index)
    {

diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index dc6feb980..1c43c1171 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -28,7 +28,6 @@
 #define MAX_BPB_FACTOR 50

 extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
-extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];

@@ -305,6 +304,8 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
     // Setup for Key frame:

     vp8_default_coef_probs(& cpi->common);
+
+    vp8_kf_default_bmode_probs(cpi->common.kf_bmode_prob);

     vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
@@ -315,6 +316,12 @@ void vp8_setup_key_frame(VP8_COMP *cpi)

     vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc));  //initialize pre_mvc to all zero.

+    // Make sure we initialize separate contexts for altref, gold, and normal.
+    // TODO shouldn't need 3 different copies of structure to do this!
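The three per-reference-type entropy contexts copied in the memcpys that follow (lfc_a, lfc_g, lfc_n) are consumed in the vp8_initialize_rd_consts() hunk further down. The selection logic there, restated on its own for clarity:

/* Pick the token-cost source by which reference buffer this frame
 * refreshes: alt-ref updates, golden updates, or normal frames. */
FRAME_CONTEXT *l = &cpi->lfc_n;

if (cpi->common.refresh_alt_ref_frame)
    l = &cpi->lfc_a;
else if (cpi->common.refresh_golden_frame)
    l = &cpi->lfc_g;

fill_token_costs(cpi->mb.token_costs,
                 (const vp8_prob (*)[8][3][11]) l->coef_probs);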
+ vpx_memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc)); + vpx_memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc)); + vpx_memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc)); + //cpi->common.filter_level = 0; // Reset every key frame. cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ; @@ -325,8 +332,8 @@ void vp8_setup_key_frame(VP8_COMP *cpi) else cpi->frames_till_gf_update_due = cpi->goldfreq; - cpi->common.refresh_golden_frame = TRUE; - cpi->common.refresh_alt_ref_frame = TRUE; + cpi->common.refresh_golden_frame = 1; + cpi->common.refresh_alt_ref_frame = 1; } @@ -464,7 +471,7 @@ static void calc_gf_params(VP8_COMP *cpi) if (cpi->pass != 2) { // Single Pass lagged mode: TBD - if (FALSE) + if (0) { } @@ -591,14 +598,14 @@ static void calc_gf_params(VP8_COMP *cpi) if (cpi->pass != 2) { // For now Alt ref is not allowed except in 2 pass modes. - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; /*if ( cpi->oxcf.fixed_q == -1) { if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) ) - cpi->source_alt_ref_pending = TRUE; + cpi->source_alt_ref_pending = 1; else - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; }*/ } } @@ -933,6 +940,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) if (cpi->active_worst_quality <= cpi->active_best_quality) cpi->active_worst_quality = cpi->active_best_quality + 1; + if(cpi->active_worst_quality > 127) + cpi->active_worst_quality = 127; } // Unbuffered mode (eg. video conferencing) else @@ -973,7 +982,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) #endif //vpx_log("Decoder: Drop frame due to bandwidth: %d \n",cpi->buffer_level, cpi->av_per_frame_bandwidth); - cpi->drop_frame = TRUE; + cpi->drop_frame = 1; } #if 0 @@ -981,7 +990,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) else if ((cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) && (cpi->drop_count < cpi->max_drop_count) && (cpi->pass == 0)) { - cpi->drop_frame = TRUE; + cpi->drop_frame = 1; } #endif @@ -1027,11 +1036,11 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { // For one pass throw a GF if recent frame intra useage is low or the GF useage is high if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) - cpi->common.refresh_golden_frame = TRUE; + cpi->common.refresh_golden_frame = 1; // Two pass GF descision else if (cpi->pass == 2) - cpi->common.refresh_golden_frame = TRUE; + cpi->common.refresh_golden_frame = 1; } #if 0 @@ -1049,7 +1058,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) #endif - if (cpi->common.refresh_golden_frame == TRUE) + if (cpi->common.refresh_golden_frame == 1) { #if 0 @@ -1534,7 +1543,7 @@ int vp8_pick_frame_size(VP8_COMP *cpi) // Check if we're dropping the frame: if (cpi->drop_frame) { - cpi->drop_frame = FALSE; + cpi->drop_frame = 0; cpi->drop_count++; return 0; } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index e8abf848c..ce979619a 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -28,7 +28,6 @@ #include "encodemb.h" #include "quantize.h" #include "vp8/common/idct.h" -#include "vp8/common/g_common.h" #include "variance.h" #include "mcomp.h" #include "rdopt.h" @@ -100,36 +99,39 @@ const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] = B_PRED, }; -const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES] = +/* This table determines the search order in reference frame priority order, + * which may not 
necessarily match INTRA,LAST,GOLDEN,ARF
+ */
+const int vp8_ref_frame_order[MAX_MODES] =
{
-    LAST_FRAME,
-    INTRA_FRAME,
+    1,
+    0,

-    LAST_FRAME,
-    LAST_FRAME,
+    1,
+    1,

-    GOLDEN_FRAME,
-    GOLDEN_FRAME,
+    2,
+    2,

-    ALTREF_FRAME,
-    ALTREF_FRAME,
+    3,
+    3,

-    GOLDEN_FRAME,
-    ALTREF_FRAME,
+    2,
+    3,

-    INTRA_FRAME,
-    INTRA_FRAME,
-    INTRA_FRAME,
+    0,
+    0,
+    0,

-    LAST_FRAME,
-    GOLDEN_FRAME,
-    ALTREF_FRAME,
+    1,
+    2,
+    3,

-    LAST_FRAME,
-    GOLDEN_FRAME,
-    ALTREF_FRAME,
+    1,
+    2,
+    3,

-    INTRA_FRAME,
+    0,
};

 static void fill_token_costs(
@@ -285,18 +287,39 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
         }
     }

-    fill_token_costs(
-        cpi->mb.token_costs,
-        (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs
-    );
+    {
+        // build token cost array for the type of frame we have now
+        FRAME_CONTEXT *l = &cpi->lfc_n;
+
+        if(cpi->common.refresh_alt_ref_frame)
+            l = &cpi->lfc_a;
+        else if(cpi->common.refresh_golden_frame)
+            l = &cpi->lfc_g;
+
+        fill_token_costs(
+            cpi->mb.token_costs,
+            (const vp8_prob( *)[8][3][11]) l->coef_probs
+        );
+        /*
+        fill_token_costs(
+            cpi->mb.token_costs,
+            (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs);
+        */
+

-    vp8_init_mode_costs(cpi);
+        // TODO make these mode costs depend on last, alt or gold too. (jbb)
+        vp8_init_mode_costs(cpi);
+
+        // TODO figure out why making mv cost frame type dependent didn't help (jbb)
+        //vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) l->mvc, flags);
+
+    }
}

void vp8_auto_select_speed(VP8_COMP *cpi)
{
-    int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
+    int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);

     milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
@@ -552,7 +575,7 @@ static void macro_block_yrd( MACROBLOCK *mb,
     int d;

     ENCODEMB_INVOKE(rtcd, submby)( mb->src_diff, *(mb->block[0].base_src),
-        mb->e_mbd.predictor, mb->block[0].src_stride );
+        mb->block[0].src_stride, mb->e_mbd.predictor, 16);

     // Fdct and building the 2nd order block
     for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
@@ -800,7 +823,8 @@ static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
{
     vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
     ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
-        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+        &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);

     vp8_transform_mbuv(x);
     vp8_quantize_mbuv(x);
@@ -816,7 +840,8 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
{
     vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
     ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
-        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+        &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);

     vp8_transform_mbuv(x);
     vp8_quantize_mbuv(x);
@@ -845,8 +870,8 @@ static void rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int
     RECON_INVOKE(&cpi->rtcd.common->recon, build_intra_predictors_mbuv)
                 (&x->e_mbd);
     ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
-        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor,
-        x->src.uv_stride);
+        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+        &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);

     vp8_transform_mbuv(x);
     vp8_quantize_mbuv(x);
@@ -1359,8 +1384,8 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,

     if
(bsi.segment_rd < best_rd) { - int col_min = (best_ref_mv->as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv->as_mv.col & 7)?1:0); - int row_min = (best_ref_mv->as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv->as_mv.row & 7)?1:0); + int col_min = ((best_ref_mv->as_mv.col+7)>>3) - MAX_FULL_PEL_VAL; + int row_min = ((best_ref_mv->as_mv.row+7)>>3) - MAX_FULL_PEL_VAL; int col_max = (best_ref_mv->as_mv.col>>3) + MAX_FULL_PEL_VAL; int row_max = (best_ref_mv->as_mv.row>>3) + MAX_FULL_PEL_VAL; @@ -1458,57 +1483,6 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, return bsi.segment_rd; } -static void insertsortmv(int arr[], int len) -{ - int i, j, k; - - for ( i = 1 ; i <= len-1 ; i++ ) - { - for ( j = 0 ; j < i ; j++ ) - { - if ( arr[j] > arr[i] ) - { - int temp; - - temp = arr[i]; - - for ( k = i; k >j; k--) - arr[k] = arr[k - 1] ; - - arr[j] = temp ; - } - } - } -} - -static void insertsortsad(int arr[],int idx[], int len) -{ - int i, j, k; - - for ( i = 1 ; i <= len-1 ; i++ ) - { - for ( j = 0 ; j < i ; j++ ) - { - if ( arr[j] > arr[i] ) - { - int temp, tempi; - - temp = arr[i]; - tempi = idx[i]; - - for ( k = i; k >j; k--) - { - arr[k] = arr[k - 1] ; - idx[k] = idx[k - 1]; - } - - arr[j] = temp ; - idx[j] = tempi; - } - } - } -} - //The improved MV prediction void vp8_mv_pred ( @@ -1741,7 +1715,9 @@ static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv) } } -void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra) +void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra) { BLOCK *b = &x->block[0]; BLOCKD *d = &x->e_mbd.block[0]; @@ -1768,40 +1744,58 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int distortion_uv; int best_yrd = INT_MAX; - //int all_rds[MAX_MODES]; // Experimental debug code. - //int all_rates[MAX_MODES]; - //int all_dist[MAX_MODES]; - //int intermodecost[MAX_MODES]; - MB_PREDICTION_MODE uv_intra_mode; int_mv mvp; int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int saddone=0; int sr=0; //search range got from mv_pred(). It uses step_param levels. (0-7) - int_mv frame_nearest_mv[4]; - int_mv frame_near_mv[4]; - int_mv frame_best_ref_mv[4]; - int frame_mdcounts[4][4]; int frame_lf_or_gf[4]; unsigned char *y_buffer[4]; unsigned char *u_buffer[4]; unsigned char *v_buffer[4]; + int ref_frame_map[4]; + int sign_bias = 0; + vpx_memset(mode_mv, 0, sizeof(mode_mv)); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); vpx_memset(&best_bmodes, 0, sizeof(best_bmodes)); + /* Setup search priorities */ + i=0; + ref_frame_map[i++] = INTRA_FRAME; if (cpi->ref_frame_flags & VP8_LAST_FLAG) + ref_frame_map[i++] = LAST_FRAME; + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + ref_frame_map[i++] = GOLDEN_FRAME; + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + ref_frame_map[i++] = ALTREF_FRAME; + for(; i<4; i++) + ref_frame_map[i] = -1; + + /* Check to see if there is at least 1 valid reference frame that we need + * to calculate near_mvs. 
+ */ + if (ref_frame_map[1] > 0) { - YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; + vp8_find_near_mvs(&x->e_mbd, + x->e_mbd.mode_info_context, + &mode_mv[NEARESTMV], &mode_mv[NEARMV], + &best_ref_mv, + mdcounts, + ref_frame_map[1], + cpi->common.ref_frame_sign_bias); + + sign_bias = cpi->common.ref_frame_sign_bias[ref_frame_map[1]]; + } - vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[LAST_FRAME], &frame_near_mv[LAST_FRAME], - &frame_best_ref_mv[LAST_FRAME], frame_mdcounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias); + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset; u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset; v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset; - frame_lf_or_gf[LAST_FRAME] = 0; } @@ -1809,13 +1803,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int { YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx]; - vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[GOLDEN_FRAME], &frame_near_mv[GOLDEN_FRAME], - &frame_best_ref_mv[GOLDEN_FRAME], frame_mdcounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias); - y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset; u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset; v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset; - frame_lf_or_gf[GOLDEN_FRAME] = 1; } @@ -1823,13 +1813,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int { YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx]; - vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[ALTREF_FRAME], &frame_near_mv[ALTREF_FRAME], - &frame_best_ref_mv[ALTREF_FRAME], frame_mdcounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias); - y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset; u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset; v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset; - frame_lf_or_gf[ALTREF_FRAME] = 1; } @@ -1838,8 +1824,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->skip = 0; - vpx_memset(mode_mv, 0, sizeof(mode_mv)); - x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion); uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode; @@ -1850,18 +1834,15 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int lf_or_gf = 0; // Lat Frame (01) or gf/arf (1) int disable_skip = 0; int other_cost = 0; - - // Experimental debug code. - // Record of rd values recorded for this MB. -1 indicates not measured - //all_rds[mode_index] = -1; - //all_rates[mode_index] = -1; - //all_dist[mode_index] = -1; - //intermodecost[mode_index] = -1; + int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]]; // Test best rd so far against threshold for trying this mode. 
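A worked example of the indirection being set up here: vp8_ref_frame_order[] now holds priority slots (0..3) rather than frame enums, and ref_frame_map[] translates a slot into whatever references this frame actually has. With the golden frame unavailable, for instance (a hypothetical flag state, purely for illustration):

/* cpi->ref_frame_flags == (VP8_LAST_FLAG | VP8_ALT_FLAG) yields     */
/* ref_frame_map[] == { INTRA_FRAME, LAST_FRAME, ALTREF_FRAME, -1 }  */

int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];

if (this_ref_frame < 0)  /* slot maps to no available reference */
    continue;            /* mode skipped before any rd work is done */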
if (best_rd <= cpi->rd_threshes[mode_index]) continue; + if (this_ref_frame < 0) + continue; + // These variables hold are rolling total cost and distortion for this mode rate2 = 0; distortion2 = 0; @@ -1870,7 +1851,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->e_mbd.mode_info_context->mbmi.mode = this_mode; x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; - x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index]; + x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame; // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want @@ -1887,10 +1868,20 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; - mode_mv[NEARESTMV] = frame_nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - mode_mv[NEARMV] = frame_near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - vpx_memcpy(mdcounts, frame_mdcounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts)); + + if (sign_bias != + cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame]) + { + mode_mv[NEARESTMV].as_mv.row *= -1; + mode_mv[NEARESTMV].as_mv.col *= -1; + mode_mv[NEARMV].as_mv.row *= -1; + mode_mv[NEARMV].as_mv.col *= -1; + best_ref_mv.as_mv.row *= -1; + best_ref_mv.as_mv.col *= -1; + sign_bias + = cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame]; + } + lf_or_gf = frame_lf_or_gf[x->e_mbd.mode_info_context->mbmi.ref_frame]; } @@ -1918,13 +1909,13 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to supress noise if (cpi->zbin_mode_boost_enabled) { - if ( vp8_ref_frame_order[mode_index] == INTRA_FRAME ) + if ( this_ref_frame == INTRA_FRAME ) cpi->zbin_mode_boost = 0; else { if (vp8_mode_order[mode_index] == ZEROMV) { - if (vp8_ref_frame_order[mode_index] != LAST_FRAME) + if (this_ref_frame != LAST_FRAME) cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; else cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; @@ -1969,8 +1960,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int tmp_rd; int this_rd_thresh; - this_rd_thresh = (x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME) ? cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA]; - this_rd_thresh = (x->e_mbd.mode_info_context->mbmi.ref_frame == GOLDEN_FRAME) ? cpi->rd_threshes[THR_NEWG]: this_rd_thresh; + this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ? cpi->rd_threshes[THR_NEW1] : cpi->rd_threshes[THR_NEW3]; + this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ? 
cpi->rd_threshes[THR_NEW2] : this_rd_thresh; tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, best_yrd, mdcounts, @@ -2024,8 +2015,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int sadpb = x->sadperbit16; int_mv mvp_full; - int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7)?1:0); - int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7)?1:0); + int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL; + int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL; int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL; int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL; @@ -2174,7 +2165,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int continue; vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]); - vp8_build_inter16x16_predictors_mby(&x->e_mbd); + vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16); if (cpi->active_map_enabled && x->active_ptr[0] == 0) { x->skip = 1; @@ -2294,11 +2285,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); } - // Experimental debug code. - //all_rds[mode_index] = this_rd; - //all_rates[mode_index] = rate2; - //all_dist[mode_index] = distortion2; - // Keep record of best intra distortion if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) && (this_rd < best_intra_rd) ) @@ -2399,7 +2385,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = - (cpi->common.mb_no_coeff_skip) ? 
1 : 0; + (cpi->common.mb_no_coeff_skip); x->e_mbd.mode_info_context->mbmi.partitioning = 0; return; @@ -2426,10 +2412,14 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->partition_info->bmi[15].mv.as_int; } - rd_update_mvcount(cpi, x, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]); - - + if (sign_bias + != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame]) + { + best_ref_mv.as_mv.row *= -1; + best_ref_mv.as_mv.col *= -1; + } + rd_update_mvcount(cpi, x, &best_ref_mv); } void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_) diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h index 95134cb81..5ee869903 100644 --- a/vp8/encoder/rdopt.h +++ b/vp8/encoder/rdopt.h @@ -14,6 +14,57 @@ #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) +static void insertsortmv(int arr[], int len) +{ + int i, j, k; + + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp; + + temp = arr[i]; + + for ( k = i; k >j; k--) + arr[k] = arr[k - 1] ; + + arr[j] = temp ; + } + } + } +} + +static void insertsortsad(int arr[],int idx[], int len) +{ + int i, j, k; + + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp, tempi; + + temp = arr[i]; + tempi = idx[i]; + + for ( k = i; k >j; k--) + { + arr[k] = arr[k - 1] ; + idx[k] = idx[k - 1]; + } + + arr[j] = temp ; + idx[j] = tempi; + } + } + } +} + extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue); extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate); diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c index b9ade1c6c..545e4f205 100644 --- a/vp8/encoder/temporal_filter.c +++ b/vp8/encoder/temporal_filter.c @@ -22,7 +22,6 @@ #include "ratectrl.h" #include "vp8/common/quant_common.h" #include "segmentation.h" -#include "vp8/common/g_common.h" #include "vpx_scale/yv12extend.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/swapyv12buffer.h" @@ -98,7 +97,7 @@ void vp8_temporal_filter_apply_c unsigned short *count ) { - int i, j, k; + unsigned int i, j, k; int modifier; int byte = 0; @@ -186,7 +185,7 @@ static int vp8_temporal_filter_find_matching_mb_c if (cpi->Speed < 8) { step_param = cpi->sf.first_step + - ((cpi->Speed > 5) ? 1 : 0); + (cpi->Speed > 5); further_steps = (cpi->sf.max_step_search_steps - 1)-step_param; } diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c index e81948567..8bfc47f8f 100644 --- a/vp8/encoder/tokenize.c +++ b/vp8/encoder/tokenize.c @@ -514,17 +514,19 @@ static __inline void stuff1st_order_b TOKENEXTRA **tp, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, + int type, VP8_COMP *cpi ) { int pt; /* near block/prev token context index */ + int band; TOKENEXTRA *t = *tp; /* store tokens starting here */ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); - + band = type ? 
0 : 1; t->Token = DCT_EOB_TOKEN; - t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt]; + t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt]; t->skip_eob_node = 0; - ++cpi->coef_counts [0] [1] [pt] [DCT_EOB_TOKEN]; + ++cpi->coef_counts [type] [band] [pt] [DCT_EOB_TOKEN]; ++t; *tp = t; pt = 0; /* 0 <-> all coeff data is zero */ @@ -561,15 +563,19 @@ void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context; int plane_type; int b; - - stuff2nd_order_b(t, + plane_type = 3; + if((x->mode_info_context->mbmi.mode != B_PRED + && x->mode_info_context->mbmi.mode != SPLITMV)) + { + stuff2nd_order_b(t, A + vp8_block2above[24], L + vp8_block2left[24], cpi); - plane_type = 0; + plane_type = 0; + } for (b = 0; b < 16; b++) stuff1st_order_b(t, A + vp8_block2above[b], - L + vp8_block2left[b], cpi); + L + vp8_block2left[b], plane_type, cpi); for (b = 16; b < 24; b++) stuff1st_order_buv(t, diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm index 4ce16ce90..75e8aa3c2 100644 --- a/vp8/encoder/x86/subtract_mmx.asm +++ b/vp8/encoder/x86/subtract_mmx.asm @@ -73,74 +73,71 @@ sym(vp8_subtract_b_mmx_impl): pop rbp ret -;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride) +;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, +;unsigned char *pred, int pred_stride) global sym(vp8_subtract_mby_mmx) sym(vp8_subtract_mby_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;src + movsxd rdx, dword ptr arg(2);src_stride + mov rax, arg(3) ;pred + push rbx + movsxd rbx, dword ptr arg(4);pred_stride - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride + pxor mm0, mm0 + mov rcx, 16 - mov rcx, 16 - pxor mm0, mm0 .submby_loop: + movq mm1, [rsi] + movq mm3, [rax] - movq mm1, [rsi] - movq mm3, [rax] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 + movq mm2, mm1 + movq mm4, mm3 - movq [rdi], mm1 - movq [rdi+8], mm2 + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 - movq mm1, [rsi+8] - movq mm3, [rax+8] + psubw mm1, mm3 + psubw mm2, mm4 - movq mm2, mm1 - movq mm4, mm3 + movq [rdi], mm1 + movq [rdi+8], mm2 - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 + movq mm1, [rsi+8] + movq mm3, [rax+8] - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 + movq mm2, mm1 + movq mm4, mm3 - psubw mm1, mm3 - psubw mm2, mm4 + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 - movq [rdi+16], mm1 - movq [rdi+24], mm2 + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 + psubw mm1, mm3 + psubw mm2, mm4 - add rdi, 32 - add rax, 16 - - lea rsi, [rsi+rdx] - - sub rcx, 1 - jnz .submby_loop + movq [rdi+16], mm1 + movq [rdi+24], mm2 + add rdi, 32 + lea rax, [rax+rbx] + lea rsi, [rsi+rdx] + dec rcx + jnz .submby_loop + pop rbx pop rdi pop rsi ; begin epilog @@ -149,281 +146,75 @@ sym(vp8_subtract_mby_mmx): ret -;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) + global sym(vp8_subtract_mbuv_mmx) sym(vp8_subtract_mbuv_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 + SHADOW_ARGS_TO_STACK 7 push 
rsi push rdi ; end prolog - ;short *udiff = diff + 256; - ;short *vdiff = diff + 320; - ;unsigned char *upred = pred + 256; - ;unsigned char *vpred = pred + 320; - - ;unsigned char *z = usrc; - ;unsigned short *diff = udiff; - ;unsigned char *Predictor= upred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ;unsigned char *z = vsrc; - ;unsigned short *diff = vdiff; - ;unsigned char *Predictor= vpred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(2) ;z = usrc - add rdi, 320*2 ;diff = diff + 320 (shorts) - add rax, 320 ;Predictor = pred + 320 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, 
[rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;usrc + movsxd rdx, dword ptr arg(3);src_stride; + mov rax, arg(4) ;upred + add rdi, 256*2 ;diff = diff + 256 (shorts) + mov rcx, 8 + push rbx + movsxd rbx, dword ptr arg(6);pred_stride + + pxor mm7, mm7 + +.submbu_loop: + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + add rdi, 16 + add rsi, rdx + add rax, rbx + + dec rcx + jnz .submbu_loop + + mov rsi, arg(2) ;vsrc + mov rax, arg(5) ;vpred + mov rcx, 8 + +.submbv_loop: + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + add rdi, 16 + add rsi, rdx + add rax, rbx + + dec rcx + jnz .submbv_loop + + pop rbx ; begin epilog pop rdi pop rsi diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm index 3bd1ff678..008e9c7d1 100644 --- a/vp8/encoder/x86/subtract_sse2.asm +++ b/vp8/encoder/x86/subtract_sse2.asm @@ -71,277 +71,166 @@ sym(vp8_subtract_b_sse2_impl): ret -;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride) +;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, +;unsigned char *pred, int pred_stride) global sym(vp8_subtract_mby_sse2) sym(vp8_subtract_mby_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi ; end prolog - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 8 ; do two lines at one time + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;src + movsxd rdx, dword ptr arg(2);src_stride + mov rax, arg(3) ;pred + movdqa xmm4, [GLOBAL(t80)] + push rbx + mov rcx, 8 ; do two lines at one time + movsxd rbx, dword ptr arg(4);pred_stride .submby_loop: - movdqa xmm0, XMMWORD PTR [rsi] ; src - movdqa xmm1, XMMWORD PTR [rax] ; pred + movdqa xmm0, [rsi] ; src + movdqa xmm1, [rax] ; pred - movdqa xmm2, xmm0 - psubb xmm0, xmm1 + movdqa xmm2, xmm0 + psubb xmm0, xmm1 - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, 
[GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm1 ; put sign back to subtraction - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 + movdqa xmm3, [rsi + rdx] + movdqa xmm5, [rax + rbx] - movdqa xmm4, XMMWORD PTR [rsi + rdx] - movdqa xmm5, XMMWORD PTR [rax + 16] + lea rsi, [rsi+rdx*2] + lea rax, [rax+rbx*2] - movdqa xmm6, xmm4 - psubb xmm4, xmm5 + movdqa [rdi], xmm0 + movdqa [rdi +16], xmm2 - pxor xmm5, [GLOBAL(t80)] ;convert to signed values - pxor xmm6, [GLOBAL(t80)] - pcmpgtb xmm5, xmm6 ; obtain sign information + movdqa xmm1, xmm3 + psubb xmm3, xmm5 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - punpcklbw xmm4, xmm5 ; put sign back to subtraction - punpckhbw xmm6, xmm7 ; put sign back to subtraction + pxor xmm5, xmm4 ;convert to signed values + pxor xmm1, xmm4 + pcmpgtb xmm5, xmm1 ; obtain sign information - movdqa XMMWORD PTR [rdi +32], xmm4 - movdqa XMMWORD PTR [rdi +48], xmm6 + movdqa xmm1, xmm3 + punpcklbw xmm3, xmm5 ; put sign back to subtraction + punpckhbw xmm1, xmm5 ; put sign back to subtraction - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] + movdqa [rdi +32], xmm3 + movdqa [rdi +48], xmm1 - sub rcx, 1 - jnz .submby_loop + add rdi, 64 + dec rcx + jnz .submby_loop + pop rbx pop rdi pop rsi ; begin epilog RESTORE_GOT - RESTORE_XMM UNSHADOW_ARGS pop rbp ret - -;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) global sym(vp8_subtract_mbuv_sse2) sym(vp8_subtract_mbuv_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 + SHADOW_ARGS_TO_STACK 7 GET_GOT rbx push rsi push rdi ; end prolog - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - lea rcx, [rdx + rdx*2] - - ;u - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + 
rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ;v - mov rsi, arg(2) ;z = vsrc - add rdi, 64*2 ;diff = diff + 320 (shorts) - add rax, 64 ;Predictor = pred + 320 - - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to 
subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - + movdqa xmm4, [GLOBAL(t80)] + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;usrc + movsxd rdx, dword ptr arg(3);src_stride; + mov rax, arg(4) ;upred + add rdi, 256*2 ;diff = diff + 256 (shorts) + mov rcx, 4 + push rbx + movsxd rbx, dword ptr arg(6);pred_stride + + ;u +.submbu_loop: + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] ; src -- next line + movq xmm1, [rax] ; pred + movq xmm3, [rax+rbx] ; pred -- next line + lea rsi, [rsi + rdx*2] + lea rax, [rax + rbx*2] + + punpcklqdq xmm0, xmm2 + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 ; store difference + movdqa [rdi +16], xmm2 ; store difference + add rdi, 32 + sub rcx, 1 + jnz .submbu_loop + + mov rsi, arg(2) ;vsrc + mov rax, arg(5) ;vpred + mov rcx, 4 + + ;v +.submbv_loop: + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] ; src -- next line + movq xmm1, [rax] ; pred + movq xmm3, [rax+rbx] ; pred -- next line + lea rsi, [rsi + rdx*2] + lea rax, [rax + rbx*2] + + punpcklqdq xmm0, xmm2 + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 ; store difference + movdqa [rdi +16], xmm2 ; store difference + add rdi, 32 + sub rcx, 1 + jnz .submbv_loop + + pop rbx ; begin epilog pop rdi pop rsi diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c index 92b695f17..e2524b46a 100644 --- a/vp8/encoder/x86/variance_mmx.c +++ b/vp8/encoder/x86/variance_mmx.c @@ -12,6 +12,7 @@ #include "vp8/encoder/variance.h" #include "vp8/common/pragmas.h" #include "vpx_ports/mem.h" +#include "vp8/common/x86/filter_x86.h" extern void filter_block1d_h6_mmx ( @@ -21,7 +22,7 @@ extern void filter_block1d_h6_mmx unsigned int pixel_step, unsigned int output_height, unsigned int output_width, - short *vp7_filter + short *filter ); extern void filter_block1d_v6_mmx ( @@ -31,7 +32,7 @@ extern void filter_block1d_v6_mmx unsigned int pixel_step, unsigned int output_height, unsigned int output_width, - short *vp7_filter + short *filter ); extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr); @@ -198,24 +199,6 @@ unsigned int vp8_variance8x16_mmx( } - - -/////////////////////////////////////////////////////////////////////////// -// the mmx function that does the bilinear filtering and var calculation // -// int one pass // -/////////////////////////////////////////////////////////////////////////// -DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) = -{ - { 128, 128, 128, 128, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 112, 112, 112, 112 } -}; - unsigned int vp8_sub_pixel_variance4x4_mmx ( const unsigned char *src_ptr, @@ -232,7 +215,7 @@ unsigned int 
vp8_sub_pixel_variance4x4_mmx vp8_filter_block2d_bil4x4_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum, &xxsum ); *sse = xxsum; @@ -257,7 +240,7 @@ unsigned int vp8_sub_pixel_variance8x8_mmx vp8_filter_block2d_bil_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum, &xxsum ); *sse = xxsum; @@ -283,7 +266,7 @@ unsigned int vp8_sub_pixel_variance16x16_mmx vp8_filter_block2d_bil_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum0, &xxsum0 ); @@ -291,7 +274,7 @@ unsigned int vp8_sub_pixel_variance16x16_mmx vp8_filter_block2d_bil_var_mmx( src_ptr + 8, src_pixels_per_line, dst_ptr + 8, dst_pixels_per_line, 16, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum1, &xxsum1 ); @@ -336,7 +319,7 @@ unsigned int vp8_sub_pixel_variance16x8_mmx vp8_filter_block2d_bil_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum0, &xxsum0 ); @@ -344,7 +327,7 @@ unsigned int vp8_sub_pixel_variance16x8_mmx vp8_filter_block2d_bil_var_mmx( src_ptr + 8, src_pixels_per_line, dst_ptr + 8, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum1, &xxsum1 ); @@ -371,7 +354,7 @@ unsigned int vp8_sub_pixel_variance8x16_mmx vp8_filter_block2d_bil_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum, &xxsum ); *sse = xxsum; diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index 24062eb9b..39213b03d 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -12,11 +12,12 @@ #include "vp8/encoder/variance.h" #include "vp8/common/pragmas.h" #include "vpx_ports/mem.h" +#include "vp8/common/x86/filter_x86.h" -extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int 
output_width, short *vp7_filter); +extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); extern void vp8_filter_block2d_bil4x4_var_mmx ( @@ -135,8 +136,6 @@ void vp8_half_vert_variance16x_h_sse2 unsigned int *sumsquared ); -DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]); - unsigned int vp8_variance4x4_wmt( const unsigned char *src_ptr, int source_stride, @@ -262,7 +261,7 @@ unsigned int vp8_sub_pixel_variance4x4_wmt vp8_filter_block2d_bil4x4_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum, &xxsum ); *sse = xxsum; diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 5c15a3e4f..0e564320f 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -9,7 +9,6 @@ ## VP8_COMMON_SRCS-yes += vp8_common.mk -VP8_COMMON_SRCS-yes += common/type_aliases.h VP8_COMMON_SRCS-yes += common/pragmas.h VP8_COMMON_SRCS-yes += common/ppflags.h VP8_COMMON_SRCS-yes += common/onyx.h @@ -20,6 +19,8 @@ VP8_COMMON_SRCS-yes += common/blockd.c VP8_COMMON_SRCS-yes += common/coefupdateprobs.h VP8_COMMON_SRCS-yes += common/debugmodes.c VP8_COMMON_SRCS-yes += common/default_coef_probs.h +VP8_COMMON_SRCS-yes += common/dequantize.c +VP8_COMMON_SRCS-yes += common/dequantize.h VP8_COMMON_SRCS-yes += common/entropy.c VP8_COMMON_SRCS-yes += common/entropymode.c VP8_COMMON_SRCS-yes += common/entropymv.c @@ -28,17 +29,16 @@ VP8_COMMON_SRCS-yes += common/filter.c VP8_COMMON_SRCS-yes += common/filter.h VP8_COMMON_SRCS-yes += common/findnearmv.c VP8_COMMON_SRCS-yes += common/generic/systemdependent.c +VP8_COMMON_SRCS-yes += common/idct_blk.c VP8_COMMON_SRCS-yes += common/idctllm.c VP8_COMMON_SRCS-yes += common/alloccommon.h VP8_COMMON_SRCS-yes += common/blockd.h VP8_COMMON_SRCS-yes += common/common.h -VP8_COMMON_SRCS-yes += common/common_types.h VP8_COMMON_SRCS-yes += common/entropy.h VP8_COMMON_SRCS-yes += common/entropymode.h VP8_COMMON_SRCS-yes += common/entropymv.h VP8_COMMON_SRCS-yes += common/extend.h VP8_COMMON_SRCS-yes += common/findnearmv.h -VP8_COMMON_SRCS-yes += common/g_common.h VP8_COMMON_SRCS-yes += common/header.h VP8_COMMON_SRCS-yes += common/idct.h VP8_COMMON_SRCS-yes += common/invtrans.h @@ -57,7 +57,6 @@ VP8_COMMON_SRCS-yes += common/swapyv12buffer.h VP8_COMMON_SRCS-yes += common/systemdependent.h VP8_COMMON_SRCS-yes += common/threading.h VP8_COMMON_SRCS-yes += common/treecoder.h -VP8_COMMON_SRCS-yes += common/invtrans.c VP8_COMMON_SRCS-yes += common/loopfilter.c VP8_COMMON_SRCS-yes += common/loopfilter_filters.c VP8_COMMON_SRCS-yes += common/mbpitch.c @@ -69,9 +68,15 @@ VP8_COMMON_SRCS-yes += common/reconintra.c 
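The variance code above now pulls its two-tap weights from the shared vp8_bilinear_filters_x86_4 table added in the new common/x86/filter_x86 files instead of a file-local copy. As a rough scalar reference for what those SIMD routines compute (the tap pairs match the removed vp8_vp7_bilinear_filters_mmx entries, where each tap is stored four times over for the SIMD width; the helper name here is illustrative, not part of libvpx):

    /* Two-tap bilinear filter over one row, as a scalar sketch.
     * Taps for each 1/8-pel offset sum to 128 (VP8_FILTER_WEIGHT) and
     * results are rounded with (x + 64) >> 7 (VP8_FILTER_SHIFT). */
    #define VP8_FILTER_SHIFT  7
    #define VP8_FILTER_WEIGHT 128

    static const short bilinear_taps[8][2] = {
        { 128,   0 }, { 112,  16 }, { 96, 32 }, { 80, 48 },
        {  64,  64 }, {  48,  80 }, { 32, 96 }, { 16, 112 }
    };

    static void bilinear_filter_row(const unsigned char *src,
                                    unsigned char *dst,
                                    int width, int offset /* 0..7 */)
    {
        const short f0 = bilinear_taps[offset][0];
        const short f1 = bilinear_taps[offset][1];
        int i;

        for (i = 0; i < width; i++)
        {
            int sum = src[i] * f0 + src[i + 1] * f1
                      + (VP8_FILTER_WEIGHT / 2);
            dst[i] = (unsigned char)(sum >> VP8_FILTER_SHIFT);
        }
    }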
VP8_COMMON_SRCS-yes += common/reconintra4x4.c VP8_COMMON_SRCS-yes += common/setupintrarecon.c VP8_COMMON_SRCS-yes += common/swapyv12buffer.c + + + VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c VP8_COMMON_SRCS-yes += common/treecoder.c +VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/dequantize_x86.h +VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c +VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/recon_x86.h @@ -82,11 +87,14 @@ VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c @@ -104,8 +112,6 @@ endif # common (c) VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c -VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c -VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/idct_arm.h VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c @@ -113,8 +119,12 @@ VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.h VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/recon_arm.h VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/subpixel_arm.h +VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c +VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.h # common (armv6) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.c +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.h VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM) @@ -127,6 +137,9 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/intra4x4_predict_v6$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dequant_idct_v6$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dequantize_v6$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_blk_v6.c # common (neon) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict4x4_neon$(ASM) @@ -149,3 +162,8 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM) 
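The dequantization and inverse-transform sources listed above move out of the decoder into common so encoder and decoder builds can share them. As a minimal scalar sketch of what the block dequantizer behind the ARMv6/NEON files computes (the real routines operate on BLOCKD fields and assembly-level layouts; this standalone form is illustrative):

    /* Dequantize one 4x4 block: scale each of the 16 quantized
     * coefficients by its per-position dequantization factor. */
    static void dequantize_block(const short *qcoeff,   /* quantized input */
                                 const short *dequant,  /* per-position factors */
                                 short *dqcoeff)        /* dequantized output */
    {
        int i;

        for (i = 0; i < 16; i++)
            dqcoeff[i] = qcoeff[i] * dequant[i];
    }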
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dequant_idct_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/idct_dequant_full_2x_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/idct_dequant_0_2x_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dequantizeb_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/idct_blk_neon.c diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 7260e942b..6181ee8ee 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -83,7 +83,7 @@ struct vpx_codec_alg_priv vpx_codec_enc_cfg_t cfg; struct vp8_extracfg vp8_cfg; VP8_CONFIG oxcf; - VP8_PTR cpi; + struct VP8_COMP *cpi; unsigned char *cx_data; unsigned int cx_data_sz; vpx_image_t preview_img; @@ -137,7 +137,8 @@ update_error_state(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, - const struct vp8_extracfg *vp8_cfg) + const struct vp8_extracfg *vp8_cfg, + int finalize) { RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */ RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */ @@ -193,6 +194,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6); RANGE_CHECK(vp8_cfg, arnr_type, 1, 3); RANGE_CHECK(vp8_cfg, cq_level, 0, 63); + if(finalize && cfg->rc_end_usage == VPX_CQ) + RANGE_CHECK(vp8_cfg, cq_level, + cfg->rc_min_quantizer, cfg->rc_max_quantizer); #if !(CONFIG_REALTIME_ONLY) if (cfg->g_pass == VPX_RC_LAST_PASS) @@ -264,21 +268,15 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, vpx_codec_enc_cfg_t cfg, - struct vp8_extracfg vp8_cfg) + struct vp8_extracfg vp8_cfg, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { oxcf->multi_threaded = cfg.g_threads; oxcf->Version = cfg.g_profile; oxcf->Width = cfg.g_w; oxcf->Height = cfg.g_h; - /* guess a frame rate if out of whack, use 30 */ - oxcf->frame_rate = (double)(cfg.g_timebase.den) / - (double)(cfg.g_timebase.num); - - if (oxcf->frame_rate > 180) - { - oxcf->frame_rate = 30; - } + oxcf->timebase = cfg.g_timebase; oxcf->error_resilient_mode = cfg.g_error_resilient; @@ -362,6 +360,21 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id)); } +#if CONFIG_MULTI_RES_ENCODING + /* When mr_cfg is NULL, oxcf->mr_total_resolutions and oxcf->mr_encoder_id + * are both memset to 0, which ensures the correct logic under this + * situation. 
+ */ + if(mr_cfg) + { + oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions; + oxcf->mr_encoder_id = mr_cfg->mr_encoder_id; + oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num; + oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den; + oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info; + } +#endif + //oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile; //strcpy(oxcf->first_pass_file, cfg.g_firstpass_file); @@ -434,12 +447,12 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)) ERROR("Cannot increase lag_in_frames"); - res = validate_config(ctx, cfg, &ctx->vp8_cfg); + res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0); if (!res) { ctx->cfg = *cfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); } @@ -500,26 +513,50 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, } - res = validate_config(ctx, &ctx->cfg, &xcfg); + res = validate_config(ctx, &ctx->cfg, &xcfg, 0); if (!res) { ctx->vp8_cfg = xcfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); } return res; #undef MAP } -static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) + +static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc) +{ + vpx_codec_err_t res = 0; + +#if CONFIG_MULTI_RES_ENCODING + int mb_rows = ((cfg->g_w + 15) >>4); + int mb_cols = ((cfg->g_h + 15) >>4); + + *mem_loc = calloc(mb_rows*mb_cols, sizeof(LOWER_RES_INFO)); + if(!(*mem_loc)) + { + free(*mem_loc); + res = VPX_CODEC_MEM_ERROR; + } + else + res = VPX_CODEC_OK; +#endif + + return res; +} + +static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { vpx_codec_err_t res = VPX_DEC_OK; struct vpx_codec_alg_priv *priv; vpx_codec_enc_cfg_t *cfg; unsigned int i; - VP8_PTR optr; + struct VP8_COMP *optr; if (!ctx->priv) { @@ -573,13 +610,20 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) vp8_initialize(); - res = validate_config(priv, &priv->cfg, &priv->vp8_cfg); + res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0); if (!res) { + if(mr_cfg) + ctx->priv->enc.total_encoders = mr_cfg->mr_total_resolutions; + else + ctx->priv->enc.total_encoders = 1; + set_vp8e_config(&ctx->priv->alg_priv->oxcf, ctx->priv->alg_priv->cfg, - ctx->priv->alg_priv->vp8_cfg); + ctx->priv->alg_priv->vp8_cfg, + mr_cfg); + optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf); if (!optr) @@ -594,6 +638,11 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) { +#if CONFIG_MULTI_RES_ENCODING + /* Free multi-encoder shared memory */ + if (ctx->oxcf.mr_total_resolutions > 0 && (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions-1)) + free(ctx->oxcf.mr_low_res_mode_info); +#endif free(ctx->cx_data); vp8_remove_compressor(&ctx->cpi); @@ -691,6 +740,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (img) res = validate_img(ctx, img); + if (!res) + res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1); + pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -1230,6 +1282,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = vp8e_set_config, NOT_IMPLEMENTED, vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; @@ -1314,5 +1367,6 @@ 
vpx_codec_iface_t vpx_enc_vp8_algo = vp8e_set_config, NOT_IMPLEMENTED, vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index cdfcd2142..fbe58171c 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -57,7 +57,7 @@ struct vpx_codec_alg_priv vp8_stream_info_t si; int defer_alloc; int decoder_init; - VP8D_PTR pbi; + struct VP8D_COMP *pbi; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; #if CONFIG_POSTPROC_VISUALIZER @@ -181,9 +181,11 @@ static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) /* nothing to clean up */ } -static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) +static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; + (void) data; /* This function only allocates space for the vpx_codec_alg_priv_t * structure. More memory may be required at the time the stream @@ -387,7 +389,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (!res) { VP8D_CONFIG oxcf; - VP8D_PTR optr; + struct VP8D_COMP* optr; vp8dx_initialize(); @@ -564,7 +566,7 @@ static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx, if (done && !res) { vp8_finalize_mmaps(ctx->priv->alg_priv); - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); } return res; diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index b71a54aea..2d99981f5 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -86,6 +86,8 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c VP8_CX_SRCS-yes += encoder/temporal_filter.h +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk index d88b595fb..d6dc15305 100644 --- a/vp8/vp8dx.mk +++ b/vp8/vp8dx.mk @@ -52,7 +52,6 @@ VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c VP8_DX_SRCS-yes += decoder/dboolhuff.c VP8_DX_SRCS-yes += decoder/decodemv.c VP8_DX_SRCS-yes += decoder/decodframe.c -VP8_DX_SRCS-yes += decoder/dequantize.c VP8_DX_SRCS-yes += decoder/detokenize.c VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h @@ -61,20 +60,14 @@ VP8_DX_SRCS-yes += decoder/generic/dsystemdependent.c VP8_DX_SRCS-yes += decoder/dboolhuff.h VP8_DX_SRCS-yes += decoder/decodemv.h VP8_DX_SRCS-yes += decoder/decoderthreading.h -VP8_DX_SRCS-yes += decoder/dequantize.h VP8_DX_SRCS-yes += decoder/detokenize.h VP8_DX_SRCS-yes += decoder/onyxd_int.h VP8_DX_SRCS-yes += decoder/treereader.h VP8_DX_SRCS-yes += decoder/onyxd_if.c VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c -VP8_DX_SRCS-yes += decoder/idct_blk.c VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes)) -VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/dequantize_x86.h VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c -VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm -VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c -VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk index 6bde42f4c..fa1aaea0b 100644 --- a/vp8/vp8dx_arm.mk +++ b/vp8/vp8dx_arm.mk @@ -12,20 +12,3 @@ 
#VP8_DX_SRCS list is modified according to different platforms. VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/arm_dsystemdependent.c -VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c -VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.h - -#File list for armv6 -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c - -#File list for neon -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c diff --git a/vp8_multi_resolution_encoder.c b/vp8_multi_resolution_encoder.c new file mode 100644 index 000000000..11c33d618 --- /dev/null +++ b/vp8_multi_resolution_encoder.c @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * This is an example demonstrating multi-resolution encoding in VP8. + * High-resolution input video is down-sampled to lower-resolutions. The + * encoder then encodes the video and outputs multiple bitstreams with + * different resolutions. + */ +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include "math.h" +#define VPX_CODEC_DISABLE_COMPAT 1 +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" +#include "vpx_ports/mem_ops.h" +#define interface (vpx_codec_vp8_cx()) +#define fourcc 0x30385056 + +#define IVF_FILE_HDR_SZ (32) +#define IVF_FRAME_HDR_SZ (12) + +/* + * The input video frame is downsampled several times to generate a multi-level + * hierarchical structure. NUM_ENCODERS is defined as the number of encoding + * levels required. For example, if the size of input video is 1280x720, + * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3 + * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and + * 320x180(level 2) respectively. + */ +#define NUM_ENCODERS 3 + +/* This example uses the scaler function in libyuv. */ +#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "third_party/libyuv/include/libyuv/scale.h" +#include "third_party/libyuv/include/libyuv/cpu_id.h" + +static double vp8_mse2psnr(double Samples, double Peak, double Mse) +{ + double psnr; + + if ((double)Mse > 0.0) + psnr = 10.0 * log10(Peak * Peak * Samples / Mse); + else + psnr = 60; // Limit to prevent / 0 + + if (psnr > 60) + psnr = 60; + + return psnr; +} + +static void die(const char *fmt, ...) 
{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + if(fmt[strlen(fmt)-1] != '\n') + printf("\n"); + exit(EXIT_FAILURE); +} + +static void die_codec(vpx_codec_ctx_t *ctx, const char *s) { + const char *detail = vpx_codec_error_detail(ctx); + + printf("%s: %s\n", s, vpx_codec_error(ctx)); + if(detail) + printf(" %s\n",detail); + exit(EXIT_FAILURE); +} + +int (*read_frame_p)(FILE *f, vpx_image_t *img); + +static int read_frame(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + + to_read = img->w*img->h*3/2; + nbytes = fread(img->planes[0], 1, to_read, f); + if(nbytes != to_read) { + res = 0; + if(nbytes > 0) + printf("Warning: Read partial frame. Check your width & height!\n"); + } + return res; +} + +static int read_frame_by_row(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + int plane; + + for (plane = 0; plane < 3; plane++) + { + unsigned char *ptr; + int w = (plane ? (1 + img->d_w) / 2 : img->d_w); + int h = (plane ? (1 + img->d_h) / 2 : img->d_h); + int r; + + /* Determine the correct plane based on the image format. The for-loop + * always counts in Y,U,V order, but this may not match the order of + * the data on disk. + */ + switch (plane) + { + case 1: + ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U]; + break; + case 2: + ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V]; + break; + default: + ptr = img->planes[plane]; + } + + for (r = 0; r < h; r++) + { + to_read = w; + + nbytes = fread(ptr, 1, to_read, f); + if(nbytes != to_read) { + res = 0; + if(nbytes > 0) + printf("Warning: Read partial frame. Check your width & height!\n"); + break; + } + + ptr += img->stride[plane]; + } + if (!res) + break; + } + + return res; +} + +static void write_ivf_file_header(FILE *outfile, + const vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + char header[32]; + + if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS) + return; + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header+4, 0); /* version */ + mem_put_le16(header+6, 32); /* headersize */ + mem_put_le32(header+8, fourcc); /* headersize */ + mem_put_le16(header+12, cfg->g_w); /* width */ + mem_put_le16(header+14, cfg->g_h); /* height */ + mem_put_le32(header+16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header+20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header+24, frame_cnt); /* length */ + mem_put_le32(header+28, 0); /* unused */ + + if(fwrite(header, 1, 32, outfile)); +} + +static void write_ivf_frame_header(FILE *outfile, + const vpx_codec_cx_pkt_t *pkt) +{ + char header[12]; + vpx_codec_pts_t pts; + + if(pkt->kind != VPX_CODEC_CX_FRAME_PKT) + return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header+4, pts&0xFFFFFFFF); + mem_put_le32(header+8, pts >> 32); + + if(fwrite(header, 1, 12, outfile)); +} + +int main(int argc, char **argv) +{ + FILE *infile, *outfile[NUM_ENCODERS]; + vpx_codec_ctx_t codec[NUM_ENCODERS]; + vpx_codec_enc_cfg_t cfg[NUM_ENCODERS]; + vpx_codec_pts_t frame_cnt = 0; + vpx_image_t raw[NUM_ENCODERS]; + vpx_codec_err_t res[NUM_ENCODERS]; + + int i; + long width; + long height; + int frame_avail; + int got_data; + int flags = 0; + + /*Currently, only realtime mode is supported in multi-resolution encoding.*/ + int arg_deadline = VPX_DL_REALTIME; + + /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you + don't need to know PSNR, which will skip PSNR calculation and save + encoding time. 
*/ + int show_psnr = 0; + uint64_t psnr_sse_total[NUM_ENCODERS] = {0}; + uint64_t psnr_samples_total[NUM_ENCODERS] = {0}; + double psnr_totals[NUM_ENCODERS][4] = {{0,0}}; + int psnr_count[NUM_ENCODERS] = {0}; + + /* Set the required target bitrates for each resolution level. */ + unsigned int target_bitrate[NUM_ENCODERS]={1400, 500, 100}; + /* Enter the frame rate of the input video */ + int framerate = 30; + /* Set down-sampling factor for each resolution level. + dsf[0] controls down sampling from level 0 to level 1; + dsf[1] controls down sampling from level 1 to level 2; + dsf[2] is not used. */ + vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}}; + + if(argc!= (5+NUM_ENCODERS)) + die("Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n", + argv[0]); + + printf("Using %s\n",vpx_codec_iface_name(interface)); + + width = strtol(argv[1], NULL, 0); + height = strtol(argv[2], NULL, 0); + + if(width < 16 || width%2 || height <16 || height%2) + die("Invalid resolution: %ldx%ld", width, height); + + /* Open input video file for encoding */ + if(!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading", argv[3]); + + /* Open output file for each encoder to output bitstreams */ + for (i=0; i< NUM_ENCODERS; i++) + { + if(!(outfile[i] = fopen(argv[i+4], "wb"))) + die("Failed to open %s for writing", argv[i+4]); + } + + show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0); + + /* Populate default encoder configuration */ + for (i=0; i< NUM_ENCODERS; i++) + { + res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0); + if(res[i]) { + printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i])); + return EXIT_FAILURE; + } + } + + /* + * Update the default configuration according to needs of the application. + */ + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].g_threads = 1; /* number of threads used */ + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 4; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 98; + cfg[0].rc_overshoot_pct = 100; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + //cfg[0].rc_dropframe_thresh = 10; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + /* Disable automatic keyframe placement */ + //cfg[0].kf_mode = VPX_KF_DISABLED; + cfg[0].kf_min_dist = cfg[0].kf_max_dist = 1000; + + cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + /* Other-resolution encoder settings */ + for (i=1; i< NUM_ENCODERS; i++) + { + memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t)); + + cfg[i].g_threads = 1; /* number of threads used */ + cfg[i].rc_target_bitrate = target_bitrate[i]; + + /* Note: Width & height of other-resolution encoders are calculated + * from the highest-resolution encoder's size and the corresponding + * down_sampling_factor. + */ + { + unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1; + unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1; + cfg[i].g_w = iw/dsf[i-1].num; + cfg[i].g_h = ih/dsf[i-1].num; + } + + /* Make width & height to be multiplier of 2. */ + // Should support odd size ??? 
+ if((cfg[i].g_w)%2)cfg[i].g_w++; + if((cfg[i].g_h)%2)cfg[i].g_h++; + } + + /* Allocate image for each encoder */ + for (i=0; i< NUM_ENCODERS; i++) + if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) + die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); + + if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w) + read_frame_p = read_frame; + else + read_frame_p = read_frame_by_row; + + for (i=0; i< NUM_ENCODERS; i++) + write_ivf_file_header(outfile[i], &cfg[i], 0); + + /* Initialize multi-encoder */ + if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS, + (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0])) + die_codec(&codec[0], "Failed to initialize encoder"); + + /* The extra encoding configuration parameters can be set as follows. */ + /* Set encoding speed */ + for ( i=0; i<NUM_ENCODERS; i++) + { + int speed = -6; + if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed)) + die_codec(&codec[i], "Failed to set cpu_used"); + } + /* Set static thresh for highest-resolution encoder. Set it to 1000 for + * better performance. */ + { + unsigned int static_thresh = 1000; + if(vpx_codec_control(&codec[0], VP8E_SET_STATIC_THRESHOLD, static_thresh)) + die_codec(&codec[0], "Failed to set static threshold"); + } + /* Set static thresh = 0 for other encoders for better quality */ + for ( i=1; i<NUM_ENCODERS; i++) + { + unsigned int static_thresh = 0; + if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh)) + die_codec(&codec[i], "Failed to set static threshold"); + } + + frame_avail = 1; + got_data = 0; + + while(frame_avail || got_data) + { + vpx_codec_iter_t iter[NUM_ENCODERS]={NULL}; + const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS]; + + flags = 0; + frame_avail = read_frame_p(infile, &raw[0]); + + if(frame_avail) + { + for ( i=1; i<NUM_ENCODERS; i++) + { + /*Scale the image down a number of times by downsampling factor*/ + /* FilterMode 1 or 2 give better psnr than FilterMode 0. */ + I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y], + raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U], + raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V], + raw[i-1].d_w, raw[i-1].d_h, + raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y], + raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U], + raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V], + raw[i].d_w, raw[i].d_h, 1); + } + } + + /* Encode each frame at multi-levels */ + if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL, + frame_cnt, 1, flags, arg_deadline)) + die_codec(&codec[0], "Failed to encode frame"); + + for (i=NUM_ENCODERS-1; i>=0 ; i--) + { + got_data = 0; + + while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) ) + { + got_data = 1; + switch(pkt[i]->kind) { + case VPX_CODEC_CX_FRAME_PKT: + write_ivf_frame_header(outfile[i], pkt[i]); + if(fwrite(pkt[i]->data.frame.buf, 1, pkt[i]->data.frame.sz, + outfile[i])); + break; + case VPX_CODEC_PSNR_PKT: + if (show_psnr) + { + int j; + + psnr_sse_total[i] += pkt[i]->data.psnr.sse[0]; + psnr_samples_total[i] += pkt[i]->data.psnr.samples[0]; + for (j = 0; j < 4; j++) + { + //fprintf(stderr, "%.3lf ", pkt[i]->data.psnr.psnr[j]); + psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j]; + } + psnr_count[i]++; + } + + break; + default: + break; + } + printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT + && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? 
"K":"."); + fflush(stdout); + } + } + frame_cnt++; + } + printf("\n"); + + fclose(infile); + + for (i=0; i< NUM_ENCODERS; i++) + { + printf("Processed %ld frames.\n",(long int)frame_cnt-1); + + /* Calculate PSNR and print it out */ + if ( (show_psnr) && (psnr_count[i]>0) ) + { + int j; + double ovpsnr = vp8_mse2psnr(psnr_samples_total[i], 255.0, + psnr_sse_total[i]); + + fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i); + + fprintf(stderr, " %.3lf", ovpsnr); + for (j = 0; j < 4; j++) + { + fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]); + } + } + + if(vpx_codec_destroy(&codec[i])) + die_codec(&codec[i], "Failed to destroy codec"); + + /* Try to rewrite the file header with the actual frame count */ + if(!fseek(outfile[i], 0, SEEK_SET)) + write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1); + fclose(outfile[i]); + + vpx_img_free(&raw[i]); + } + + return EXIT_SUCCESS; +} diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index a1ff1921e..0703d6a4f 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -56,9 +56,10 @@ * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_CODEC_INTERNAL_ABI_VERSION (3) /**<\hideinitializer*/ +#define VPX_CODEC_INTERNAL_ABI_VERSION (4) /**<\hideinitializer*/ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; +typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; /*!\brief init function pointer prototype * @@ -73,7 +74,8 @@ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; * \retval #VPX_CODEC_MEM_ERROR * Memory operation failed. */ -typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx); +typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data); /*!\brief destroy function pointer prototype * @@ -264,6 +266,10 @@ typedef vpx_fixed_buf_t * typedef vpx_image_t * (*vpx_codec_get_preview_frame_fn_t)(vpx_codec_alg_priv_t *ctx); +typedef vpx_codec_err_t +(*vpx_codec_enc_mr_get_mem_loc_fn_t)(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc); + /*!\brief usage configuration mapping * * This structure stores the mapping between usage identifiers and @@ -309,8 +315,9 @@ struct vpx_codec_iface vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ vpx_codec_enc_config_set_fn_t cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ - vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ + vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ vpx_codec_get_preview_frame_fn_t get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ + vpx_codec_enc_mr_get_mem_loc_fn_t mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ } enc; }; @@ -353,9 +360,21 @@ struct vpx_codec_priv unsigned int cx_data_pad_before; unsigned int cx_data_pad_after; vpx_codec_cx_pkt_t cx_data_pkt; + unsigned int total_encoders; } enc; }; +/* + * Multi-resolution encoding internal configuration + */ +struct vpx_codec_priv_enc_mr_cfg +{ + unsigned int mr_total_resolutions; + unsigned int mr_encoder_id; + struct vpx_rational mr_down_sampling_factor; + void* mr_low_res_mode_info; +}; + #undef VPX_CTRL_USE_TYPE #define VPX_CTRL_USE_TYPE(id, typ) \ static typ id##__value(va_list args) {return va_arg(args, typ);} \ diff --git a/vpx/src/vpx_decoder.c 
b/vpx/src/vpx_decoder.c index 5d31c2c49..59a783dd9 100644 --- a/vpx/src/vpx_decoder.c +++ b/vpx/src/vpx_decoder.c @@ -56,7 +56,7 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, if (!(flags & VPX_CODEC_USE_XMA)) { - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); if (res) { diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 5e86835ea..03ddc62b2 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -51,7 +51,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, ctx->priv = NULL; ctx->init_flags = flags; ctx->config.enc = cfg; - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); if (res) { @@ -66,6 +66,85 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } +vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + int num_enc, + vpx_codec_flags_t flags, + vpx_rational_t *dsf, + int ver) +{ + vpx_codec_err_t res = 0; + + if (ver != VPX_ENCODER_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1)) + res = VPX_CODEC_INVALID_PARAM; + else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_PSNR) + && !(iface->caps & VPX_CODEC_CAP_PSNR)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) + && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) + res = VPX_CODEC_INCAPABLE; + else + { + int i; + void *mem_loc = NULL; + + if(!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) + { + for (i = 0; i < num_enc; i++) + { + vpx_codec_priv_enc_mr_cfg_t mr_cfg; + + /* Validate down-sampling factor. */ + if(dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || + dsf->den > dsf->num) + { + res = VPX_CODEC_INVALID_PARAM; + break; + } + + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc-1-i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); + + if (res) + { + ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; + vpx_codec_destroy(ctx); + } + + if (ctx->priv) + ctx->priv->iface = ctx->iface; + + if (res) + break; + + ctx++; + cfg++; + dsf++; + } + } + } + + return SAVE_STATUS(ctx, res); +} vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, @@ -123,7 +202,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, vpx_enc_frame_flags_t flags, unsigned long deadline) { - vpx_codec_err_t res; + vpx_codec_err_t res = 0; if (!ctx || (img && !duration)) res = VPX_CODEC_INVALID_PARAM; @@ -136,9 +215,37 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, /* Execute in a normalized floating point environment, if the platform * requires it. 
*/ + unsigned int num_enc =ctx->priv->enc.total_encoders; + FLOATING_POINT_INIT(); - res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, - duration, flags, deadline); + + if (num_enc == 1) + res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, + duration, flags, deadline); + else + { + /* Multi-resolution encoding: + * Encode multi-levels in reverse order. For example, + * if mr_total_resolutions = 3, first encode level 2, + * then encode level 1, and finally encode level 0. + */ + int i; + + ctx += num_enc - 1; + if (img) img += num_enc - 1; + + for (i = num_enc-1; i >= 0; i--) + { + if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, + duration, flags, deadline))) + break; + + ctx--; + if (img) img--; + } + ctx++; + } + FLOATING_POINT_RESTORE(); } diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c index 7a4e27062..336b6e29d 100644 --- a/vpx/src/vpx_image.c +++ b/vpx/src/vpx_image.c @@ -13,10 +13,42 @@ #include <string.h> #include "vpx/vpx_image.h" +#define ADDRESS_STORAGE_SIZE sizeof(size_t) +/*returns an addr aligned to the byte boundary specified by align*/ +#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align)) + +/* Memalign code is copied from vpx_mem.c */ +static void *img_buf_memalign(size_t align, size_t size) +{ + void *addr, + * x = NULL; + + addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE); + + if (addr) + { + x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align); + /* save the actual malloc address */ + ((size_t *)x)[-1] = (size_t)addr; + } + + return x; +} + +static void img_buf_free(void *memblk) +{ + if (memblk) + { + void *addr = (void *)(((size_t *)memblk)[-1]); + free(addr); + } +} + static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, + unsigned int buf_align, unsigned int stride_align, unsigned char *img_data) { @@ -25,6 +57,14 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, int align; /* Treat align==0 like align==1 */ + if (!buf_align) + buf_align = 1; + + /* Validate alignment (must be power of 2) */ + if (buf_align & (buf_align - 1)) + goto fail; + + /* Treat align==0 like align==1 */ if (!stride_align) stride_align = 1; @@ -119,7 +159,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, if (!img_data) { - img->img_data = malloc((fmt & VPX_IMG_FMT_PLANAR) ? h * w * bps / 8 : h * s); + img->img_data = img_buf_memalign(buf_align, ((fmt & VPX_IMG_FMT_PLANAR)? + h * s * bps / 8 : h * s)); img->img_data_owner = 1; } @@ -150,9 +191,9 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, - unsigned int stride_align) + unsigned int align) { - return img_alloc_helper(img, fmt, d_w, d_h, stride_align, NULL); + return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL); } vpx_image_t *vpx_img_wrap(vpx_image_t *img, @@ -162,7 +203,9 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, unsigned int stride_align, unsigned char *img_data) { - return img_alloc_helper(img, fmt, d_w, d_h, stride_align, img_data); + /* By setting buf_align = 1, we don't change buffer alignment in this + * function. 
*/ + return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data); } int vpx_img_set_rect(vpx_image_t *img, @@ -254,7 +297,7 @@ void vpx_img_free(vpx_image_t *img) if (img) { if (img->img_data && img->img_data_owner) - free(img->img_data); + img_buf_free(img->img_data); if (img->self_allocd) free(img); diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 87ab20c75..885ca229f 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -634,7 +634,6 @@ extern "C" { * then ts_layer_id = (0,1,0,1,0,1,0,1). */ unsigned int ts_layer_id[MAX_PERIODICITY]; - } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ @@ -675,6 +674,48 @@ extern "C" { vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION) + /*!\brief Initialize multi-encoder instance + * + * Initializes multi-encoder context using the given interface. + * Applications should call the vpx_codec_enc_init_multi convenience macro + * instead of this function directly, to ensure that the ABI version number + * parameter is properly initialized. + * + * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags + * parameter), the storage pointed to by the cfg parameter must be + * kept readable and stable until all memory maps have been set. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] num_enc Total number of encoders. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] dsf Pointer to down-sampling factors. + * \param[in] ver ABI version number. Must be set to + * VPX_ENCODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ + vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + int num_enc, + vpx_codec_flags_t flags, + vpx_rational_t *dsf, + int ver); + + + /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \ + vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \ + VPX_ENCODER_ABI_VERSION) + + /*!\brief Get a default configuration * * Initializes a encoder configuration structure with default values. Supports @@ -780,7 +821,6 @@ extern "C" { vpx_enc_frame_flags_t flags, unsigned long deadline); - /*!\brief Set compressed data output buffer * * Sets the buffer that the codec should output the compressed data diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h index 8e08b3642..3e424470f 100644 --- a/vpx/vpx_image.h +++ b/vpx/vpx_image.h @@ -160,7 +160,8 @@ extern "C" { * \param[in] fmt Format for the image * \param[in] d_w Width of the image * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of each row in the image. + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image(stride). * * \return Returns a pointer to the initialized image descriptor. 
If the img * parameter is non-null, the value of the img parameter will be diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h index 608760f8b..218bca773 100644 --- a/vpx/vpx_integer.h +++ b/vpx/vpx_integer.h @@ -29,16 +29,8 @@ typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; #endif -#ifdef HAVE_ARMV6 -typedef unsigned int int_fast16_t; -#else -typedef signed short int_fast16_t; -#endif -typedef signed char int_fast8_t; -typedef unsigned char uint_fast8_t; - #ifndef _UINTPTR_T_DEFINED -typedef unsigned int uintptr_t; +typedef size_t uintptr_t; #endif #else @@ -32,7 +32,7 @@ #include "nestegg/include/nestegg/nestegg.h" #if CONFIG_OS_SUPPORT -#if defined(_WIN32) +#if defined(_MSC_VER) #include <io.h> #define snprintf _snprintf #define isatty _isatty @@ -47,9 +47,11 @@ typedef __int64 off_t; #define fseeko _fseeki64 #define ftello _ftelli64 #elif defined(_WIN32) -/* MinGW defines off_t, and uses f{seek,tell}o64 */ +/* MinGW defines off_t as long + and uses f{seek,tell}o64/off64_t for large files */ #define fseeko fseeko64 #define ftello ftello64 +#define off_t off64_t #endif #if defined(_MSC_VER) @@ -805,7 +807,7 @@ write_webm_file_footer(EbmlGlobal *glob, long hash) { EbmlLoc start; - int i; + unsigned int i; glob->cue_pos = ftello(glob->stream); Ebml_StartSubElement(glob, &start, Cues); @@ -1440,7 +1442,8 @@ static void show_rate_histogram(struct rate_hist *hist, show_histogram(hist->bucket, buckets, hist->total, scale); } -#define ARG_CTRL_CNT_MAX 10 +#define NELEMENTS(x) (sizeof(x)/sizeof(x[0])) +#define ARG_CTRL_CNT_MAX NELEMENTS(vp8_arg_ctrl_map) int main(int argc, const char **argv_) { @@ -1719,14 +1722,26 @@ int main(int argc, const char **argv_) { if (arg_match(&arg, ctrl_args[i], argi)) { + int j; match = 1; - if (arg_ctrl_cnt < ARG_CTRL_CNT_MAX) + /* Point either to the next free element or the first + * instance of this control. + */ + for(j=0; j<arg_ctrl_cnt; j++) + if(arg_ctrls[j][0] == ctrl_args_map[i]) + break; + + /* Update/insert */ + assert(j < ARG_CTRL_CNT_MAX); + if (j < ARG_CTRL_CNT_MAX) { - arg_ctrls[arg_ctrl_cnt][0] = ctrl_args_map[i]; - arg_ctrls[arg_ctrl_cnt][1] = arg_parse_enum_or_int(&arg); - arg_ctrl_cnt++; + arg_ctrls[j][0] = ctrl_args_map[i]; + arg_ctrls[j][1] = arg_parse_enum_or_int(&arg); + if(j == arg_ctrl_cnt) + arg_ctrl_cnt++; } + } }
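The update/insert pass above stops repeated codec-control arguments from overflowing vpxenc's fixed-size table: the last occurrence of a control ID overwrites the earlier entry instead of appending a duplicate. A standalone sketch of the same pattern, assuming a hypothetical (id, value) table rather than vpxenc's own arrays:

    #define CTRL_CNT_MAX 16

    /* Update an existing (id, value) entry or append a new one.
     * Returns 0 on success, -1 if the table is full. */
    static int set_ctrl(int ctrls[][2], int *cnt, int id, int value)
    {
        int j;

        /* Find the first entry with this id, or the next free slot. */
        for (j = 0; j < *cnt; j++)
            if (ctrls[j][0] == id)
                break;

        if (j >= CTRL_CNT_MAX)
            return -1;

        ctrls[j][0] = id;
        ctrls[j][1] = value;
        if (j == *cnt)
            (*cnt)++;   /* appended rather than updated */

        return 0;
    }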
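Elsewhere in this change, the multi-resolution encoder example derives each lower resolution from the level above by a rounded-up division by the down-sampling factor, then forces the result even. A small sketch of that arithmetic (hypothetical helper; with dsf = 2/1, a 1280x720 input yields 640x360, then 320x180):

    /* Scale one dimension down by num/den, rounding up and keeping it even. */
    static unsigned int scale_dim(unsigned int dim,
                                  unsigned int num, unsigned int den)
    {
        unsigned int d = (dim * den + num - 1) / num;  /* ceiling division */

        if (d & 1)
            d++;                                       /* keep dimensions even */

        return d;
    }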