31 files changed, 2621 insertions, 1572 deletions
diff --git a/CHANGELOG b/CHANGELOG
index b8da8f8e3..e8760d19f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,80 @@
+2011-03-07 v0.9.6 "Bali"
+  Our second named release, focused on a faster, higher-quality encoder.
+
+  - Upgrading:
+    This release is backwards compatible with Aylesbury (v0.9.5). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+  - Enhancements:
+      vpxenc --psnr shows a summary when encode completes
+      --tune=ssim option to enable activity masking
+      improved postproc visualizations for development
+      updated support for Apple iOS to SDK 4.2
+      query decoder to determine which reference frames were updated
+      implemented error tracking in the decoder
+      fix pipe support on Windows
+
+  - Speed:
+      Primary focus was on good quality mode, speed 0. Average improvement
+      on x86 about 40%, up to 100% on user-generated content at that speed.
+      Best quality mode speed improved 35%, and realtime speed 10-20%. This
+      release also saw significant improvement in realtime encoding speed
+      on ARM platforms.
+
+        Improved encoder threading
+        Don't pick encoder filter level when loopfilter is disabled.
+        Avoid double copying of key frames into alt and golden buffer
+        FDCT optimizations.
+        x86 sse2 temporal filter
+        SSSE3 version of fast quantizer
+        vp8_rd_pick_best_mbsegmentation code restructure
+        Adjusted breakout RD for SPLITMV
+        Changed segmentation check order
+        Improved rd_pick_intra4x4block
+        Adds armv6 optimized variance calculation
+        ARMv6 optimized sad16x16
+        ARMv6 optimized half pixel variance calculations
+        Full search SAD function optimization in SSE4.1
+        Improve MV prediction accuracy to achieve performance gain
+        Improve MV prediction in vp8_pick_inter_mode() for speed>3
+
+  - Quality:
+      Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release
+      also includes support for "activity masking," which greatly improves
+      SSIM at the expense of PSNR. For now, this feature is available with
+      the --tune=ssim option. Further experimentation in this area
+      is ongoing. This release also introduces a new rate control mode
+      called "CQ," which changes the allocation of bits within a clip to
+      the sections where they will have the most visual impact.
+
+        Tuning for the more exact quantizer.
+        Relax rate control for last few frames
+        CQ Mode
+        Limit key frame quantizer for forced key frames.
+        KF/GF Pulsing
+        Add simple version of activity masking.
+        make rdmult adaptive for intra in quantizer RDO
+        cap the best quantizer for 2nd order DC
+        change the threshold of DC check for encode breakout
+
+  - Bug Fixes:
+      Fix crash on Sparc Solaris.
+      Fix counter of fixed keyframe distance
+      ARNR filter pointer update bug fix
+      Fixed use of motion percentage in KF/GF group calc
+      Changed condition for using RD in Intra Mode
+      Fix encoder real-time only configuration.
+      Fix ARM encoder crash with multiple token partitions
+      Fixed bug where the first cluster timecode of a WebM file is wrong.
+      Fixed various encoder bugs with odd-sized images
+      vp8e_get_preview fixed when spatial resampling enabled
+      quantizer: fix assertion in fast quantizer path
+      Allocate source buffers to be multiples of 16
+      Fix for manual Golden frame frequency
+      Fix drastic undershoot in long form content
+
+
 2010-10-28 v0.9.5 "Aylesbury"
   Our first named release, focused on a faster decoder, and a better encoder.
 
diff --git a/build/make/Makefile b/build/make/Makefile
index 40fa6d50c..62d139ea4 100755
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -153,7 +153,7 @@ endif
 #
 obj_int_extract: build/make/obj_int_extract.c
 	$(if $(quiet),echo "    [HOSTCC] $@")
-	$(qexec)$(HOSTCC) -I. -o $@ $<
+	$(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
 CLEAN-OBJS += obj_int_extract
 
 #
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index becd95827..c2ef44a9b 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -33,6 +33,7 @@ Options:
     --proj-guid=GUID            GUID to use for the project
     --module-def=filename       File containing export definitions (for DLLs)
     --ver=version               Version (7,8,9) of visual studio to generate for
+    --src-path-bare=dir         Path to root of source tree
     -Ipath/to/include           Additional include directories
     -DFLAG[=value]              Preprocessor macros to define
     -Lpath/to/lib               Additional library search paths
@@ -191,6 +192,8 @@ for opt in "$@"; do
         ;;
         --lib) proj_kind="lib"
         ;;
+        --src-path-bare=*) src_path_bare="$optval"
+        ;;
         --static-crt) use_static_runtime=true
         ;;
         --ver=*)
@@ -335,6 +338,35 @@ generate_vcproj() {
         case "$target" in
             x86*)
                 case "$name" in
+                    obj_int_extract)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="0" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
+                            RuntimeLibrary="$debug_runtime" \
+                            WarningLevel="3" \
+                            Detect64BitPortabilityProblems="true" \
+                            DebugInformationFormat="1" \
+                    ;;
+                    vpx)
+                        tag Tool \
+                            Name="VCPreBuildEventTool" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare" \
+
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="0" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$debug_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="1" \
+                            Detect64BitPortabilityProblems="true" \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="1"
+                    ;;
                     *)
                         tag Tool \
                             Name="VCCLCompilerTool" \
@@ -358,6 +390,12 @@ generate_vcproj() {
                 case "$target" in
                     x86*)
                         case "$name" in
+                            obj_int_extract)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    OutputFile="${name}.exe" \
+                                    GenerateDebugInformation="true" \
+                            ;;
                             *)
                                 tag Tool \
                                     Name="VCLinkerTool" \
@@ -406,6 +444,34 @@ generate_vcproj() {
         case "$target" in
             x86*)
                 case "$name" in
+                    obj_int_extract)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
+                            RuntimeLibrary="$release_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            Detect64BitPortabilityProblems="true" \
+                            DebugInformationFormat="0" \
+                    ;;
+                    vpx)
+                        tag Tool \
+                            Name="VCPreBuildEventTool" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare" \
+
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$release_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="0" \
+                            Detect64BitPortabilityProblems="true" \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
+                    ;;
                     *)
                         tag Tool \
                             Name="VCCLCompilerTool" \
@@ -428,6 +494,12 @@ generate_vcproj() {
                 case "$target" in
                     x86*)
                         case "$name" in
+                            obj_int_extract)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    OutputFile="${name}.exe" \
+                                    GenerateDebugInformation="true" \
+                            ;;
                             *)
                                 tag Tool \
                                     Name="VCLinkerTool" \
diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c
index 3c54b248f..01b3129d7 100644
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -14,7 +14,7 @@
 
 #include "vpx_config.h"
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__)
 #include <io.h>
 #include <share.h>
 #include "vpx/vpx_integer.h"
@@ -59,20 +59,47 @@ int parse_macho(uint8_t *base_buf, size_t sz)
     struct mach_header header;
     uint8_t *buf = base_buf;
     int base_data_section = 0;
-
+    int bits = 0;
+
+    /* We can read in mach_header for 32 and 64 bit architectures
+     * because it's identical to mach_header_64 except for the last
+     * element (uint32_t reserved), which we don't use. Then, when
+     * we know which architecture we're looking at, increment buf
+     * appropriately.
+     */
     memcpy(&header, buf, sizeof(struct mach_header));
-    buf += sizeof(struct mach_header);
 
-    if (header.magic != MH_MAGIC)
+    if (header.magic == MH_MAGIC)
     {
-        log_msg("Bad magic number for object file. 0x%x expected, 0x%x found.\n",
-                header.magic, MH_MAGIC);
-        goto bail;
+        if (header.cputype == CPU_TYPE_ARM
+            || header.cputype == CPU_TYPE_X86)
+        {
+            bits = 32;
+            buf += sizeof(struct mach_header);
+        }
+        else
+        {
+            log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n");
+            goto bail;
+        }
     }
-
-    if (header.cputype != CPU_TYPE_ARM)
+    else if (header.magic == MH_MAGIC_64)
+    {
+        if (header.cputype == CPU_TYPE_X86_64)
+        {
+            bits = 64;
+            buf += sizeof(struct mach_header_64);
+        }
+        else
+        {
+            log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n");
+            goto bail;
+        }
+    }
+    else
     {
-        log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_ARM.\n");
+        log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n",
+                MH_MAGIC, MH_MAGIC_64, header.magic);
         goto bail;
     }
 
@@ -85,8 +112,6 @@ int parse_macho(uint8_t *base_buf, size_t sz)
     for (i = 0; i < header.ncmds; i++)
     {
         struct load_command lc;
-        struct symtab_command sc;
-        struct segment_command seg_c;
 
         memcpy(&lc, buf, sizeof(struct load_command));
 
@@ -94,50 +119,99 @@ int parse_macho(uint8_t *base_buf, size_t sz)
         {
             uint8_t *seg_buf = buf;
             struct section s;
+            struct segment_command seg_c;
 
-            memcpy(&seg_c, buf, sizeof(struct segment_command));
-
+            memcpy(&seg_c, seg_buf, sizeof(struct segment_command));
             seg_buf += sizeof(struct segment_command);
 
-            for (j = 0; j < seg_c.nsects; j++)
+            /* Although each section is given its own offset, nlist.n_value
+             * references the offset of the first section. This isn't
+             * apparent without debug information because the offset of the
+             * data section is the same as the first section. However, with
+             * debug sections mixed in, the offset of the debug section
+             * increases but n_value still references the first section.
+             */
+            if (seg_c.nsects < 1)
             {
-                memcpy(&s, seg_buf + (j * sizeof(struct section)), sizeof(struct section));
+                log_msg("Not enough sections\n");
+                goto bail;
+            }
 
-                // Need to get this offset which is the start of the symbol table
-                // before matching the strings up with symbols.
-                base_data_section = s.offset;
+            memcpy(&s, seg_buf, sizeof(struct section));
+            base_data_section = s.offset;
+        }
+        else if (lc.cmd == LC_SEGMENT_64)
+        {
+            uint8_t *seg_buf = buf;
+            struct section_64 s;
+            struct segment_command_64 seg_c;
+
+            memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64));
+            seg_buf += sizeof(struct segment_command_64);
+
+            /* Explanation in LC_SEGMENT */
+            if (seg_c.nsects < 1)
+            {
+                log_msg("Not enough sections\n");
+                goto bail;
             }
+
+            memcpy(&s, seg_buf, sizeof(struct section_64));
+            base_data_section = s.offset;
         }
         else if (lc.cmd == LC_SYMTAB)
         {
-            uint8_t *sym_buf = base_buf;
-            uint8_t *str_buf = base_buf;
-
             if (base_data_section != 0)
             {
+                struct symtab_command sc;
+                uint8_t *sym_buf = base_buf;
+                uint8_t *str_buf = base_buf;
+
                 memcpy(&sc, buf, sizeof(struct symtab_command));
 
                 if (sc.cmdsize != sizeof(struct symtab_command))
+                {
                     log_msg("Can't find symbol table!\n");
+                    goto bail;
+                }
 
                 sym_buf += sc.symoff;
                 str_buf += sc.stroff;
 
                 for (j = 0; j < sc.nsyms; j++)
                 {
-                    struct nlist nl;
-                    int val;
+                    /* Location of string is calculated each time from the
+                     * start of the string buffer.  On darwin the symbols
+                     * are prefixed by "_", so we bump the pointer by 1.
+                     * The target value is defined as an int in asm_*_offsets.c,
+                     * which is 4 bytes on all targets we currently use.
+                     */
+                    if (bits == 32)
+                    {
+                        struct nlist nl;
+                        int val;
 
-                    memcpy(&nl, sym_buf + (j * sizeof(struct nlist)), sizeof(struct nlist));
+                        memcpy(&nl, sym_buf, sizeof(struct nlist));
+                        sym_buf += sizeof(struct nlist);
 
-                    val = *((int *)(base_buf + base_data_section + nl.n_value));
+                        memcpy(&val, base_buf + base_data_section + nl.n_value,
+                               sizeof(val));
+                        printf("%-40s EQU %5d\n",
+                               str_buf + nl.n_un.n_strx + 1, val);
+                    }
+                    else /* if (bits == 64) */
+                    {
+                        struct nlist_64 nl;
+                        int val;
+
+                        memcpy(&nl, sym_buf, sizeof(struct nlist_64));
+                        sym_buf += sizeof(struct nlist_64);
 
-                    // Location of string is cacluated each time from the
-                    // start of the string buffer.  On darwin the symbols
-                    // are prefixed by "_".  On other platforms it is not
-                    // so it needs to be removed.  That is the reason for
-                    // the +1.
-                    printf("%-40s EQU %5d\n", str_buf + nl.n_un.n_strx + 1, val);
+                        memcpy(&val, base_buf + base_data_section + nl.n_value,
+                               sizeof(val));
+                        printf("%-40s EQU %5d\n",
+                               str_buf + nl.n_un.n_strx + 1, val);
+                    }
                 }
             }
         }
@@ -218,7 +292,7 @@ bail:
     return EXIT_FAILURE;
 }
 
-#else
+#elif defined(__ELF__)
 #include "elf.h"
 
 #define COPY_STRUCT(dst, buf, ofst, sz) do {\
@@ -237,212 +311,420 @@ bail:
 
 typedef struct
 {
-    uint8_t     *buf; /* Buffer containing ELF data */
-    size_t       sz;  /* Buffer size */
-    int          le_data;   /* Data is little-endian */
-    Elf32_Ehdr   hdr;
+    uint8_t      *buf; /* Buffer containing ELF data */
+    size_t        sz;  /* Buffer size */
+    int           le_data; /* Data is little-endian */
+    unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
+    int           bits; /* 32 or 64 */
+    Elf32_Ehdr    hdr32;
+    Elf64_Ehdr    hdr64;
 } elf_obj_t;
 
-int parse_elf32_header(elf_obj_t *elf)
+int parse_elf_header(elf_obj_t *elf)
 {
     int res;
-    /* Verify ELF32 header */
-    COPY_STRUCT(&elf->hdr, elf->buf, 0, elf->sz);
-    res = elf->hdr.e_ident[EI_MAG0] == ELFMAG0;
-    res &= elf->hdr.e_ident[EI_MAG1] == ELFMAG1;
-    res &= elf->hdr.e_ident[EI_MAG2] == ELFMAG2;
-    res &= elf->hdr.e_ident[EI_MAG3] == ELFMAG3;
-    res &= elf->hdr.e_ident[EI_CLASS] == ELFCLASS32;
-    res &= elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB
-           || elf->hdr.e_ident[EI_DATA] == ELFDATA2MSB;
+    /* Verify ELF Magic numbers */
+    COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz);
+    res = elf->e_ident[EI_MAG0] == ELFMAG0;
+    res &= elf->e_ident[EI_MAG1] == ELFMAG1;
+    res &= elf->e_ident[EI_MAG2] == ELFMAG2;
+    res &= elf->e_ident[EI_MAG3] == ELFMAG3;
+    res &= elf->e_ident[EI_CLASS] == ELFCLASS32
+        || elf->e_ident[EI_CLASS] == ELFCLASS64;
+    res &= elf->e_ident[EI_DATA] == ELFDATA2LSB;
 
     if (!res) goto bail;
 
-    elf->le_data = elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB;
-
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_type);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_machine);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_version);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_entry);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_flags);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_ehsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shstrndx);
+    elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB;
+
+    /* Read in relevant values */
+    if (elf->e_ident[EI_CLASS] == ELFCLASS32)
+    {
+        elf->bits = 32;
+        COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz);
+
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx);
+    }
+    else /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */
+    {
+        elf->bits = 64;
+        COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz);
+
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx);
+    }
+
     return 0;
 bail:
+    log_msg("Failed to parse ELF file header");
     return 1;
 }
 
-int parse_elf32_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr)
+int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64)
 {
-    if (idx >= elf->hdr.e_shnum)
-        goto bail;
+    if (hdr32)
+    {
+        if (idx >= elf->hdr32.e_shnum)
+            goto bail;
+
+        COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize,
+                    elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize);
+    }
+    else /* if (hdr64) */
+    {
+        if (idx >= elf->hdr64.e_shnum)
+            goto bail;
+
+        COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize,
+                    elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize);
+    }
 
-    COPY_STRUCT(hdr, elf->buf, elf->hdr.e_shoff + idx * elf->hdr.e_shentsize,
-                elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_name);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_type);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_flags);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addr);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_offset);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_size);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_link);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_info);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addralign);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_entsize);
     return 0;
 bail:
     return 1;
 }
 
-char *parse_elf32_string_table(elf_obj_t *elf, int s_idx, int idx)
+char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx)
 {
-    Elf32_Shdr shdr;
-
-    if (parse_elf32_section(elf, s_idx, &shdr))
+    if (elf->bits == 32)
     {
-        log_msg("Failed to parse ELF string table: section %d, index %d\n",
-                s_idx, idx);
-        return "";
+        Elf32_Shdr shdr;
+
+        if (parse_elf_section(elf, s_idx, &shdr, NULL))
+        {
+            log_msg("Failed to parse ELF string table: section %d, index %d\n",
+                    s_idx, idx);
+            return "";
+        }
+
+        return (char *)(elf->buf + shdr.sh_offset + idx);
     }
+    else /* if (elf->bits == 64) */
+    {
+        Elf64_Shdr shdr;
 
-    return (char *)(elf->buf + shdr.sh_offset + idx);
+        if (parse_elf_section(elf, s_idx, NULL, &shdr))
+        {
+            log_msg("Failed to parse ELF string table: section %d, index %d\n",
+                    s_idx, idx);
+            return "";
+        }
+
+        return (char *)(elf->buf + shdr.sh_offset + idx);
+    }
 }
 
-int parse_elf32_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym)
+int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64)
 {
-    COPY_STRUCT(sym, elf->buf, ofst, elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_name);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_value);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_size);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_info);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_other);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_shndx);
+    if (sym32)
+    {
+        COPY_STRUCT(sym32, elf->buf, ofst, elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_name);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_value);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_size);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_info);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_other);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx);
+    }
+    else /* if (sym64) */
+    {
+        COPY_STRUCT(sym64, elf->buf, ofst, elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_name);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_value);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_size);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_info);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_other);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx);
+    }
     return 0;
 bail:
     return 1;
 }
 
-int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
+int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode)
 {
-    elf_obj_t  elf;
-    Elf32_Shdr shdr;
+    elf_obj_t    elf;
     unsigned int ofst;
-    int         i;
-    Elf32_Off strtab_off;   /* save String Table offset for later use */
+    int          i;
+    Elf32_Off    strtab_off32;
+    Elf64_Off    strtab_off64; /* save String Table offset for later use */
 
     memset(&elf, 0, sizeof(elf));
     elf.buf = buf;
     elf.sz = sz;
 
     /* Parse Header */
-    if (parse_elf32_header(&elf))
-    {
-        log_msg("Parse error: File does not appear to be valid ELF32\n");
-        return 1;
-    }
+    if (parse_elf_header(&elf))
+      goto bail;
 
-    for (i = 0; i < elf.hdr.e_shnum; i++)
+    if (elf.bits == 32)
     {
-        parse_elf32_section(&elf, i, &shdr);
-
-        if (shdr.sh_type == SHT_STRTAB)
+        Elf32_Shdr shdr;
+        for (i = 0; i < elf.hdr32.e_shnum; i++)
         {
-            char strtsb_name[128];
-
-            strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+            parse_elf_section(&elf, i, &shdr, NULL);
 
-            if (!(strcmp(strtsb_name, ".shstrtab")))
+            if (shdr.sh_type == SHT_STRTAB)
             {
-                log_msg("found section: %s\n", strtsb_name);
-                strtab_off = shdr.sh_offset;
-                break;
+                char strtsb_name[128];
+
+                strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+
+                if (!(strcmp(strtsb_name, ".shstrtab")))
+                {
+                    /* log_msg("found section: %s\n", strtsb_name); */
+                    strtab_off32 = shdr.sh_offset;
+                    break;
+                }
             }
         }
     }
-
-    /* Parse all Symbol Tables */
-    for (i = 0; i < elf.hdr.e_shnum; i++)
+    else /* if (elf.bits == 64) */
     {
-
-        parse_elf32_section(&elf, i, &shdr);
-
-        if (shdr.sh_type == SHT_SYMTAB)
+        Elf64_Shdr shdr;
+        for (i = 0; i < elf.hdr64.e_shnum; i++)
         {
-            for (ofst = shdr.sh_offset;
-                 ofst < shdr.sh_offset + shdr.sh_size;
-                 ofst += shdr.sh_entsize)
-            {
-                Elf32_Sym sym;
+            parse_elf_section(&elf, i, NULL, &shdr);
 
-                parse_elf32_symbol(&elf, ofst, &sym);
+            if (shdr.sh_type == SHT_STRTAB)
+            {
+                char strtsb_name[128];
 
-                /* For all OBJECTS (data objects), extract the value from the
-                 * proper data segment.
-                 */
-                if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
-                    log_msg("found data object %s\n",
-                            parse_elf32_string_table(&elf,
-                                                     shdr.sh_link,
-                                                     sym.st_name));
+                strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
 
-                if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
-                    && sym.st_size == 4)
+                if (!(strcmp(strtsb_name, ".shstrtab")))
                 {
-                    Elf32_Shdr dhdr;
-                    int32_t      val;
-                    char section_name[128];
-
-                    parse_elf32_section(&elf, sym.st_shndx, &dhdr);
+                    /* log_msg("found section: %s\n", strtsb_name); */
+                    strtab_off64 = shdr.sh_offset;
+                    break;
+                }
+            }
+        }
+    }
 
-                    /* For explanition - refer to _MSC_VER version of code */
-                    strcpy(section_name, (char *)(elf.buf + strtab_off + dhdr.sh_name));
-                    log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type);
+    /* Parse all Symbol Tables */
+    if (elf.bits == 32)
+    {
+        Elf32_Shdr shdr;
+        for (i = 0; i < elf.hdr32.e_shnum; i++)
+        {
+            parse_elf_section(&elf, i, &shdr, NULL);
 
-                    if (!(strcmp(section_name, ".bss")))
-                    {
-                        val = 0;
-                    }
-                    else
+            if (shdr.sh_type == SHT_SYMTAB)
+            {
+                for (ofst = shdr.sh_offset;
+                     ofst < shdr.sh_offset + shdr.sh_size;
+                     ofst += shdr.sh_entsize)
+                {
+                    Elf32_Sym sym;
+
+                    parse_elf_symbol(&elf, ofst, &sym, NULL);
+
+                    /* For all OBJECTS (data objects), extract the value from the
+                     * proper data segment.
+                     */
+                    /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+                        log_msg("found data object %s\n",
+                                parse_elf_string_table(&elf,
+                                                       shdr.sh_link,
+                                                       sym.st_name));
+                     */
+
+                    if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
+                        && sym.st_size == 4)
                     {
-                        memcpy(&val,
-                               elf.buf + dhdr.sh_offset + sym.st_value,
-                               sizeof(val));
+                        Elf32_Shdr dhdr;
+                        int val = 0;
+                        char section_name[128];
+
+                        parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL);
+
+                        /* For explanation - refer to _MSC_VER version of code */
+                        strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name));
+                        /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
+
+                        if (strcmp(section_name, ".bss"))
+                        {
+                            if (sizeof(val) != sym.st_size)
+                            {
+                                /* The target value is declared as an int in
+                                 * asm_*_offsets.c, which is 4 bytes on all
+                                 * targets we currently use. Complain loudly if
+                                 * this is not true.
+                                 */
+                                log_msg("Symbol size is wrong\n");
+                                goto bail;
+                            }
+
+                            memcpy(&val,
+                                   elf.buf + dhdr.sh_offset + sym.st_value,
+                                   sym.st_size);
+                        }
+
+                        if (!elf.le_data)
+                        {
+                            log_msg("Big Endian data not supported yet!\n");
+                            goto bail;
+                        }
+
+                        switch (mode)
+                        {
+                            case OUTPUT_FMT_RVDS:
+                                printf("%-40s EQU %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            case OUTPUT_FMT_GAS:
+                                printf(".equ %-40s, %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            default:
+                                printf("%s = %d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                        }
                     }
+                }
+            }
+        }
+    }
+    else /* if (elf.bits == 64) */
+    {
+        Elf64_Shdr shdr;
+        for (i = 0; i < elf.hdr64.e_shnum; i++)
+        {
+            parse_elf_section(&elf, i, NULL, &shdr);
 
-                    if (!elf.le_data)
-                    {
-                        log_msg("Big Endian data not supported yet!\n");
-                        goto bail;
-                    }\
-
-                    switch (mode)
+            if (shdr.sh_type == SHT_SYMTAB)
+            {
+                for (ofst = shdr.sh_offset;
+                     ofst < shdr.sh_offset + shdr.sh_size;
+                     ofst += shdr.sh_entsize)
+                {
+                    Elf64_Sym sym;
+
+                    parse_elf_symbol(&elf, ofst, NULL, &sym);
+
+                    /* For all OBJECTS (data objects), extract the value from the
+                     * proper data segment.
+                     */
+                    /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+                        log_msg("found data object %s\n",
+                                parse_elf_string_table(&elf,
+                                                       shdr.sh_link,
+                                                       sym.st_name));
+                     */
+
+                    if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT
+                        && sym.st_size == 4)
                     {
-                    case OUTPUT_FMT_RVDS:
-                        printf("%-40s EQU %5d\n",
-                               parse_elf32_string_table(&elf,
-                                                        shdr.sh_link,
-                                                        sym.st_name),
-                               val);
-                        break;
-                    case OUTPUT_FMT_GAS:
-                        printf(".equ %-40s, %5d\n",
-                               parse_elf32_string_table(&elf,
-                                                        shdr.sh_link,
-                                                        sym.st_name),
-                               val);
-                        break;
-                    default:
-                        printf("%s = %d\n",
-                               parse_elf32_string_table(&elf,
-                                                        shdr.sh_link,
-                                                        sym.st_name),
-                               val);
+                        Elf64_Shdr dhdr;
+                        int val = 0;
+                        char section_name[128];
+
+                        parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr);
+
+                        /* For explanation - refer to _MSC_VER version of code */
+                        strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name));
+                        /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
+
+                        if ((strcmp(section_name, ".bss")))
+                        {
+                            if (sizeof(val) != sym.st_size)
+                            {
+                                /* The target value is declared as an int in
+                                 * asm_*_offsets.c, which is 4 bytes on all
+                                 * targets we currently use. Complain loudly if
+                                 * this is not true.
+                                 */
+                                log_msg("Symbol size is wrong\n");
+                                goto bail;
+                            }
+
+                            memcpy(&val,
+                                   elf.buf + dhdr.sh_offset + sym.st_value,
+                                   sym.st_size);
+                        }
+
+                        if (!elf.le_data)
+                        {
+                            log_msg("Big Endian data not supported yet!\n");
+                            goto bail;
+                        }
+
+                        switch (mode)
+                        {
+                            case OUTPUT_FMT_RVDS:
+                                printf("%-40s EQU %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            case OUTPUT_FMT_GAS:
+                                printf(".equ %-40s, %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            default:
+                                printf("%s = %d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                        }
                     }
                 }
             }
@@ -454,7 +736,7 @@ int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
 
     return 0;
 bail:
-    log_msg("Parse error: File does not appear to be valid ELF32\n");
+    log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n");
     return 1;
 }
 
@@ -521,8 +803,7 @@ int main(int argc, char **argv)
         goto bail;
     }
 
-    res = parse_elf32(file_buf, stat_buf.st_size, mode);
-    //res = parse_coff(file_buf, stat_buf.st_size);
+    res = parse_elf(file_buf, stat_buf.st_size, mode);
     free(file_buf);
 
     if (!res)
@@ -535,7 +816,7 @@ bail:
 #endif
 
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__)
 /*  See "Microsoft Portable Executable and Common Object File Format Specification"
     for reference.
 */
@@ -549,7 +830,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
     unsigned int i;
     unsigned __int8 *ptr;
     unsigned __int32 symoffset;
-    FILE *fp;
 
     char **sectionlist;  //this array holds all section names in their correct order.
     //it is used to check if the symbol is in .bss or .data section.
@@ -560,9 +840,18 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
     strtab_ptr = symtab_ptr + symtab_sz * 18;
 
     if (nsections > 96)
-        goto bail;
+    {
+        log_msg("Too many sections\n");
+        return 1;
+    }
+
+    sectionlist = malloc(nsections * sizeof(*sectionlist)); /* one char * slot per section name */
 
-    sectionlist = malloc(nsections * sizeof * sectionlist);
+    if (sectionlist == NULL)
+    {
+        log_msg("Allocating first level of section list failed\n");
+        return 1;
+    }
 
     //log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections);
 
@@ -580,6 +869,12 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
         //log_msg("COFF: Parsing section %s\n",sectionname);
 
         sectionlist[i] = malloc(strlen(sectionname) + 1);
+
+        if (sectionlist[i] == NULL)
+        {
+            log_msg("Allocating storage for %s failed\n", sectionname);
+            goto bail;
+        }
         strcpy(sectionlist[i], sectionname);
 
         if (!strcmp(sectionname, ".data")) sectionrawdata_ptr = get_le32(ptr + 20);
@@ -590,14 +885,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
     //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
     //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);
 
-    fp = fopen("assembly_offsets.asm", "w");
-
-    if (fp == NULL)
-    {
-        perror("open file");
-        goto bail;
-    }
-
     /*  The compiler puts the data with non-zero offset in .data section, but puts the data with
         zero offset in .bss section. So, if the data in in .bss section, set offset=0.
         Note from Wiki: In an object module compiled from C, the bss section contains
@@ -631,13 +918,15 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
                 char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
                 strncpy(name, ptr, 8);
                 //log_msg("COFF: Parsing symbol %s\n",name);
-                fprintf(fp, "%-40s EQU ", name);
+                /* +1 to avoid printing leading underscore */
+                printf("%-40s EQU ", name + 1);
             }
             else
             {
                 //log_msg("COFF: Parsing symbol %s\n",
                 //        buf + strtab_ptr + get_le32(ptr+4));
-                fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
+                /* +1 to avoid printing leading underscore */
+                printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4) + 1);
             }
 
             if (!(strcmp(sectionlist[section-1], ".bss")))
@@ -654,14 +943,13 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
             //log_msg("      Address: %u\n",get_le32(ptr+8));
             //log_msg("      Offset: %u\n", symoffset);
 
-            fprintf(fp, "%5d\n", symoffset);
+            printf("%5d\n", symoffset);
         }
 
         ptr += 18;
     }
 
-    fprintf(fp, "    END\n");
-    fclose(fp);
+    printf("    END\n");
 
     for (i = 0; i < nsections; i++)
     {
@@ -711,11 +999,7 @@ int main(int argc, char **argv)
     else
         f = argv[1];
 
-    if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE))
-    {
-        perror("Unable to open file");
-        goto bail;
-    }
+    fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE);
 
     if (_fstat(fd, &stat_buf))
     {
diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat
new file mode 100644
index 000000000..1bb865331
--- /dev/null
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -0,0 +1,15 @@
+REM   Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+REM
+REM   Use of this source code is governed by a BSD-style license
+REM   that can be found in the LICENSE file in the root of the source
+REM   tree. An additional intellectual property rights grant can be found
+REM   in the file PATENTS.  All contributing project authors may
+REM   be found in the AUTHORS file in the root of the source tree.
+echo on
+
+cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c"
+obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm"
+obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm"
+obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm"
diff --git a/libs.mk b/libs.mk
index 37ce0b13b..350b31077 100644
--- a/libs.mk
+++ b/libs.mk
@@ -9,7 +9,13 @@
 ##
 
 
-ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
+# ARM assembly files are written in RVCT-style. We use some make magic to
+# filter those files to allow GCC compilation
+ifeq ($(ARCH_ARM),yes)
+  ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
+else
+  ASM:=.asm
+endif
 
 CODEC_SRCS-yes += libs.mk
 
@@ -126,6 +132,23 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 
+obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c
+	@cp $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat .
+	@echo "    [CREATE] $@"
+	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+    --exe \
+    --target=$(TOOLCHAIN) \
+    --name=obj_int_extract \
+    --ver=$(CONFIG_VS_VERSION) \
+    --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
+    $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+    --out=$@ $^ \
+    -I. \
+    -I"$(SRC_PATH_BARE)" \
+
+PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.vcproj
+PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat
+
 vpx.def: $(call enabled,CODEC_EXPORTS)
 	@echo "    [CREATE] $@"
 	$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
@@ -135,15 +158,16 @@ CLEAN-OBJS += vpx.def
 
 vpx.vcproj: $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\
-			--lib\
-			--target=$(TOOLCHAIN)\
+	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+			--lib \
+			--target=$(TOOLCHAIN) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=vpx\
-            --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74\
-            --module-def=vpx.def\
-            --ver=$(CONFIG_VS_VERSION)\
-            --out=$@ $(CFLAGS) $^\
+            --name=vpx \
+            --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
+            --module-def=vpx.def \
+            --ver=$(CONFIG_VS_VERSION) \
+            --out=$@ $(CFLAGS) $^ \
+            --src-path-bare="$(SRC_PATH_BARE)" \
 
 PROJECTS-$(BUILD_LIBVPX) += vpx.vcproj
 
@@ -207,36 +231,38 @@ endif
 #
 # Add assembler dependencies for configuration and offsets
 #
-$(filter %.s.o,$(OBJS-yes)):   $(BUILD_PFX)vpx_config.asm
-$(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+$(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
+$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
 
 #
 # Calculate platform- and compiler-specific offsets for hand coded assembly
 #
-ifeq ($(ARCH_ARM), yes)
-  asm_com_offsets.asm: obj_int_extract
-  asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
+ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
+  ifeq ($(ARCH_ARM), yes)
+    asm_com_offsets.asm: obj_int_extract
+    asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
 	./obj_int_extract rvds $< $(ADS2GAS) > $@
-  OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
-  CLEAN-OBJS += asm_com_offsets.asm
-  $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
+    OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
+    CLEAN-OBJS += asm_com_offsets.asm
+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
 
-  ifeq ($(CONFIG_VP8_ENCODER), yes)
-    asm_enc_offsets.asm: obj_int_extract
-    asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+    ifeq ($(CONFIG_VP8_ENCODER), yes)
+      asm_enc_offsets.asm: obj_int_extract
+      asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
 	./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
-    CLEAN-OBJS += asm_enc_offsets.asm
-    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
-  endif
-
-  ifeq ($(CONFIG_VP8_DECODER), yes)
-    asm_dec_offsets.asm: obj_int_extract
-    asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+      OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+      CLEAN-OBJS += asm_enc_offsets.asm
+      $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
+    endif
+
+    ifeq ($(CONFIG_VP8_DECODER), yes)
+      asm_dec_offsets.asm: obj_int_extract
+      asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
 	./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
-    CLEAN-OBJS += asm_dec_offsets.asm
-    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
+      OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+      CLEAN-OBJS += asm_dec_offsets.asm
+      $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
+    endif
   endif
 endif
 
diff --git a/solution.mk b/solution.mk
index bef00883f..782150fd9 100644
--- a/solution.mk
+++ b/solution.mk
@@ -13,8 +13,9 @@ vpx.sln: $(wildcard *.vcproj)
 	@echo "    [CREATE] $@"
 	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
             $(if $(filter %vpx.vcproj,$^),\
-                $(foreach vcp,$(filter-out %vpx.vcproj,$^),\
+                $(foreach vcp,$(filter-out %vpx.vcproj %obj_int_extract.vcproj,$^),\
                   --dep=$(vcp:.vcproj=):vpx)) \
+            --dep=vpx:obj_int_extract \
             --ver=$(CONFIG_VS_VERSION)\
             --out=$@ $^
 vpx.sln.mk: vpx.sln
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 12d5f66d3..5c607a0cb 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -24,6 +24,35 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
 extern void vp8_build_block_offsets(MACROBLOCK *x);
 extern void vp8_setup_block_ptrs(MACROBLOCK *x);
 
+#if CONFIG_MULTITHREAD
+
+extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
+/* Loop filter worker: per start_lpf post, run loopfilter_frame() and post end_lpf. */
+static THREAD_FUNCTION loopfilter_thread(void *p_data)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
+    VP8_COMMON *cm = &cpi->common;
+    /* Keep serving requests until b_multi_threaded is found cleared. */
+    while (1)
+    {
+        if (cpi->b_multi_threaded == 0)
+            break;
+        /* Wait for a start request; if sem_wait() fails, loop and re-check the flag. */
+        if (sem_wait(&cpi->h_event_start_lpf) == 0)
+        {
+            if (cpi->b_multi_threaded == FALSE) // we're shutting down
+                break;
+
+            loopfilter_frame(cpi, cm);
+
+            sem_post(&cpi->h_event_end_lpf);
+        }
+    }
+
+    return 0;
+}
+#endif
+
 static
 THREAD_FUNCTION thread_encoding_proc(void *p_data)
 {
@@ -479,6 +508,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
             pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
         }
 
+        {
+            LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;
+
+            sem_init(&cpi->h_event_start_lpf, 0, 0);
+            sem_init(&cpi->h_event_end_lpf, 0, 0);
+
+            lpfthd->ptr1 = (void *)cpi;
+            pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd);
+        }
     }
 
 }
@@ -500,9 +538,14 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
 
                 sem_destroy(&cpi->h_event_start_encoding[i]);
             }
+
+            sem_post(&cpi->h_event_start_lpf);
+            pthread_join(cpi->h_filter_thread, 0);
         }
 
         sem_destroy(&cpi->h_event_end_encoding);
+        sem_destroy(&cpi->h_event_end_lpf);
+        sem_destroy(&cpi->h_event_start_lpf);
 
         //free thread related resources
         vpx_free(cpi->h_event_start_encoding);
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index e0b2a8a45..774d9b6b5 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -281,21 +281,6 @@ int frame_max_bits(VP8_COMP *cpi)
 }
 
 
-extern size_t vp8_firstpass_stats_sz(unsigned int mb_count)
-{
-    /* Calculate the size of a stats packet, which is dependent on the frame
-     * resolution. The FIRSTPASS_STATS struct has a single element array,
-     * motion_map, which is virtually expanded to have one element per
-     * macroblock.
-     */
-    size_t stats_sz;
-
-    stats_sz = sizeof(FIRSTPASS_STATS) + mb_count;
-    stats_sz = (stats_sz + 7) & ~7;
-    return stats_sz;
-}
-
-
 void vp8_output_stats(const VP8_COMP            *cpi,
                       struct vpx_codec_pkt_list *pktlist,
                       FIRSTPASS_STATS            *stats)
@@ -303,16 +288,19 @@ void vp8_output_stats(const VP8_COMP            *cpi,
     struct vpx_codec_cx_pkt pkt;
     pkt.kind = VPX_CODEC_STATS_PKT;
     pkt.data.twopass_stats.buf = stats;
-    pkt.data.twopass_stats.sz = vp8_firstpass_stats_sz(cpi->common.MBs);
+    pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
     vpx_codec_pkt_list_add(pktlist, &pkt);
 
 // TEMP debug code
 #if OUTPUT_FPF
+
     {
         FILE *fpfile;
         fpfile = fopen("firstpass.stt", "a");
 
-        fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n",
+        fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
+                " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
+                " %12.0f %12.4f\n",
                 stats->frame,
                 stats->intra_error,
                 stats->coded_error,
@@ -320,6 +308,7 @@ void vp8_output_stats(const VP8_COMP            *cpi,
                 stats->pcnt_inter,
                 stats->pcnt_motion,
                 stats->pcnt_second_ref,
+                stats->pcnt_neutral,
                 stats->MVr,
                 stats->mvr_abs,
                 stats->MVc,
@@ -327,12 +316,8 @@ void vp8_output_stats(const VP8_COMP            *cpi,
                 stats->MVrv,
                 stats->MVcv,
                 stats->mv_in_out_count,
-                stats->count);
-        fclose(fpfile);
-
-
-        fpfile = fopen("fpmotionmap.stt", "a");
-        if(fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, fpfile));
+                stats->count,
+                stats->duration);
         fclose(fpfile);
     }
 #endif
@@ -340,13 +325,11 @@ void vp8_output_stats(const VP8_COMP            *cpi,
 
 int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
 {
-    size_t stats_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
-
     if (cpi->stats_in >= cpi->stats_in_end)
         return EOF;
 
     *fps = *cpi->stats_in;
-    cpi->stats_in = (void*)((char *)cpi->stats_in + stats_sz);
+    cpi->stats_in = (void*)((char *)cpi->stats_in + sizeof(FIRSTPASS_STATS));
     return 1;
 }
 
@@ -359,6 +342,7 @@ void vp8_zero_stats(FIRSTPASS_STATS *section)
     section->pcnt_inter  = 0.0;
     section->pcnt_motion  = 0.0;
     section->pcnt_second_ref = 0.0;
+    section->pcnt_neutral = 0.0;
     section->MVr        = 0.0;
     section->mvr_abs     = 0.0;
     section->MVc        = 0.0;
@@ -378,6 +362,7 @@ void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
     section->pcnt_inter  += frame->pcnt_inter;
     section->pcnt_motion += frame->pcnt_motion;
     section->pcnt_second_ref += frame->pcnt_second_ref;
+    section->pcnt_neutral += frame->pcnt_neutral;
     section->MVr        += frame->MVr;
     section->mvr_abs     += frame->mvr_abs;
     section->MVc        += frame->MVc;
@@ -398,6 +383,7 @@ void vp8_avg_stats(FIRSTPASS_STATS *section)
     section->ssim_weighted_pred_err /= section->count;
     section->pcnt_inter  /= section->count;
     section->pcnt_second_ref /= section->count;
+    section->pcnt_neutral /= section->count;
     section->pcnt_motion /= section->count;
     section->MVr        /= section->count;
     section->mvr_abs     /= section->count;
@@ -409,57 +395,9 @@ void vp8_avg_stats(FIRSTPASS_STATS *section)
     section->duration   /= section->count;
 }
 
-unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi)
-{
-    return cpi->fp_motion_map_stats;
-}
-void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos)
-{
-    cpi->fp_motion_map_stats = target_pos;
-}
-
-void vp8_advance_fpmm(VP8_COMP *cpi, int count)
-{
-    cpi->fp_motion_map_stats = (void*)((char*)cpi->fp_motion_map_stats +
-        count * vp8_firstpass_stats_sz(cpi->common.MBs));
-}
-
-void vp8_input_fpmm(VP8_COMP *cpi)
-{
-    unsigned char *fpmm = cpi->fp_motion_map;
-    int MBs = cpi->common.MBs;
-    int max_frames = cpi->active_arnr_frames;
-    int i;
-
-    for (i=0; i<max_frames; i++)
-    {
-        char *motion_map = (char*)cpi->fp_motion_map_stats
-                           + sizeof(FIRSTPASS_STATS);
-
-        memcpy(fpmm, motion_map, MBs);
-        fpmm += MBs;
-        vp8_advance_fpmm(cpi, 1);
-    }
-
-    // Flag the use of weights in the temporal filter
-    cpi->use_weighted_temporal_filter = 1;
-}
-
 void vp8_init_first_pass(VP8_COMP *cpi)
 {
     vp8_zero_stats(cpi->total_stats);
-
-// TEMP debug code
-#ifdef OUTPUT_FPF
-    {
-        FILE *fpfile;
-        fpfile = fopen("firstpass.stt", "w");
-        fclose(fpfile);
-        fpfile = fopen("fpmotionmap.stt", "wb");
-        fclose(fpfile);
-    }
-#endif
-
 }
 
 void vp8_end_first_pass(VP8_COMP *cpi)
@@ -570,13 +508,12 @@ void vp8_first_pass(VP8_COMP *cpi)
     int intercount = 0;
     int second_ref_count = 0;
     int intrapenalty = 256;
+    int neutral_count = 0;
 
     int sum_in_vectors = 0;
 
     MV zero_ref_mv = {0, 0};
 
-    unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
-
     vp8_clear_system_state();  //__asm emms;
 
     x->src = * cpi->Source;
@@ -628,7 +565,6 @@ void vp8_first_pass(VP8_COMP *cpi)
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
             int this_error;
-            int zero_error;
             int zz_to_best_ratio;
             int gf_motion_error = INT_MAX;
             int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
@@ -650,9 +586,6 @@ void vp8_first_pass(VP8_COMP *cpi)
             // Cumulative intra error total
             intra_error += (long long)this_error;
 
-            // Indicate default assumption of intra in the motion map
-            *fp_motion_map_ptr = 0;
-
             // Set up limit values for motion vectors to prevent them extending outside the UMV borders
             x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
             x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
@@ -671,9 +604,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                 d->bmi.mv.as_mv.row = 0;
                 d->bmi.mv.as_mv.col = 0;
 
-                // Save (0,0) error for later use
-                zero_error = motion_error;
-
                 // Test last reference frame using the previous best mv as the
                 // starting point (best reference) for the search
                 vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
@@ -726,6 +656,17 @@ void vp8_first_pass(VP8_COMP *cpi)
 
                 if (motion_error <= this_error)
                 {
+                    // Keep a count of cases where the inter and intra were
+                    // very close and very low. This helps with scene cut
+                    // detection for example in cropped clips with black bars
+                    // at the sides or top and bottom.
+                    if( (((this_error-intrapenalty) * 9) <=
+                         (motion_error*10)) &&
+                        (this_error < (2*intrapenalty)) )
+                    {
+                        neutral_count++;
+                    }
+
                     d->bmi.mv.as_mv.row <<= 3;
                     d->bmi.mv.as_mv.col <<= 3;
                     this_error = motion_error;
@@ -777,25 +718,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                             else if (d->bmi.mv.as_mv.col < 0)
                                 sum_in_vectors--;
                         }
-
-                        // Compute how close (0,0) predictor is to best
-                        // predictor in terms of their prediction error
-                        zz_to_best_ratio = (10*zero_error + this_error/2)
-                                            / (this_error+!this_error);
-
-                        if ((zero_error < 50000) &&
-                            (zz_to_best_ratio <= 11) )
-                            *fp_motion_map_ptr = 1;
-                        else
-                            *fp_motion_map_ptr = 0;
-                    }
-                    else
-                    {
-                        // 0,0 mv was best
-                        if( zero_error<50000 )
-                            *fp_motion_map_ptr = 2;
-                        else
-                            *fp_motion_map_ptr = 1;
                     }
                 }
             }
@@ -809,9 +731,6 @@ void vp8_first_pass(VP8_COMP *cpi)
 
             recon_yoffset += 16;
             recon_uvoffset += 8;
-
-            // Update the motion map
-            fp_motion_map_ptr++;
         }
 
         // adjust to the next row of mbs
@@ -854,6 +773,7 @@ void vp8_first_pass(VP8_COMP *cpi)
 
         fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
         fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+        fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
 
         if (mvcount > 0)
         {
@@ -872,13 +792,10 @@ void vp8_first_pass(VP8_COMP *cpi)
         // than the full time between subsequent cpi->source_time_stamp s  .
         fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp;
 
-        // don't want to do outputstats with a stack variable!
+        // don't want to do output stats with a stack variable!
         memcpy(cpi->this_frame_stats,
                &fps,
                sizeof(FIRSTPASS_STATS));
-        memcpy((char*)cpi->this_frame_stats + sizeof(FIRSTPASS_STATS),
-               cpi->fp_motion_map,
-               sizeof(cpi->fp_motion_map[0]) * cpi->common.MBs);
         vp8_output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats);
         vp8_accumulate_stats(cpi->total_stats, &fps);
     }
@@ -924,10 +841,10 @@ void vp8_first_pass(VP8_COMP *cpi)
 extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
 
 #define BASE_ERRPERMB   150
-static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
 {
     int Q;
-    int num_mbs = ((Height * Width) / (16 * 16));
+    int num_mbs = cpi->common.MBs;
     int target_norm_bits_per_mb;
 
     double err_per_mb = section_err / num_mbs;
@@ -1024,10 +941,10 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
 
     return Q;
 }
-static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
 {
     int Q;
-    int num_mbs = ((Height * Width) / (16 * 16));
+    int num_mbs = cpi->common.MBs;
     int target_norm_bits_per_mb;
 
     double err_per_mb = section_err / num_mbs;
@@ -1075,10 +992,10 @@ static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_band
 }
 
 // Estimate a worst case Q for a KF group
-static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio)
+static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio)
 {
     int Q;
-    int num_mbs = ((Height * Width) / (16 * 16));
+    int num_mbs = cpi->common.MBs;
     int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs;
     int bits_per_mb_at_this_q;
 
@@ -1173,11 +1090,10 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta
 
 // For cq mode estimate a cq level that matches the observed
 // complexity and data rate.
-static int estimate_cq(VP8_COMP *cpi, double section_err,
-                       int section_target_bandwitdh, int Height, int Width)
+static int estimate_cq(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
 {
     int Q;
-    int num_mbs = ((Height * Width) / (16 * 16));
+    int num_mbs = cpi->common.MBs;
     int target_norm_bits_per_mb;
 
     double err_per_mb = section_err / num_mbs;
@@ -1331,8 +1247,6 @@ void vp8_init_second_pass(VP8_COMP *cpi)
     cpi->clip_bpe =  cpi->bits_left /
                      DOUBLE_DIVIDE_CHECK(cpi->modified_error_total);
     cpi->observed_bpe = cpi->clip_bpe;
-
-    cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in;
 }
 
 void vp8_end_second_pass(VP8_COMP *cpi)
@@ -1340,8 +1254,8 @@ void vp8_end_second_pass(VP8_COMP *cpi)
 }
 
 // This function gives and estimate of how badly we believe
-// the predicition quality is decaying from frame to frame.
-double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+// the prediction quality is decaying from frame to frame.
+double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
 {
     double prediction_decay_rate;
     double motion_decay;
@@ -1376,6 +1290,52 @@ double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
     return prediction_decay_rate;
 }
 
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+BOOL detect_transition_to_still(
+    VP8_COMP *cpi,
+    int frame_interval,
+    int still_interval,
+    double loop_decay_rate,
+    double decay_accumulator )
+{
+    BOOL trans_to_still = FALSE;
+
+    // Break clause to detect very still sections after motion
+    // For example a static image after a fade or other transition
+    // instead of a clean scene cut.
+    if ( (frame_interval > MIN_GF_INTERVAL) &&
+         (loop_decay_rate >= 0.999) &&
+         (decay_accumulator < 0.9) )
+    {
+        int j;
+        FIRSTPASS_STATS * position = cpi->stats_in;
+        FIRSTPASS_STATS tmp_next_frame;
+        double decay_rate;
+
+        // Look ahead a few frames to see if static condition
+        // persists...
+        for ( j = 0; j < still_interval; j++ )
+        {
+            if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
+                break;
+
+            decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
+            if ( decay_rate < 0.999 )
+                break;
+        }
+        // Reset file position
+        reset_fpf_position(cpi, position);
+
+        // Signal a transition to still only if the static condition persisted
+        if ( j == still_interval )
+            trans_to_still = TRUE;
+    }
+
+    return trans_to_still;
+}
+
 // Analyse and define a gf/arf group .
 static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
@@ -1406,8 +1366,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 
     int max_bits = frame_max_bits(cpi);     // Max for a single frame
 
-    unsigned char *fpmm_pos;
-
     unsigned int allow_alt_ref =
                     cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
 
@@ -1416,8 +1374,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 
     vp8_clear_system_state();  //__asm emms;
 
-    fpmm_pos = vp8_fpmm_get_pos(cpi);
-
     start_pos = cpi->stats_in;
 
     vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
@@ -1528,7 +1484,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
         if (r > GF_RMAX)
             r = GF_RMAX;
 
-        loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
+        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
 
         // Cumulative effect of decay
         decay_accumulator = decay_accumulator * loop_decay_rate;
@@ -1537,48 +1493,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
         boost_score += (decay_accumulator * r);
 
         // Break clause to detect very still sections after motion
-        // For example a staic image after a fade or other transition
-        // instead of a clean key frame.
-        if ( (i > MIN_GF_INTERVAL) &&
-             (loop_decay_rate >= 0.999) &&
-             (decay_accumulator < 0.9) )
+        // For example a static image after a fade or other transition.
+        if ( detect_transition_to_still( cpi, i, 5,
+                                         loop_decay_rate, decay_accumulator ) )
         {
-            int j;
-            FIRSTPASS_STATS * position = cpi->stats_in;
-            FIRSTPASS_STATS tmp_next_frame;
-            double decay_rate;
-
-            // Look ahead a few frames to see if static condition
-            // persists...
-            for ( j = 0; j < 4; j++ )
-            {
-                if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
-                    break;
-
-                decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
-                if ( decay_rate < 0.999 )
-                    break;
-            }
-            reset_fpf_position(cpi, position);            // Reset file position
-
-            // Force GF not alt ref
-            if ( j == 4 )
-            {
-                if (0)
-                {
-                    FILE *f = fopen("fadegf.stt", "a");
-                    fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
-                         cpi->common.current_video_frame+i, i,
-                         loop_decay_rate, decay_accumulator,
-                         boost_score );
-                    fclose(f);
-                }
-
-                allow_alt_ref = FALSE;
-
-                boost_score = old_boost_score;
-                break;
-            }
+            allow_alt_ref = FALSE;
+            boost_score = old_boost_score;
+            break;
         }
 
         // Break out conditions.
@@ -1686,7 +1607,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
         arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks));
 
         // Estimate if there are enough bits available to make worthwhile use of an arf.
-        tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width);
+        tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits);
 
         // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames.
         if (tmp_q < cpi->worst_quality)
@@ -1749,20 +1670,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
             }
 
             cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
-
-            {
-                // Advance to & read in the motion map for those frames
-                // to be considered for filtering based on the position
-                // of the ARF
-                vp8_fpmm_reset_pos(cpi, cpi->fp_motion_map_stats_save);
-
-                // Position at the 'earliest' frame to be filtered
-                vp8_advance_fpmm(cpi,
-                    cpi->baseline_gf_interval - frames_bwd);
-
-                // Read / create a motion map for the region of interest
-                vp8_input_fpmm(cpi);
-            }
         }
         else
         {
@@ -1992,9 +1899,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 
         reset_fpf_position(cpi, start_pos);
     }
-
-    // Reset the First pass motion map file position
-    vp8_fpmm_reset_pos(cpi, fpmm_pos);
 }
 
 // Allocate bits to a normal frame that is neither a gf an arf or a key frame.
@@ -2076,13 +1980,6 @@ void vp8_second_pass(VP8_COMP *cpi)
     if (EOF == vp8_input_stats(cpi, &this_frame))
         return;
 
-    vpx_memset(cpi->fp_motion_map, 0,
-                cpi->oxcf.arnr_max_frames*cpi->common.MBs);
-    cpi->fp_motion_map_stats_save = vp8_fpmm_get_pos(cpi);
-
-    // Step over this frame's first pass motion map
-    vp8_advance_fpmm(cpi, 1);
-
     this_frame_error = this_frame.ssim_weighted_pred_err;
     this_frame_intra_error = this_frame.intra_error;
     this_frame_coded_error = this_frame.coded_error;
@@ -2214,8 +2111,7 @@ void vp8_second_pass(VP8_COMP *cpi)
             est_cq =
                 estimate_cq( cpi,
                              (cpi->total_coded_error_left / frames_left),
-                             (int)(cpi->bits_left / frames_left),
-                             cpi->common.Height, cpi->common.Width);
+                             (int)(cpi->bits_left / frames_left));
 
             cpi->cq_target_quality = cpi->oxcf.cq_level;
             if ( est_cq > cpi->cq_target_quality )
@@ -2227,9 +2123,7 @@ void vp8_second_pass(VP8_COMP *cpi)
         cpi->maxq_min_limit = cpi->best_quality;
         tmp_q = estimate_max_q( cpi,
                                 (cpi->total_coded_error_left / frames_left),
-                                (int)(cpi->bits_left / frames_left),
-                                cpi->common.Height,
-                                cpi->common.Width);
+                                (int)(cpi->bits_left / frames_left));
 
         // Limit the maxq value returned subsequently.
         // This increases the risk of overspend or underspend if the initial
@@ -2257,7 +2151,7 @@ void vp8_second_pass(VP8_COMP *cpi)
         if (frames_left < 1)
             frames_left = 1;
 
-        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
+        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left));
 
         // Move active_worst_quality but in a damped way
         if (tmp_q > cpi->active_worst_quality)
@@ -2285,7 +2179,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRST
         (next_frame->pcnt_second_ref < 0.10) &&
         ((this_frame->pcnt_inter < 0.05) ||
          (
-             (this_frame->pcnt_inter < .25) &&
+             ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) &&
              ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
              ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
               (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
@@ -2332,7 +2226,9 @@ static BOOL test_candidate_kf(VP8_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRST
             // Test various breakout clauses
             if ((local_next_frame.pcnt_inter < 0.05) ||
                 (next_iiratio < 1.5) ||
-                ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) ||
+                (((local_next_frame.pcnt_inter -
+                   local_next_frame.pcnt_neutral) < 0.20) &&
+                 (next_iiratio < 3.0)) ||
                 ((boost_score - old_boost_score) < 0.5) ||
                 (local_next_frame.intra_error < 200)
                )
@@ -2363,13 +2259,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRST
 }
 void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
-    int i;
+    int i,j;
     FIRSTPASS_STATS last_frame;
     FIRSTPASS_STATS first_frame;
     FIRSTPASS_STATS next_frame;
     FIRSTPASS_STATS *start_position;
 
-    double decay_accumulator = 0;
+    double decay_accumulator = 1.0;
     double boost_score = 0;
     double old_boost_score = 0.0;
     double loop_decay_rate;
@@ -2379,6 +2275,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
     double kf_group_intra_err = 0.0;
     double kf_group_coded_err = 0.0;
     double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+    double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
 
     vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
 
@@ -2407,6 +2304,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
     kf_mod_err = calculate_modified_err(cpi, this_frame);
 
     // find the next keyframe
+    i = 0;
     while (cpi->stats_in < cpi->stats_in_end)
     {
         // Accumulate kf group error
@@ -2425,9 +2323,34 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
         if (cpi->oxcf.auto_key
             && lookup_next_frame_stats(cpi, &next_frame) != EOF)
         {
+            // Normal scene cut check
             if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
                 break;
 
+            // How fast is prediction quality decaying
+            loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+            // We want to know something about the recent past... rather than
+            // as used elsewhere where we are concerned with decay in prediction
+            // quality since the last GF or KF.
+            recent_loop_decay[i%8] = loop_decay_rate;
+            decay_accumulator = 1.0;
+            for (j = 0; j < 8; j++)
+            {
+                decay_accumulator = decay_accumulator * recent_loop_decay[j];
+            }
+
+            // Special check for a transition or high motion followed by a
+            // static scene.
+            if ( detect_transition_to_still( cpi, i,
+                                             (cpi->key_frame_frequency-i),
+                                             loop_decay_rate,
+                                             decay_accumulator ) )
+            {
+                break;
+            }
+
+
             // Step on to the next frame
             cpi->frames_to_key ++;
 
@@ -2437,6 +2360,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
                 break;
         } else
             cpi->frames_to_key ++;
+
+        i++;
     }
 
     // If there is a max kf interval set by the user we must obey it.
@@ -2588,32 +2513,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
         if (r > RMAX)
             r = RMAX;
 
-        // Adjust loop decay rate
-        //if ( next_frame.pcnt_inter < loop_decay_rate )
-        loop_decay_rate = next_frame.pcnt_inter;
-
-        // High % motion -> somewhat higher decay rate
-        motion_pct = next_frame.pcnt_motion;
-        motion_decay = (1.0 - (motion_pct / 20.0));
-        if (motion_decay < loop_decay_rate)
-            loop_decay_rate = motion_decay;
-
-        // Adjustment to decay rate based on speed of motion
-        {
-            double this_mv_rabs;
-            double this_mv_cabs;
-            double distance_factor;
-
-            this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
-            this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
-
-            distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
-                                   (this_mv_cabs * this_mv_cabs)) / 250.0;
-            distance_factor = ((distance_factor > 1.0)
-                                    ? 0.0 : (1.0 - distance_factor));
-            if (distance_factor < loop_decay_rate)
-                loop_decay_rate = distance_factor;
-        }
+        // How fast is prediction quality decaying
+        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
 
         decay_accumulator = decay_accumulator * loop_decay_rate;
         decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
@@ -2859,7 +2760,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
             bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
 
         // Work out if spatial resampling is necessary
-        kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio);
+        kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, group_iiratio);
 
         // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section
         projected_bits_perframe = bits_per_frame;
@@ -2930,7 +2831,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
                 effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
 
                 // Now try again and see what Q we get with the smaller image size
-                kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio);
+                kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, group_iiratio);
 
                 if (0)
                 {
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index fc0580d55..81108fe96 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -103,6 +103,10 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
     // Pure C:
     vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
 
+#if CONFIG_PSNR
+    cpi->rtcd.variance.ssimpf_8x8            = ssim_parms_8x8_c;
+    cpi->rtcd.variance.ssimpf                = ssim_parms_c;
+#endif
 
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_encoder_init(cpi);
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 33aaa2ca9..c210c1de2 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1415,7 +1415,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
     int col_min = ref_col - distance;
     int col_max = ref_col + distance;
 
-    unsigned short sad_array8[8];
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
     unsigned int sad_array[3];
 
     // Work out the mid point for the search
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 39610a73f..8965634fe 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -86,9 +86,11 @@ extern double vp8_calc_ssim
     YV12_BUFFER_CONFIG *source,
     YV12_BUFFER_CONFIG *dest,
     int lumamask,
-    double *weight
+    double *weight,
+    const vp8_variance_rtcd_vtable_t *rtcd
 );
 
+
 extern double vp8_calc_ssimg
 (
     YV12_BUFFER_CONFIG *source,
@@ -281,12 +283,6 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi)
     vpx_free(cpi->active_map);
     cpi->active_map = 0;
 
-#if !(CONFIG_REALTIME_ONLY)
-    // Delete first pass motion map
-    vpx_free(cpi->fp_motion_map);
-    cpi->fp_motion_map = 0;
-#endif
-
     vp8_de_alloc_frame_buffers(&cpi->common);
 
     vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
@@ -1360,11 +1356,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
 #if !(CONFIG_REALTIME_ONLY)
         vpx_free(cpi->total_stats);
 
-    cpi->total_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+    cpi->total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
 
         vpx_free(cpi->this_frame_stats);
 
-    cpi->this_frame_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+    cpi->this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
 
     if(!cpi->total_stats || !cpi->this_frame_stats)
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
@@ -1462,8 +1458,7 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     VP8_COMP *cpi = (VP8_COMP *)(ptr);
     VP8_COMMON *cm = &cpi->common;
 
-    if (!cpi)
-        return;
+    cpi->oxcf = *oxcf;
 
     cpi->auto_gold = 1;
     cpi->auto_adjust_gold_quantizer = 1;
@@ -1475,299 +1470,31 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     cm->version = oxcf->Version;
     vp8_setup_version(cm);
 
-    if (oxcf == 0)
-    {
-        cpi->pass                     = 0;
-
-        cpi->auto_worst_q              = 0;
-        cpi->oxcf.best_allowed_q            = MINQ;
-        cpi->oxcf.worst_allowed_q           = MAXQ;
-        cpi->oxcf.cq_level = MINQ;
-
-        cpi->oxcf.end_usage                = USAGE_STREAM_FROM_SERVER;
-        cpi->oxcf.starting_buffer_level     =   4000;
-        cpi->oxcf.optimal_buffer_level      =   5000;
-        cpi->oxcf.maximum_buffer_size       =   6000;
-        cpi->oxcf.under_shoot_pct           =  90;
-        cpi->oxcf.allow_df                 =   0;
-        cpi->oxcf.drop_frames_water_mark     =  20;
-
-        cpi->oxcf.allow_spatial_resampling  = 0;
-        cpi->oxcf.resample_down_water_mark   = 40;
-        cpi->oxcf.resample_up_water_mark     = 60;
-
-        cpi->oxcf.fixed_q = cpi->interquantizer;
-
-        cpi->filter_type = NORMAL_LOOPFILTER;
-
-        if (cm->simpler_lpf)
-            cpi->filter_type = SIMPLE_LOOPFILTER;
-
-        cpi->compressor_speed = 1;
-        cpi->horiz_scale = 0;
-        cpi->vert_scale = 0;
-        cpi->oxcf.two_pass_vbrbias = 50;
-        cpi->oxcf.two_pass_vbrmax_section = 400;
-        cpi->oxcf.two_pass_vbrmin_section = 0;
-
-        cpi->oxcf.Sharpness = 0;
-        cpi->oxcf.noise_sensitivity = 0;
-    }
-    else
-        cpi->oxcf = *oxcf;
-
-
-    switch (cpi->oxcf.Mode)
-    {
-
-    case MODE_REALTIME:
-        cpi->pass = 0;
-        cpi->compressor_speed = 2;
-
-        if (cpi->oxcf.cpu_used < -16)
-        {
-            cpi->oxcf.cpu_used = -16;
-        }
-
-        if (cpi->oxcf.cpu_used > 16)
-            cpi->oxcf.cpu_used = 16;
-
-        break;
-
-#if !(CONFIG_REALTIME_ONLY)
-    case MODE_GOODQUALITY:
-        cpi->pass = 0;
-        cpi->compressor_speed = 1;
-
-        if (cpi->oxcf.cpu_used < -5)
-        {
-            cpi->oxcf.cpu_used = -5;
-        }
-
-        if (cpi->oxcf.cpu_used > 5)
-            cpi->oxcf.cpu_used = 5;
-
-        break;
-
-    case MODE_BESTQUALITY:
-        cpi->pass = 0;
-        cpi->compressor_speed = 0;
-        break;
-
-    case MODE_FIRSTPASS:
-        cpi->pass = 1;
-        cpi->compressor_speed = 1;
-        break;
-    case MODE_SECONDPASS:
-        cpi->pass = 2;
-        cpi->compressor_speed = 1;
-
-        if (cpi->oxcf.cpu_used < -5)
-        {
-            cpi->oxcf.cpu_used = -5;
-        }
-
-        if (cpi->oxcf.cpu_used > 5)
-            cpi->oxcf.cpu_used = 5;
-
-        break;
-    case MODE_SECONDPASS_BEST:
-        cpi->pass = 2;
-        cpi->compressor_speed = 0;
-        break;
-#endif
-    }
-
-    if (cpi->pass == 0)
-        cpi->auto_worst_q = 1;
-
-    cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
-    cpi->oxcf.best_allowed_q  = q_trans[oxcf->best_allowed_q];
-    cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
-
-    if (oxcf->fixed_q >= 0)
-    {
-        if (oxcf->worst_allowed_q < 0)
-            cpi->oxcf.fixed_q = q_trans[0];
-        else
-            cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
-
-        if (oxcf->alt_q < 0)
-            cpi->oxcf.alt_q = q_trans[0];
-        else
-            cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
-
-        if (oxcf->key_q < 0)
-            cpi->oxcf.key_q = q_trans[0];
-        else
-            cpi->oxcf.key_q = q_trans[oxcf->key_q];
-
-        if (oxcf->gold_q < 0)
-            cpi->oxcf.gold_q = q_trans[0];
-        else
-            cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
-
-    }
-
-    cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
-    cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
-
-    //cpi->use_golden_frame_only = 0;
-    //cpi->use_last_frame_only = 0;
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
-    cm->refresh_entropy_probs = 1;
-
-    if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
-        cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
-
-    setup_features(cpi);
-
-    {
-        int i;
-
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
-    }
-
-    // At the moment the first order values may not be > MAXQ
-    if (cpi->oxcf.fixed_q > MAXQ)
-        cpi->oxcf.fixed_q = MAXQ;
-
-    // local file playback mode == really big buffer
-    if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
-    {
-        cpi->oxcf.starting_buffer_level   = 60000;
-        cpi->oxcf.optimal_buffer_level    = 60000;
-        cpi->oxcf.maximum_buffer_size     = 240000;
-
-    }
+    // vp8_change_config() applies all settings shared by init and change
+    vp8_change_config(ptr, oxcf);
 
+    // Initialize active best and worst q and average q values.
+    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
+    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
+    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
 
-    // Convert target bandwidth from Kbit/s to Bit/s
-    cpi->oxcf.target_bandwidth       *= 1000;
+    // Initialise the starting buffer levels
     cpi->oxcf.starting_buffer_level =
         rescale(cpi->oxcf.starting_buffer_level,
                 cpi->oxcf.target_bandwidth, 1000);
 
-    if (cpi->oxcf.optimal_buffer_level == 0)
-        cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
-    else
-        cpi->oxcf.optimal_buffer_level =
-            rescale(cpi->oxcf.optimal_buffer_level,
-                    cpi->oxcf.target_bandwidth, 1000);
-
-    if (cpi->oxcf.maximum_buffer_size == 0)
-        cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
-    else
-        cpi->oxcf.maximum_buffer_size =
-            rescale(cpi->oxcf.maximum_buffer_size,
-                    cpi->oxcf.target_bandwidth, 1000);
-
-    cpi->buffer_level                = cpi->oxcf.starting_buffer_level;
+    cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
     cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
 
-    vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
-    cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
-    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
-    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
-    cpi->best_quality                = cpi->oxcf.best_allowed_q;
-    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
-    cpi->cq_target_quality = cpi->oxcf.cq_level;
-
-    cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
-
     cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
     cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_target_bits      = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_actual_bits      = cpi->av_per_frame_bandwidth;
+    cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;
+    cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;
 
     cpi->total_actual_bits            = 0;
-    cpi->total_target_vs_actual        = 0;
-
-    // Only allow dropped frames in buffered mode
-    cpi->drop_frames_allowed          = cpi->oxcf.allow_df && cpi->buffered_mode;
-
-    cm->filter_type      = (LOOPFILTERTYPE) cpi->filter_type;
-
-    if (!cm->use_bilinear_mc_filter)
-        cm->mcomp_filter_type = SIXTAP;
-    else
-        cm->mcomp_filter_type = BILINEAR;
-
-    cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
-
-    cm->Width       = cpi->oxcf.Width     ;
-    cm->Height      = cpi->oxcf.Height    ;
-
-    cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
-
-    cm->horiz_scale  = cpi->horiz_scale;
-    cm->vert_scale   = cpi->vert_scale ;
-
-    // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
-    if (cpi->oxcf.Sharpness > 7)
-        cpi->oxcf.Sharpness = 7;
-
-    cm->sharpness_level = cpi->oxcf.Sharpness;
-
-    if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
-    {
-        int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
-        int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
-
-        Scale2Ratio(cm->horiz_scale, &hr, &hs);
-        Scale2Ratio(cm->vert_scale, &vr, &vs);
-
-        // always go to the next whole number
-        cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
-        cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
-    }
-
-    if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
-        ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
-        cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
-    {
-        alloc_raw_frame_buffers(cpi);
-        vp8_alloc_compressor_data(cpi);
-    }
-
-    // Clamp KF frame size to quarter of data rate
-    if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
-        cpi->intra_frame_target = cpi->target_bandwidth >> 2;
-
-    if (cpi->oxcf.fixed_q >= 0)
-    {
-        cpi->last_q[0] = cpi->oxcf.fixed_q;
-        cpi->last_q[1] = cpi->oxcf.fixed_q;
-    }
-
-    cpi->Speed = cpi->oxcf.cpu_used;
-
-    // force to allowlag to 0 if lag_in_frames is 0;
-    if (cpi->oxcf.lag_in_frames == 0)
-    {
-        cpi->oxcf.allow_lag = 0;
-    }
-    // Limit on lag buffers as these are not currently dynamically allocated
-    else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
-        cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
-
-    // YX Temp
-    cpi->last_alt_ref_sei    = -1;
-    cpi->is_src_frame_alt_ref = 0;
-    cpi->is_next_src_alt_ref = 0;
-
-#if 0
-    // Experimental RD Code
-    cpi->frame_distortion = 0;
-    cpi->last_frame_distortion = 0;
-#endif
+    cpi->total_target_vs_actual       = 0;
 
 #if VP8_TEMPORAL_ALT_REF
-
-    cpi->use_weighted_temporal_filter = 0;
-
     {
         int i;
 
@@ -1779,12 +1506,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 #endif
 }
 
-/*
- * This function needs more clean up, i.e. be more tuned torwards
- * change_config rather than init_config  !!!!!!!!!!!!!!!!
- * YX - 5/28/2009
- *
- */
 
 void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 {
@@ -1897,7 +1618,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 
     }
 
-    cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+    cpi->baseline_gf_interval =
+        cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
 
     cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
 
@@ -1908,7 +1630,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     cm->refresh_entropy_probs = 1;
 
     if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
-        cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+        cm->multi_token_partition =
+            (TOKEN_PARTITION) cpi->oxcf.token_partitions;
 
     setup_features(cpi);
 
@@ -1929,16 +1652,12 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
         cpi->oxcf.starting_buffer_level   = 60000;
         cpi->oxcf.optimal_buffer_level    = 60000;
         cpi->oxcf.maximum_buffer_size     = 240000;
-
     }
 
     // Convert target bandwidth from Kbit/s to Bit/s
     cpi->oxcf.target_bandwidth       *= 1000;
 
-    cpi->oxcf.starting_buffer_level =
-        rescale(cpi->oxcf.starting_buffer_level,
-                cpi->oxcf.target_bandwidth, 1000);
-
+    // Set or reset optimal and maximum buffer levels.
     if (cpi->oxcf.optimal_buffer_level == 0)
         cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
     else
@@ -1953,31 +1672,41 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
             rescale(cpi->oxcf.maximum_buffer_size,
                     cpi->oxcf.target_bandwidth, 1000);
 
-    cpi->buffer_level                = cpi->oxcf.starting_buffer_level;
-    cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
-
+    // Set up the frame rate and the related rate control parameters.
     vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+
+    // Set absolute upper and lower quality limits
     cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
-    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
-    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
     cpi->best_quality                = cpi->oxcf.best_allowed_q;
-    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
-    cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
 
-    cpi->cq_target_quality = cpi->oxcf.cq_level;
+    // active values should only be modified if out of new range
+    if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
+    {
+      cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+    }
+    // less likely
+    else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
+    {
+      cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+    }
+    if (cpi->active_best_quality < cpi->oxcf.best_allowed_q)
+    {
+      cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+    }
+    // less likely
+    else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
+    {
+      cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+    }
 
-    cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
-    cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_target_bits      = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_actual_bits      = cpi->av_per_frame_bandwidth;
+    cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
 
-    cpi->total_actual_bits            = 0;
-    cpi->total_target_vs_actual        = 0;
+    cpi->cq_target_quality = cpi->oxcf.cq_level;
 
     // Only allow dropped frames in buffered mode
-    cpi->drop_frames_allowed          = cpi->oxcf.allow_df && cpi->buffered_mode;
+    cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
 
-    cm->filter_type                  = (LOOPFILTERTYPE) cpi->filter_type;
+    cm->filter_type          = (LOOPFILTERTYPE) cpi->filter_type;
 
     if (!cm->use_bilinear_mc_filter)
         cm->mcomp_filter_type = SIXTAP;
@@ -1992,7 +1721,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     cm->horiz_scale  = cpi->horiz_scale;
     cm->vert_scale   = cpi->vert_scale ;
 
-    cpi->intra_frame_target           = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
+    // As per VP8
+    cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000;
 
     // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
     if (cpi->oxcf.Sharpness > 7)
@@ -2013,8 +1743,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
         cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
     }
 
-    if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
-        ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
+    if (((cm->Width + 15) & 0xfffffff0) !=
+          cm->yv12_fb[cm->lst_fb_idx].y_width ||
+        ((cm->Height + 15) & 0xfffffff0) !=
+          cm->yv12_fb[cm->lst_fb_idx].y_height ||
         cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
     {
         alloc_raw_frame_buffers(cpi);
@@ -2153,12 +1885,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
     vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols));
     cpi->active_map_enabled = 0;
 
-#if !(CONFIG_REALTIME_ONLY)
-    // Create the first pass motion map structure and set to 0
-    // Allocate space for maximum of 15 buffers
-    CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(15*cpi->common.MBs, 1));
-#endif
-
 #if 0
     // Experimental code for lagged and one pass
     // Initialise one_pass GF frames stats
@@ -2308,7 +2034,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
     }
     else if (cpi->pass == 2)
     {
-        size_t packet_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
+        size_t packet_sz = sizeof(FIRSTPASS_STATS);
         int packets = oxcf->two_pass_stats_in.sz / packet_sz;
 
         cpi->stats_in = oxcf->two_pass_stats_in.buf;
@@ -3509,6 +3235,89 @@ static BOOL recode_loop_test( VP8_COMP *cpi,
     return force_recode;
 }
 
+/* Pick the loop filter level, filter and extend the new frame, then do any
+ * signalled golden / alt-ref buffer copies. */
+void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
+{
+    if (cm->no_lpf)
+    {
+        cm->filter_level = 0;
+    }
+    else
+    {
+        struct vpx_usec_timer timer;
+
+        vp8_clear_system_state();
+        vpx_usec_timer_start(&timer);
+        if (cpi->sf.auto_filter == 0)
+            vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+        else
+            vp8cx_pick_filter_level(cpi->Source, cpi);
+        vpx_usec_timer_mark(&timer);
+        cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+    }
+
+#if CONFIG_MULTITHREAD
+    if (cpi->b_multi_threaded) /* only the threaded caller waits on this */
+        sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+#endif
+
+    if (cm->filter_level > 0)
+    {
+        vp8cx_set_alt_lf_level(cpi, cm->filter_level);
+        vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
+        cm->last_filter_type = cm->filter_type;
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+
+    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+
+    {
+        YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+        YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+        YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+        YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];
+        // At this point the new frame has been encoded.
+        // If any buffer copy / swapping is signaled it should be done here.
+        if (cm->frame_type == KEY_FRAME)
+        {
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);
+        }
+        else    // For non key frames
+        {
+            // Code to copy between reference buffers
+            if (cm->copy_buffer_to_arf)
+            {
+                if (cm->copy_buffer_to_arf == 1)
+                {
+                    if (cm->refresh_last_frame)
+                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
+                        vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12);
+                    else
+                        vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12);
+                }
+                else if (cm->copy_buffer_to_arf == 2)
+                    vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);
+            }
+
+            if (cm->copy_buffer_to_gf)
+            {
+                if (cm->copy_buffer_to_gf == 1)
+                {
+                    if (cm->refresh_last_frame)
+                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
+                        vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12);
+                    else
+                        vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
+                }
+                else if (cm->copy_buffer_to_gf == 2)
+                    vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);
+            }
+        }
+    }
+}
+
 static void encode_frame_to_data_rate
 (
     VP8_COMP *cpi,
@@ -3542,6 +3351,7 @@ static void encode_frame_to_data_rate
     int drop_mark50 = drop_mark / 4;
     int drop_mark25 = drop_mark / 8;
 
+
     // Clear down mmx registers to allow floating point in what follows
     vp8_clear_system_state();
 
@@ -3862,11 +3672,12 @@ static void encode_frame_to_data_rate
             }
         }
 
-        // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames
-        // to prevent bits just going to waste.
+        // If CBR and the buffer is as full then it is reasonable to allow
+        // higher quality on the frames to prevent bits just going to waste.
         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
         {
-            // Note that the use of >= here elliminates the risk of a devide by 0 error in the else if clause
+            // Note that the use of >= here eliminates the risk of a divide
+            // by 0 error in the else if clause
             if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
                 cpi->active_best_quality = cpi->best_quality;
 
@@ -3879,6 +3690,20 @@ static void encode_frame_to_data_rate
             }
         }
     }
+    // Make sure constrained quality mode limits are adhered to for the first
+    // few frames of one pass encodes
+    else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+    {
+        if ( (cm->frame_type == KEY_FRAME) ||
+             cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame )
+        {
+             cpi->active_best_quality = cpi->best_quality;
+        }
+        else if (cpi->active_best_quality < cpi->cq_target_quality)
+        {
+            cpi->active_best_quality = cpi->cq_target_quality;
+        }
+    }
 
     // Clip the active best and worst quality values to limits
     if (cpi->active_worst_quality > cpi->worst_quality)
@@ -4058,8 +3883,8 @@ static void encode_frame_to_data_rate
             vp8_setup_key_frame(cpi);
 
         // transform / motion compensation build reconstruction frame
-
         vp8_encode_frame(cpi);
+
         cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
         cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
 
@@ -4408,92 +4233,43 @@ static void encode_frame_to_data_rate
     else
         cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
 
-    if (cm->no_lpf)
+
+#if CONFIG_MULTITHREAD
+    if (cpi->b_multi_threaded)
     {
-        cm->filter_level = 0;
+        sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */
     }
     else
+#endif
     {
-        struct vpx_usec_timer timer;
-
-        vpx_usec_timer_start(&timer);
-
-        if (cpi->sf.auto_filter == 0)
-            vp8cx_pick_filter_level_fast(cpi->Source, cpi);
-        else
-            vp8cx_pick_filter_level(cpi->Source, cpi);
-
-        vpx_usec_timer_mark(&timer);
-
-        cpi->time_pick_lpf +=  vpx_usec_timer_elapsed(&timer);
-    }
-
-    if (cm->filter_level > 0)
-    {
-        vp8cx_set_alt_lf_level(cpi, cm->filter_level);
-        vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
-        cm->last_filter_type = cm->filter_type;
-        cm->last_sharpness_level = cm->sharpness_level;
+        loopfilter_frame(cpi, cm);
     }
 
-    /* Move storing frame_type out of the above loop since it is also
-     * needed in motion search besides loopfilter */
-    cm->last_frame_type = cm->frame_type;
-
-    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-
     if (cpi->oxcf.error_resilient_mode == 1)
     {
         cm->refresh_entropy_probs = 0;
     }
 
+#if CONFIG_MULTITHREAD
+    /* wait until filter_level has been picked so that we can continue with stream packing */
+    if (cpi->b_multi_threaded)
+        sem_wait(&cpi->h_event_end_lpf);
+#endif
+
     // build the bitstream
     vp8_pack_bitstream(cpi, dest, size);
 
+#if CONFIG_MULTITHREAD
+    /* wait for loopfilter thread done */
+    if (cpi->b_multi_threaded)
     {
-        YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
-        YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-        YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
-        YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];
-        // At this point the new frame has been encoded coded.
-        // If any buffer copy / swaping is signalled it should be done here.
-        if (cm->frame_type == KEY_FRAME)
-        {
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);
-        }
-        else    // For non key frames
-        {
-            // Code to copy between reference buffers
-            if (cm->copy_buffer_to_arf)
-            {
-                if (cm->copy_buffer_to_arf == 1)
-                {
-                    if (cm->refresh_last_frame)
-                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
-                        vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12);
-                    else
-                        vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12);
-                }
-                else if (cm->copy_buffer_to_arf == 2)
-                    vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);
-            }
-
-            if (cm->copy_buffer_to_gf)
-            {
-                if (cm->copy_buffer_to_gf == 1)
-                {
-                    if (cm->refresh_last_frame)
-                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
-                        vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12);
-                    else
-                        vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
-                }
-                else if (cm->copy_buffer_to_gf == 2)
-                    vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);
-            }
-        }
+        sem_wait(&cpi->h_event_end_lpf);
     }
+#endif
+
+    /* Move storing frame_type out of the above loop since it is also
+     * needed in motion search besides loopfilter */
+      cm->last_frame_type = cm->frame_type;
 
     // Update rate control heuristics
     cpi->total_byte_count += (*size);
@@ -5328,7 +5104,9 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
     cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
 
     if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
+    {
         generate_psnr_packet(cpi);
+    }
 
 #if CONFIG_PSNR
 
@@ -5344,12 +5122,35 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
             if (cpi->b_calculate_psnr)
             {
                 double y, u, v;
-                double sq_error;
-                double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error);
-
-                cpi->total_y += y;
-                cpi->total_u += u;
-                cpi->total_v += v;
+                double ye,ue,ve;
+                double frame_psnr;
+                YV12_BUFFER_CONFIG      *orig = cpi->Source;
+                YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
+                YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
+                int y_samples = orig->y_height * orig->y_width ;
+                int uv_samples = orig->uv_height * orig->uv_width ;
+                int t_samples = y_samples + 2 * uv_samples;
+                long long sq_error;
+
+                ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                  recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height,
+                  IF_RTCD(&cpi->rtcd.variance));
+
+                ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                  recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,
+                  IF_RTCD(&cpi->rtcd.variance));
+
+                ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                  recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,
+                  IF_RTCD(&cpi->rtcd.variance));
+
+                sq_error = ye + ue + ve;
+
+                frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+
+                cpi->total_y += vp8_mse2psnr(y_samples, 255.0, ye);
+                cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, ue);
+                cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve);
                 cpi->total_sq_error += sq_error;
                 cpi->total  += frame_psnr;
                 {
@@ -5358,17 +5159,35 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
 
                     vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
                     vp8_clear_system_state();
-                    frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error);
-                    frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight);
 
-                    cpi->summed_quality += frame_ssim2 * weight;
-                    cpi->summed_weights += weight;
+                    ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                      pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height,
+                      IF_RTCD(&cpi->rtcd.variance));
 
-                    cpi->totalp_y += y2;
-                    cpi->totalp_u += u2;
-                    cpi->totalp_v += v2;
-                    cpi->totalp  += frame_psnr2;
+                    ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                      pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,
+                      IF_RTCD(&cpi->rtcd.variance));
+
+                    ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                      pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,
+                      IF_RTCD(&cpi->rtcd.variance));
+
+                    sq_error = ye + ue + ve;
+
+                    frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error);
+
+                    cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye);
+                    cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue);
+                    cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve);
                     cpi->total_sq_error2 += sq_error;
+                    cpi->totalp  += frame_psnr2;
+
+                    frame_ssim2 = vp8_calc_ssim(cpi->Source,
+                      &cm->post_proc_buffer, 1, &weight,
+                      IF_RTCD(&cpi->rtcd.variance));
+
+                    cpi->summed_quality += frame_ssim2 * weight;
+                    cpi->summed_weights += weight;
 
                 }
             }
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index b66131d15..0e53f6803 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -99,6 +99,7 @@ typedef struct
     double pcnt_inter;
     double pcnt_motion;
     double pcnt_second_ref;
+    double pcnt_neutral;
     double MVr;
     double mvr_abs;
     double MVc;
@@ -495,11 +496,6 @@ typedef struct
     struct vpx_codec_pkt_list  *output_pkt_list;
     int                          first_pass_done;
 
-#if !(CONFIG_REALTIME_ONLY)
-    unsigned char *fp_motion_map;
-    unsigned char *fp_motion_map_stats, *fp_motion_map_stats_save;
-#endif
-
 #if 0
     // Experimental code for lagged and one pass
     ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
@@ -603,12 +599,17 @@ typedef struct
     int encoding_thread_count;
 
     pthread_t *h_encoding_thread;
+    pthread_t h_filter_thread;
+
     MB_ROW_COMP *mb_row_ei;
     ENCODETHREAD_DATA *en_thread_data;
+    LPFTHREAD_DATA lpf_thread_data;
 
     //events
     sem_t *h_event_start_encoding;
     sem_t h_event_end_encoding;
+    sem_t h_event_start_lpf;
+    sem_t h_event_end_lpf;
 #endif
 
     TOKENLIST *tplist;
@@ -641,8 +642,6 @@ typedef struct
     YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
     int fixed_divide[512];
 #endif
-    // Flag to indicate temporal filter method
-    int use_weighted_temporal_filter;
 
 #if CONFIG_PSNR
     int    count;
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index bfffe43d9..9797f5f25 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -842,7 +842,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
         {
             int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100;
 
-            if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
+            if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) ||
+                (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
             {
                 int percent_low = 0;
 
@@ -851,9 +852,12 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
                 // If we are are below the optimal buffer fullness level and adherence
                 // to buffering contraints is important to the end useage then adjust
                 // the per frame target.
-                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
+                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+                    (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
                 {
-                    percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits;
+                    percent_low =
+                        (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
+                        one_percent_bits;
 
                     if (percent_low > 100)
                         percent_low = 100;
@@ -864,7 +868,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
                 else if (cpi->bits_off_target < 0)
                 {
                     // Adjust per frame data target downwards to compensate.
-                    percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8));
+                    percent_low = (int)(100 * -cpi->bits_off_target /
+                                       (cpi->total_byte_count * 8));
 
                     if (percent_low > 100)
                         percent_low = 100;
@@ -873,39 +878,60 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
                 }
 
                 // lower the target bandwidth for this frame.
-                cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
+                cpi->this_frame_target =
+                    (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
 
-                // Are we using allowing control of active_worst_allowed_q according to buffer level.
+                // Are we allowing control of active_worst_allowed_q
+                // according to buffer level?
                 if (cpi->auto_worst_q)
                 {
                     int critical_buffer_level;
 
-                    // For streaming applications the most important factor is cpi->buffer_level as this takes
-                    // into account the specified short term buffering constraints. However, hitting the long
-                    // term clip data rate target is also important.
+                    // For streaming applications the most important factor is
+                    // cpi->buffer_level as this takes into account the
+                    // specified short term buffering constraints. However,
+                    // hitting the long term clip data rate target is also
+                    // important.
                     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
                     {
-                        // Take the smaller of cpi->buffer_level and cpi->bits_off_target
-                        critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target;
+                        // Take the smaller of cpi->buffer_level and
+                        // cpi->bits_off_target
+                        critical_buffer_level =
+                            (cpi->buffer_level < cpi->bits_off_target)
+                            ? cpi->buffer_level : cpi->bits_off_target;
                     }
-                    // For local file playback short term buffering contraints are less of an issue
+                    // For local file playback short term buffering constraints
+                    // are less of an issue
                     else
                     {
-                        // Consider only how we are doing for the clip as a whole
+                        // Consider only how we are doing for the clip as a
+                        // whole
                         critical_buffer_level = cpi->bits_off_target;
                     }
 
-                    // Set the active worst quality based upon the selected buffer fullness number.
+                    // Set the active worst quality based upon the selected
+                    // buffer fullness number.
                     if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
                     {
-                        if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4))
+                        if ( critical_buffer_level >
+                             (cpi->oxcf.optimal_buffer_level >> 2) )
                         {
-                            int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi;
-                            int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4));
-
-                            // Step active worst quality down from cpi->ni_av_qi when (critical_buffer_level == cpi->optimal_buffer_level)
-                            // to cpi->oxcf.worst_allowed_q when (critical_buffer_level == cpi->optimal_buffer_level/4)
-                            cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4));
+                            INT64 qadjustment_range =
+                                      cpi->worst_quality - cpi->ni_av_qi;
+                            INT64 above_base =
+                                      (critical_buffer_level -
+                                       (cpi->oxcf.optimal_buffer_level >> 2));
+
+                            // Step active worst quality down from
+                            // cpi->ni_av_qi when (critical_buffer_level ==
+                            // cpi->optimal_buffer_level) to
+                            // cpi->worst_quality when
+                            // (critical_buffer_level ==
+                            //     cpi->optimal_buffer_level >> 2)
+                            cpi->active_worst_quality =
+                                cpi->worst_quality -
+                                ((qadjustment_range * above_base) /
+                                 (cpi->oxcf.optimal_buffer_level*3>>2));
                         }
                         else
                         {
@@ -965,6 +991,15 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
             // Set the active worst quality
             cpi->active_worst_quality = cpi->worst_quality;
         }
+
+        // Special trap for constrained quality mode
+        // "active_worst_quality" may never drop below cq level
+        // for any frame type.
+        if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+             cpi->active_worst_quality < cpi->cq_target_quality)
+        {
+            cpi->active_worst_quality = cpi->cq_target_quality;
+        }
     }
 
     // Test to see if we have to drop a frame
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 8aaca0917..b0dcfe0a4 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1968,7 +1968,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                     else
                         cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
                 }
-                else if (vp8_ref_frame_order[mode_index] == SPLITMV)
+                else if (vp8_mode_order[mode_index] == SPLITMV)
                     cpi->zbin_mode_boost = 0;
                 else
                     cpi->zbin_mode_boost = MV_ZBIN_BOOST;
diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
index 4ebcba1a1..64d67c6dd 100644
--- a/vp8/encoder/ssim.c
+++ b/vp8/encoder/ssim.c
@@ -11,298 +11,13 @@
 
 #include "vpx_scale/yv12config.h"
 #include "math.h"
+#include "onyx_int.h"
 
-#define C1 (float)(64 * 64 * 0.01*255*0.01*255)
-#define C2 (float)(64 * 64 * 0.03*255*0.03*255)
-
-static int width_y;
-static int height_y;
-static int height_uv;
-static int width_uv;
-static int stride_uv;
-static int stride;
-static int lumimask;
-static int luminance;
-static double plane_summed_weights = 0;
-
-static short img12_sum_block[8*4096*4096*2] ;
-
-static short img1_sum[8*4096*2];
-static short img2_sum[8*4096*2];
-static int   img1_sq_sum[8*4096*2];
-static int   img2_sq_sum[8*4096*2];
-static int   img12_mul_sum[8*4096*2];
-
-
-double vp8_similarity
-(
-    int mu_x,
-    int mu_y,
-    int pre_mu_x2,
-    int pre_mu_y2,
-    int pre_mu_xy2
-)
-{
-    int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy;
-
-    mu_x2 = mu_x * mu_x;
-    mu_y2 = mu_y * mu_y;
-    mu_xy = mu_x * mu_y;
-
-    theta_x2 = 64 * pre_mu_x2 - mu_x2;
-    theta_y2 = 64 * pre_mu_y2 - mu_y2;
-    theta_xy = 64 * pre_mu_xy2 - mu_xy;
-
-    return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2));
-}
-
-double vp8_ssim
-(
-    const unsigned char *img1,
-    const unsigned char *img2,
-    int stride_img1,
-    int stride_img2,
-    int width,
-    int height
-)
-{
-    int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp;
-
-    double plane_quality, weight, mean;
-
-    short *img1_sum_ptr1, *img1_sum_ptr2;
-    short *img2_sum_ptr1, *img2_sum_ptr2;
-    int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2;
-    int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2;
-    int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2;
-
-    plane_quality = 0;
-
-    if (lumimask)
-        plane_summed_weights = 0.0f;
-    else
-        plane_summed_weights = (height - 7) * (width - 7);
-
-    //some prologue for the main loop
-    temp = 8 * width;
-
-    img1_sum_ptr1      = img1_sum + temp;
-    img2_sum_ptr1      = img2_sum + temp;
-    img1_sq_sum_ptr1   = img1_sq_sum + temp;
-    img2_sq_sum_ptr1   = img2_sq_sum + temp;
-    img12_mul_sum_ptr1 = img12_mul_sum + temp;
-
-    for (x = 0; x < width; x++)
-    {
-        img1_sum[x]      = img1[x];
-        img2_sum[x]      = img2[x];
-        img1_sq_sum[x]   = img1[x] * img1[x];
-        img2_sq_sum[x]   = img2[x] * img2[x];
-        img12_mul_sum[x] = img1[x] * img2[x];
-
-        img1_sum_ptr1[x]      = 0;
-        img2_sum_ptr1[x]      = 0;
-        img1_sq_sum_ptr1[x]   = 0;
-        img2_sq_sum_ptr1[x]   = 0;
-        img12_mul_sum_ptr1[x] = 0;
-    }
-
-    //the main loop
-    for (y = 1; y < height; y++)
-    {
-        img1 += stride_img1;
-        img2 += stride_img2;
-
-        temp = (y - 1) % 9 * width;
-
-        img1_sum_ptr1      = img1_sum + temp;
-        img2_sum_ptr1      = img2_sum + temp;
-        img1_sq_sum_ptr1   = img1_sq_sum + temp;
-        img2_sq_sum_ptr1   = img2_sq_sum + temp;
-        img12_mul_sum_ptr1 = img12_mul_sum + temp;
-
-        temp = y % 9 * width;
-
-        img1_sum_ptr2      = img1_sum + temp;
-        img2_sum_ptr2      = img2_sum + temp;
-        img1_sq_sum_ptr2   = img1_sq_sum + temp;
-        img2_sq_sum_ptr2   = img2_sq_sum + temp;
-        img12_mul_sum_ptr2 = img12_mul_sum + temp;
-
-        for (x = 0; x < width; x++)
-        {
-            img1_sum_ptr2[x]      = img1_sum_ptr1[x] + img1[x];
-            img2_sum_ptr2[x]      = img2_sum_ptr1[x] + img2[x];
-            img1_sq_sum_ptr2[x]   = img1_sq_sum_ptr1[x] + img1[x] * img1[x];
-            img2_sq_sum_ptr2[x]   = img2_sq_sum_ptr1[x] + img2[x] * img2[x];
-            img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x];
-        }
-
-        if (y > 6)
-        {
-            //calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum
-            temp = (y + 1) % 9 * width;
-
-            img1_sum_ptr1      = img1_sum + temp;
-            img2_sum_ptr1      = img2_sum + temp;
-            img1_sq_sum_ptr1   = img1_sq_sum + temp;
-            img2_sq_sum_ptr1   = img2_sq_sum + temp;
-            img12_mul_sum_ptr1 = img12_mul_sum + temp;
-
-            for (x = 0; x < width; x++)
-            {
-                img1_sum_ptr1[x]      = img1_sum_ptr2[x] - img1_sum_ptr1[x];
-                img2_sum_ptr1[x]      = img2_sum_ptr2[x] - img2_sum_ptr1[x];
-                img1_sq_sum_ptr1[x]   = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x];
-                img2_sq_sum_ptr1[x]   = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x];
-                img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x];
-            }
-
-            //here we calculate the sum over the 8x8 block of pixels
-            //this is done by sliding a window across the column sums for the last 8 lines
-            //each time adding the new column sum, and subtracting the one which fell out of the window
-            img1_block      = 0;
-            img2_block      = 0;
-            img1_sq_block   = 0;
-            img2_sq_block   = 0;
-            img12_mul_block = 0;
-
-            //prologue, and calculation of simularity measure from the first 8 column sums
-            for (x = 0; x < 8; x++)
-            {
-                img1_block      += img1_sum_ptr1[x];
-                img2_block      += img2_sum_ptr1[x];
-                img1_sq_block   += img1_sq_sum_ptr1[x];
-                img2_sq_block   += img2_sq_sum_ptr1[x];
-                img12_mul_block += img12_mul_sum_ptr1[x];
-            }
-
-            if (lumimask)
-            {
-                y2 = y - 7;
-                x2 = 0;
-
-                if (luminance)
-                {
-                    mean = (img2_block + img1_block) / 128.0f;
-
-                    if (!(y2 % 2 || x2 % 2))
-                        *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
-                }
-                else
-                {
-                    mean = *(img12_sum_block + y2 * width_uv + x2);
-                    mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
-                    mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
-                    mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
-
-                    mean /= 512.0f;
-                }
-
-                weight = mean < 40 ? 0.0f :
-                         (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
-                plane_summed_weights += weight;
-
-                plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
-            }
-            else
-                plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
-
-            //and for the rest
-            for (x = 8; x < width; x++)
-            {
-                img1_block      = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8];
-                img2_block      = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8];
-                img1_sq_block   = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8];
-                img2_sq_block   = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8];
-                img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8];
-
-                if (lumimask)
-                {
-                    y2 = y - 7;
-                    x2 = x - 7;
-
-                    if (luminance)
-                    {
-                        mean = (img2_block + img1_block) / 128.0f;
-
-                        if (!(y2 % 2 || x2 % 2))
-                            *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
-                    }
-                    else
-                    {
-                        mean = *(img12_sum_block + y2 * width_uv + x2);
-                        mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
-                        mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
-                        mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
-
-                        mean /= 512.0f;
-                    }
-
-                    weight = mean < 40 ? 0.0f :
-                             (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
-                    plane_summed_weights += weight;
-
-                    plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
-                }
-                else
-                    plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
-            }
-        }
-    }
-
-    if (plane_summed_weights == 0)
-        return 1.0f;
-    else
-        return plane_quality / plane_summed_weights;
-}
-
-double vp8_calc_ssim
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    int lumamask,
-    double *weight
-)
-{
-    double a, b, c;
-    double frame_weight;
-    double ssimv;
-
-    width_y = source->y_width;
-    height_y = source->y_height;
-    height_uv = source->uv_height;
-    width_uv = source->uv_width;
-    stride_uv = dest->uv_stride;
-    stride = dest->y_stride;
-
-    lumimask = lumamask;
-
-    luminance = 1;
-    a = vp8_ssim(source->y_buffer, dest->y_buffer,
-                 source->y_stride, dest->y_stride, source->y_width, source->y_height);
-    luminance = 0;
-
-    frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7));
-
-    if (frame_weight == 0)
-        a = b = c = 1.0f;
-    else
-    {
-        b = vp8_ssim(source->u_buffer, dest->u_buffer,
-                     source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
-
-        c = vp8_ssim(source->v_buffer, dest->v_buffer,
-                     source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
-    }
-
-    ssimv = a * .8 + .1 * (b + c);
-
-    *weight = frame_weight;
-
-    return ssimv;
-}
-
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x)  (x)   // runtime detection on: pass the argument through unchanged
+#else
+#define IF_RTCD(x)  NULL  // runtime detection off: the argument is discarded
+#endif
 // Google version of SSIM
 // SSIM
 #define KERNEL 3
@@ -520,3 +235,174 @@ double vp8_calc_ssimg
     *ssim_v /= uvsize;
     return ssim_all;
 }
+
+
+// Adds one 16x16 block's pixel sums, squared sums and s*r cross sum to *sum_*.
+void ssim_parms_c
+(
+    unsigned char *s,
+    int sp,
+    unsigned char *r,
+    int rp,
+    unsigned long *sum_s,
+    unsigned long *sum_r,
+    unsigned long *sum_sq_s,
+    unsigned long *sum_sq_r,
+    unsigned long *sum_sxr
+)
+{
+    int row, col;
+    for (row = 0; row < 16; row++, s += sp, r += rp)
+        for (col = 0; col < 16; col++)
+        {
+            const int src = s[col], ref = r[col];
+            *sum_s += src;
+            *sum_r += ref;
+            *sum_sq_s += src * src;
+            *sum_sq_r += ref * ref;
+            *sum_sxr += src * ref;
+        }
+}
+// Adds one 8x8 block's pixel sums, squared sums and s*r cross sum to *sum_*.
+void ssim_parms_8x8_c
+(
+    unsigned char *s,
+    int sp,
+    unsigned char *r,
+    int rp,
+    unsigned long *sum_s,
+    unsigned long *sum_r,
+    unsigned long *sum_sq_s,
+    unsigned long *sum_sq_r,
+    unsigned long *sum_sxr
+)
+{
+    int row, col;
+    for (row = 0; row < 8; row++, s += sp, r += rp)
+        for (col = 0; col < 8; col++)
+        {
+            const int src = s[col], ref = r[col];
+            *sum_s += src;
+            *sum_r += ref;
+            *sum_sq_s += src * src;
+            *sum_sq_r += ref * ref;
+            *sum_sxr += src * ref;
+        }
+}
+
+// SSIM constants (K1*L)^2, (K2*L)^2 for L=255, scaled for a 256-pixel window.
+const static long long c1 =  426148; // (256^2*(.01*255)^2
+const static long long c2 = 3835331; //(256^2*(.03*255)^2
+
+// Returns the SSIM index of one window from its raw sums; count is the
+// number of pixels summed. Every term carries a count^2 scale, so c1/c2 are
+// rescaled from their 256-pixel form before use.
+static double similarity(unsigned long sum_s, unsigned long sum_r,
+                         unsigned long sum_sq_s, unsigned long sum_sq_r,
+                         unsigned long sum_sxr, int count)
+{
+    // Widen first: unsigned long may be 32 bits and the products exceed that.
+    const long long n = count, s = sum_s, r = sum_r;
+    const long long k1 = (c1 * n * n) >> 16, k2 = (c2 * n * n) >> 16;
+    const long long ssim_n = (2 * s * r + k1) *
+                             (2 * n * (long long)sum_sxr - 2 * s * r + k2);
+    const long long ssim_d = (s * s + r * r + k1) *
+        (n * (long long)sum_sq_s - s * s + n * (long long)sum_sq_r - r * r + k2);
+    return ssim_n * 1.0 / ssim_d;
+}
+
+static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp,
+            const vp8_variance_rtcd_vtable_t *rtcd)  // SSIM of one 16x16 window
+{
+    unsigned long tot_s = 0, tot_r = 0, tot_ss = 0, tot_rr = 0, tot_sr = 0;
+    rtcd->ssimpf(s, sp, r, rp, &tot_s, &tot_r, &tot_ss, &tot_rr, &tot_sr);
+    return similarity(tot_s, tot_r, tot_ss, tot_rr, tot_sr, 256);
+}
+static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,
+                const vp8_variance_rtcd_vtable_t *rtcd)  // SSIM of one 8x8 window
+{
+    unsigned long tot_s = 0, tot_r = 0, tot_ss = 0, tot_rr = 0, tot_sr = 0;
+    rtcd->ssimpf_8x8(s, sp, r, rp, &tot_s, &tot_r, &tot_ss, &tot_rr, &tot_sr);
+    return similarity(tot_s, tot_r, tot_ss, tot_rr, tot_sr, 64);
+}
+
+// TODO: (jbb) tried to scale this function such that we may be able to use it
+// for distortion metric in mode selection code ( provided we do a reconstruction)
+// Returns 256*d*d with d = 256*(1 - ssim) for one 16x16 block (0 if identical).
+long dssim(unsigned char *s,int sp, unsigned char *r,int rp,
+           const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+    long long ls, lr, ssim_n, ssim_d;
+    double ssim3;
+    rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    // Widen before multiplying: unsigned long may be only 32 bits wide.
+    ls = sum_s; lr = sum_r;
+    ssim_n = (2*ls*lr + c1)*(2*256*(long long)sum_sxr - 2*ls*lr + c2);
+    ssim_d = (ls*ls + lr*lr + c1)*
+            (256*(long long)sum_sq_s - ls*ls + 256*(long long)sum_sq_r - lr*lr + c2);
+    // Divide in floating point: 256*(d-n) can overflow and would truncate.
+    ssim3 = 256.0 * ((double)ssim_d - (double)ssim_n) / (double)ssim_d;
+    return (long)( 256*ssim3 * ssim3 );
+}
+// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels
+// such that the window regions overlap block boundaries to penalize blocking
+// artifacts.
+//
+// Mean SSIM of one plane over non-overlapping 8x8 windows. Only windows that
+// lie fully inside width x height are scored; returns 1.0 when none fits.
+double vp8_ssim2
+(
+    unsigned char *img1,
+    unsigned char *img2,
+    int stride_img1,
+    int stride_img2,
+    int width,
+    int height,
+    const vp8_variance_rtcd_vtable_t *rtcd
+)
+{
+    int i, j, samples = 0;
+    double ssim_total=0;
+
+    // we can sample points as frequently as we like start with 1 per 8x8
+    for(i=0; i + 8 <= height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8)
+    {
+        for(j=0; j + 8 <= width; j+=8, samples++)
+        {
+            // step along the row: each window starts j pixels in
+            ssim_total += ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2, rtcd);
+        }
+    }
+    return samples ? ssim_total / samples : 1.0;
+}
+// Frame SSIM as a weighted blend of the plane scores: 0.8 luma + 0.1 per
+// chroma plane. lumamask is accepted but not consulted, and *weight is
+// always set to 1.
+double vp8_calc_ssim
+(
+    YV12_BUFFER_CONFIG *source,
+    YV12_BUFFER_CONFIG *dest,
+    int lumamask,
+    double *weight,
+    const vp8_variance_rtcd_vtable_t *rtcd
+)
+{
+    const double ssim_y = vp8_ssim2(source->y_buffer, dest->y_buffer,
+                                    source->y_stride, dest->y_stride,
+                                    source->y_width, source->y_height, rtcd);
+
+    const double ssim_u = vp8_ssim2(source->u_buffer, dest->u_buffer,
+                                    source->uv_stride, dest->uv_stride,
+                                    source->uv_width, source->uv_height, rtcd);
+
+    const double ssim_v = vp8_ssim2(source->v_buffer, dest->v_buffer,
+                                    source->uv_stride, dest->uv_stride,
+                                    source->uv_width, source->uv_height, rtcd);
+
+    // every frame is reported with full weight
+    *weight = 1;
+
+    // luma dominates; the two chroma planes share the remainder
+    return ssim_y * .8 + .1 * (ssim_u + ssim_v);
+}
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 0f8e654a0..fd36b22eb 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -287,8 +287,7 @@ static void vp8_temporal_filter_iterate_c
     int byte;
     int frame;
     int mb_col, mb_row;
-    unsigned int filter_weight[MAX_LAG_BUFFERS];
-    unsigned char *mm_ptr = cpi->fp_motion_map;
+    unsigned int filter_weight;
     int mb_cols = cpi->common.mb_cols;
     int mb_rows = cpi->common.mb_rows;
     int MBs  = cpi->common.MBs;
@@ -306,13 +305,6 @@ static void vp8_temporal_filter_iterate_c
     unsigned char *u_buffer = mbd->pre.u_buffer;
     unsigned char *v_buffer = mbd->pre.v_buffer;
 
-    if (!cpi->use_weighted_temporal_filter)
-    {
-        // Temporal filtering is unweighted
-        for (frame = 0; frame < frame_count; frame++)
-            filter_weight[frame] = 1;
-    }
-
     for (mb_row = 0; mb_row < mb_rows; mb_row++)
     {
 #if ALT_REF_MC_ENABLED
@@ -338,34 +330,9 @@ static void vp8_temporal_filter_iterate_c
                                     + (VP8BORDERINPIXELS - 19);
 #endif
 
-            // Read & process macroblock weights from motion map
-            if (cpi->use_weighted_temporal_filter)
-            {
-                weight_cap = 2;
-
-                for (frame = alt_ref_index-1; frame >= 0; frame--)
-                {
-                    w = *(mm_ptr + (frame+1)*MBs);
-                    filter_weight[frame] = w < weight_cap ? w : weight_cap;
-                    weight_cap = w;
-                }
-
-                filter_weight[alt_ref_index] = 2;
-
-                weight_cap = 2;
-
-                for (frame = alt_ref_index+1; frame < frame_count; frame++)
-                {
-                    w = *(mm_ptr + frame*MBs);
-                    filter_weight[frame] = w < weight_cap ? w : weight_cap;
-                    weight_cap = w;
-                }
-
-            }
-
             for (frame = 0; frame < frame_count; frame++)
             {
-                int err;
+                int err = 0;
 
                 if (cpi->frames[frame] == NULL)
                     continue;
@@ -374,28 +341,25 @@ static void vp8_temporal_filter_iterate_c
                 mbd->block[0].bmi.mv.as_mv.col = 0;
 
 #if ALT_REF_MC_ENABLED
-                //if (filter_weight[frame] == 0)
-                {
 #define THRESH_LOW   10000
 #define THRESH_HIGH  20000
 
-                    // Correlation has been lost try MC
-                    err = vp8_temporal_filter_find_matching_mb_c
-                        (cpi,
-                         cpi->frames[alt_ref_index],
-                         cpi->frames[frame],
-                         mb_y_offset,
-                         THRESH_LOW);
-
-                    if (filter_weight[frame] < 2)
-                    {
-                        // Set weight depending on error
-                        filter_weight[frame] = err<THRESH_LOW
-                                                ? 2 : err<THRESH_HIGH ? 1 : 0;
-                    }
-                }
+                // Find best match in this frame by MC
+                err = vp8_temporal_filter_find_matching_mb_c
+                      (cpi,
+                       cpi->frames[alt_ref_index],
+                       cpi->frames[frame],
+                       mb_y_offset,
+                       THRESH_LOW);
+
 #endif
-                if (filter_weight[frame] != 0)
+                // Assign higher weight to matching MB if its error
+                // score is lower. If not applying MC, the default behavior
+                // is to weight all MBs equally.
+                filter_weight = err<THRESH_LOW
+                                  ? 2 : err<THRESH_HIGH ? 1 : 0;
+
+                if (filter_weight != 0)
                 {
                     // Construct the predictors
                     vp8_temporal_filter_predictors_mb_c
@@ -415,7 +379,7 @@ static void vp8_temporal_filter_iterate_c
                          predictor,
                          16,
                          strength,
-                         filter_weight[frame],
+                         filter_weight,
                          accumulator,
                          count);
 
@@ -425,7 +389,7 @@ static void vp8_temporal_filter_iterate_c
                          predictor + 256,
                          8,
                          strength,
-                         filter_weight[frame],
+                         filter_weight,
                          accumulator + 256,
                          count + 256);
 
@@ -435,7 +399,7 @@ static void vp8_temporal_filter_iterate_c
                          predictor + 320,
                          8,
                          strength,
-                         filter_weight[frame],
+                         filter_weight,
                          accumulator + 320,
                          count + 320);
                 }
@@ -491,7 +455,6 @@ static void vp8_temporal_filter_iterate_c
                 byte += stride - 8;
             }
 
-            mm_ptr++;
             mb_y_offset += 16;
             mb_uv_offset += 8;
         }
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 5befd3b86..bf17ea8b6 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -85,6 +85,19 @@
       unsigned int *sse \
     );
 
+#define prototype_ssimpf(sym) /* SSIM sum gatherer; the expansion ends in ';' */ \
+    void (sym) \
+      ( \
+        unsigned char *s,        /* source block */ \
+        int sp,                  /* source row stride */ \
+        unsigned char *r,        /* reference block */ \
+        int rp,                  /* reference row stride */ \
+        unsigned long *sum_s,    /* out: sum of s (callers pass it zeroed) */ \
+        unsigned long *sum_r,    /* out: sum of r */ \
+        unsigned long *sum_sq_s, /* out: sum of s*s */ \
+        unsigned long *sum_sq_r, /* out: sum of r*r */ \
+        unsigned long *sum_sxr   /* out: sum of s*r */ \
+      );
 
 #define prototype_getmbss(sym) unsigned int (sym)(const short *)
 
@@ -306,6 +319,15 @@ extern prototype_variance2(vp8_variance_get16x16var);
 #endif
 extern prototype_sad(vp8_variance_get4x4sse_cs);
 
+#ifndef vp8_ssimpf
+#define vp8_ssimpf ssim_parms_c  // C 16x16 version unless already defined
+#endif
+extern prototype_ssimpf(vp8_ssimpf)  // no ';' needed: the macro supplies it
+
+#ifndef vp8_ssimpf_8x8
+#define vp8_ssimpf_8x8 ssim_parms_8x8_c  // C 8x8 version unless already defined
+#endif
+extern prototype_ssimpf(vp8_ssimpf_8x8)  // no ';' needed: the macro supplies it
 
 typedef prototype_sad(*vp8_sad_fn_t);
 typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
@@ -315,6 +337,10 @@ typedef prototype_variance(*vp8_variance_fn_t);
 typedef prototype_variance2(*vp8_variance2_fn_t);
 typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t);
 typedef prototype_getmbss(*vp8_getmbss_fn_t);
+
+typedef prototype_ssimpf(*vp8_ssimpf_fn_t)  // pointer to an SSIM sum gatherer; macro supplies the ';'
+
+
 typedef struct
 {
     vp8_sad_fn_t             sad4x4;
@@ -365,6 +391,11 @@ typedef struct
     vp8_sad_multi_d_fn_t     sad8x8x4d;
     vp8_sad_multi_d_fn_t     sad4x4x4d;
 
+#if CONFIG_PSNR
+    vp8_ssimpf_fn_t          ssimpf_8x8;
+    vp8_ssimpf_fn_t          ssimpf;
+#endif
+
 } vp8_variance_rtcd_vtable_t;
 
 typedef struct
@@ -378,6 +409,7 @@ typedef struct
     vp8_sad_multi_fn_t      sdx3f;
     vp8_sad_multi1_fn_t     sdx8f;
     vp8_sad_multi_d_fn_t    sdx4df;
+
 } vp8_variance_fn_ptr_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm
index 21e2e5007..03ecec4b3 100644
--- a/vp8/encoder/x86/sad_sse4.asm
+++ b/vp8/encoder/x86/sad_sse4.asm
@@ -186,7 +186,7 @@ sym(vp8_sad16x16x8_sse4):
         PROCESS_16X2X8 0
 
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi
@@ -224,7 +224,7 @@ sym(vp8_sad16x8x8_sse4):
         PROCESS_16X2X8 0
 
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi
@@ -262,7 +262,7 @@ sym(vp8_sad8x8x8_sse4):
         PROCESS_8X2X8 0
 
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi
@@ -303,7 +303,7 @@ sym(vp8_sad8x16x8_sse4):
         PROCESS_8X2X8 0
         PROCESS_8X2X8 0
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi
@@ -339,7 +339,7 @@ sym(vp8_sad4x4x8_sse4):
         PROCESS_4X2X8 0
 
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi
diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
new file mode 100644
index 000000000..c267cdb54
--- /dev/null
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -0,0 +1,215 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; tabulate_ssim - adds the words in xmm3 (s) / xmm4 (r) to sum_s,sum_r,sum_sq_s,sum_sq_r,sum_sxr; clobbers xmm1-xmm3
+%macro TABULATE_SSIM 0
+        paddusw         xmm15, xmm3  ; sum_s (saturating add, 16-bit lanes)
+        paddusw         xmm14, xmm4  ; sum_r (saturating add, 16-bit lanes)
+        movdqa          xmm1, xmm3   ; copy s so it can be squared
+        pmaddwd         xmm1, xmm1   ; s*s, adjacent products summed into dwords
+        paddq           xmm13, xmm1 ; sum_sq_s
+        movdqa          xmm2, xmm4   ; copy r so it can be squared
+        pmaddwd         xmm2, xmm2   ; r*r, adjacent products summed into dwords
+        paddq           xmm12, xmm2 ; sum_sq_r
+        pmaddwd         xmm3, xmm4   ; s*r, adjacent products summed into dwords (overwrites xmm3)
+        paddq           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum the four dwords of %1 into its low qword (zero-extends via xmm0, which callers clear; clobbers xmm2)
+%macro SUM_ACROSS_Q 1
+        movdqa          xmm2,%1
+        punpckldq       %1,xmm0      ; low two dwords -> two qwords
+        punpckhdq       xmm2,xmm0    ; high two dwords -> two qwords
+        paddq           %1,xmm2      ; two partial sums, one per qword
+        movdqa          xmm2,%1
+        punpcklqdq      %1,xmm0      ; keep low partial sum, clear high qword
+        punpckhqdq      xmm2,xmm0    ; move high partial sum down to the low qword
+        paddq           %1,xmm2      ; total in the low qword of %1
+%endmacro
+
+; Sum the eight words of %1 into its low qword (zero-extends via xmm0, which callers clear; clobbers xmm1, xmm2)
+%macro SUM_ACROSS_W 1
+        movdqa          xmm1, %1
+        punpcklwd       %1,xmm0      ; low four words -> four dwords
+        punpckhwd       xmm1,xmm0    ; high four words -> four dwords
+        paddd           %1, xmm1     ; four dword partial sums
+        SUM_ACROSS_Q    %1
+%endmacro
+;void vp8_ssim_parms_16x16_sse3(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb. At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp8_ssim_parms_16x16_sse3)
+sym(vp8_ssim_parms_16x16_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    push        rsi
+    push        rdi
+    ; end prolog  NOTE(review): xmm6 and xmm11-xmm15 are used below without being saved; they are callee-saved in the Win64 ABI -- confirm
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0   ;zero, used to widen bytes/words below
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 16      ;row counter
+NextRow:
+
+    ;grab source and reference pixels (16 bytes of each per row)
+    movdqu          xmm5, [rsi]
+    movdqu          xmm6, [rdi]
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpckhbw       xmm3, xmm0 ; high_s
+    punpckhbw       xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movq            [rdi], xmm15; *sum_s is overwritten with a 64-bit store (NOTE(review): assumes 8-byte unsigned long -- confirm)
+    mov             rdi,arg(5)
+    movq            [rdi], xmm14; *sum_r
+    mov             rdi,arg(6)
+    movq            [rdi], xmm13; *sum_sq_s
+    mov             rdi,arg(7)
+    movq            [rdi], xmm12; *sum_sq_r
+    mov             rdi,arg(8)
+    movq            [rdi], xmm11; *sum_sxr
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_ssim_parms_8x8_sse3(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb. At this point this is just meant to be first pass for calculating
+; all the parms needed for 8x8 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp8_ssim_parms_8x8_sse3)
+sym(vp8_ssim_parms_8x8_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    push        rsi
+    push        rdi
+    ; end prolog  NOTE(review): xmm6 and xmm11-xmm15 are used below without being saved; they are callee-saved in the Win64 ABI -- confirm
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0   ;zero, used to widen bytes/words below
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 8      ;row counter
+NextRow2:
+
+    ;grab source and reference pixels (8 bytes of each per row)
+    movq            xmm5, [rsi]
+    movq            xmm6, [rdi]
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz NextRow2
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movq            [rdi], xmm15; *sum_s is overwritten with a 64-bit store (NOTE(review): assumes 8-byte unsigned long -- confirm)
+    mov             rdi,arg(5)
+    movq            [rdi], xmm14; *sum_r
+    mov             rdi,arg(6)
+    movq            [rdi], xmm13; *sum_sq_s
+    mov             rdi,arg(7)
+    movq            [rdi], xmm12; *sum_sq_r
+    mov             rdi,arg(8)
+    movq            [rdi], xmm11; *sum_sxr
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 6cdc47bc9..c2c30deb2 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -627,6 +627,10 @@ filter_block2d_bil_var_sse2_loop:
 
 filter_block2d_bil_var_sse2_sp_only:
         movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
+        je              filter_block2d_bil_var_sse2_full_pixel
+
         shl             rdx,            5
         lea             rdx,            [rdx + rcx]          ; VFilter
 
@@ -671,6 +675,35 @@ filter_block2d_bil_sp_only_loop:
 
         jmp             filter_block2d_bil_variance
 
+filter_block2d_bil_var_sse2_full_pixel:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        pxor            xmm0,           xmm0                 ;
+
+filter_block2d_bil_full_pixel_loop:
+        movq            xmm1,           QWORD PTR [rsi]               ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movq            xmm2,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm2,           xmm0                 ;
+
+        psubw           xmm1,           xmm2                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_full_pixel_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
 filter_block2d_bil_var_sse2_fp_only:
         mov             rsi,            arg(0)               ;ref_ptr
         mov             rdi,            arg(2)               ;src_ptr
@@ -757,7 +790,7 @@ filter_block2d_bil_variance:
     ret
 
 
-;void vp8_half_horiz_vert_variance16x_h_sse2
+;void vp8_half_horiz_vert_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -767,8 +800,8 @@ filter_block2d_bil_variance:
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2)
-sym(vp8_half_horiz_vert_variance16x_h_sse2):
+global sym(vp8_half_horiz_vert_variance8x_h_sse2)
+sym(vp8_half_horiz_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -802,7 +835,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2):
         add             rsi, r8
 %endif
 
-vp8_half_horiz_vert_variance16x_h_1:
+vp8_half_horiz_vert_variance8x_h_1:
 
         movq            xmm1,           QWORD PTR [rsi]     ;
         movq            xmm2,           QWORD PTR [rsi+1]   ;
@@ -830,7 +863,7 @@ vp8_half_horiz_vert_variance16x_h_1:
 %endif
 
         sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_vert_variance16x_h_1     ;
+        jnz             vp8_half_horiz_vert_variance8x_h_1     ;
 
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
@@ -877,8 +910,123 @@ vp8_half_horiz_vert_variance16x_h_1:
     pop         rbp
     ret
 
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,          reference rows; 17 bytes per row and Height+1 rows are read
+;    int ref_pixels_per_line,         byte stride between ref rows
+;    unsigned char *src_ptr,          source rows, 16 bytes each
+;    int src_pixels_per_line,         byte stride between src rows
+;    unsigned int Height,             number of rows; must be nonzero (counter is decremented before the test)
+;    int *sum,                        out: sum of (averaged ref - src)
+;    unsigned int *sumsquared         out: sum of (averaged ref - src)^2
+;)  Each ref pixel is byte-averaged with its right neighbour, then with the same average of the row below, before src is subtracted.
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
 
-;void vp8_half_vert_variance16x_h_sse2
+        pxor            xmm6,           xmm6                ;  difference (error) accumulator, 8 words
+        pxor            xmm7,           xmm7                ;  squared-error accumulator, 4 dwords
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;  xmm0 = 0, zero source for byte->word unpacking
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  ref row 0, bytes 0..15
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  ref row 0, bytes 1..16
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+        lea             rsi,            [rsi + rax]         ;  advance to ref row 1
+
+vp8_half_horiz_vert_variance16x_h_1:                        ;  on entry xmm5 = horizontal average of the previous ref row
+        movdqu          xmm1,           XMMWORD PTR [rsi]     ;  next ref row, bytes 0..15
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;  next ref row, bytes 1..16
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the two horizontal averages
+
+        movdqa          xmm4,           xmm5                ;  copy so the high 8 bytes can be widened separately
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = pixels 0..7 as words
+        punpckhbw       xmm4,           xmm0                ;  xmm4 = pixels 8..15 as words
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+
+        movq            xmm3,           QWORD PTR [rdi+8]   ;  xmm3 = d8,d9..d15
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        psubw           xmm4,           xmm3                ;  xmm4 -= xmm3
+
+        paddw           xmm6,           xmm5                ;  xmm6 += differences of columns 0..7
+        paddw           xmm6,           xmm4                ;  xmm6 += differences of columns 8..15
+        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs -> 4 dwords
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             vp8_half_horiz_vert_variance16x_h_1     ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6     ; xmm0 is still 0: the low 4 word sums land in the upper halves of 4 dwords
+        punpckhwd   xmm1,           xmm6     ; same for the high 4 word sums
+        psrad       xmm0,           16       ; arithmetic shift sign-extends each word sum to a dword
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7     ; xmm6 = {sq0+sq2, 0, sq1+sq3, 0}
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1     ; same folding for the difference sums
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7     ; low dword of xmm6 = total of squared differences
+        paddd       xmm0,           xmm1     ; low dword of xmm0 = total of differences
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_half_vert_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -888,8 +1036,8 @@ vp8_half_horiz_vert_variance16x_h_1:
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_vert_variance16x_h_sse2)
-sym(vp8_half_vert_variance16x_h_sse2):
+global sym(vp8_half_vert_variance8x_h_sse2)
+sym(vp8_half_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -912,7 +1060,7 @@ sym(vp8_half_vert_variance16x_h_sse2):
         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
 
         pxor            xmm0,           xmm0                ;
-vp8_half_vert_variance16x_h_1:
+vp8_half_vert_variance8x_h_1:
         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
         movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s1,s2,s3..s9
 
@@ -936,7 +1084,7 @@ vp8_half_vert_variance16x_h_1:
 %endif
 
         sub             rcx,            1                   ;
-        jnz             vp8_half_vert_variance16x_h_1          ;
+        jnz             vp8_half_vert_variance8x_h_1          ;
 
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
@@ -983,8 +1131,115 @@ vp8_half_vert_variance16x_h_1:
     pop         rbp
     ret
 
+;void vp8_half_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,          reference rows; 16 bytes per row and Height+1 rows are read
+;    int ref_pixels_per_line,         byte stride between ref rows
+;    unsigned char *src_ptr,          source rows, 16 bytes each
+;    int src_pixels_per_line,         byte stride between src rows
+;    unsigned int Height,             number of rows; must be nonzero (counter is decremented before the test)
+;    int *sum,                        out: sum of (averaged ref - src)
+;    unsigned int *sumsquared         out: sum of (averaged ref - src)^2
+;)  Each ref pixel is byte-averaged with the pixel directly below it before src is subtracted.
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  difference (error) accumulator, 8 words
+        pxor            xmm7,           xmm7                ;  squared-error accumulator, 4 dwords
+        mov             rsi,            arg(0)              ;ref_ptr
+
+        mov             rdi,            arg(2)              ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)    ;Height
+        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]   ;  ref row 0
+        lea             rsi,            [rsi + rax          ]   ;  advance to ref row 1
+        pxor            xmm0,           xmm0                ;  xmm0 = 0, zero source for byte->word unpacking
+
+vp8_half_vert_variance16x_h_1:                              ;  on entry xmm5 = previous ref row
+        movdqu          xmm3,           XMMWORD PTR [rsi]   ;  next ref row
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3), previous row with next row
+        movdqa          xmm4,           xmm5                ;  copy so the high 8 bytes can be widened separately
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = pixels 0..7 as words
+        punpckhbw       xmm4,           xmm0                ;  xmm4 = pixels 8..15 as words
+
+        movq            xmm2,           QWORD PTR [rdi]     ;  src bytes 0..7
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm5,           xmm2                ;  xmm5 -= src 0..7
+        movq            xmm2,           QWORD PTR [rdi+8]   ;  src bytes 8..15
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm4,           xmm2                ;  xmm4 -= src 8..15
+
+        paddw           xmm6,           xmm5                ;  xmm6 += differences of columns 0..7
+        paddw           xmm6,           xmm4                ;  xmm6 += differences of columns 8..15
+        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs -> 4 dwords
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm3                ;  this row becomes the previous row of the next iteration
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1
+        jnz             vp8_half_vert_variance16x_h_1
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6     ; xmm0 is still 0: the low 4 word sums land in the upper halves of 4 dwords
+        punpckhwd   xmm1,           xmm6     ; same for the high 4 word sums
+        psrad       xmm0,           16       ; arithmetic shift sign-extends each word sum to a dword
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7     ; xmm6 = {sq0+sq2, 0, sq1+sq3, 0}
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1     ; same folding for the difference sums
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7     ; low dword of xmm6 = total of squared differences
+        paddd       xmm0,           xmm1     ; low dword of xmm0 = total of differences
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
 
-;void vp8_half_horiz_variance16x_h_sse2
+;void vp8_half_horiz_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -994,8 +1249,8 @@ vp8_half_vert_variance16x_h_1:
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_variance16x_h_sse2)
-sym(vp8_half_horiz_variance16x_h_sse2):
+global sym(vp8_half_horiz_variance8x_h_sse2)
+sym(vp8_half_horiz_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -1017,7 +1272,7 @@ sym(vp8_half_horiz_variance16x_h_sse2):
         movsxd          rcx,            dword ptr arg(4) ;Height              ;
 
         pxor            xmm0,           xmm0                ;
-vp8_half_horiz_variance16x16_1:
+vp8_half_horiz_variance8x_h_1:
         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
         movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
 
@@ -1040,7 +1295,7 @@ vp8_half_horiz_variance16x16_1:
         add             rdi, r9
 %endif
         sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_variance16x16_1        ;
+        jnz             vp8_half_horiz_variance8x_h_1        ;
 
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
@@ -1087,6 +1342,109 @@ vp8_half_horiz_variance16x16_1:
     pop         rbp
     ret
 
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,          reference rows; 17 bytes per row are read
+;    int ref_pixels_per_line,         byte stride between ref rows
+;    unsigned char *src_ptr,          source rows, 16 bytes each
+;    int src_pixels_per_line,         byte stride between src rows
+;    unsigned int Height,             number of rows; must be nonzero (counter is decremented before the test)
+;    int *sum,                        out: sum of (averaged ref - src)
+;    unsigned int *sumsquared         out: sum of (averaged ref - src)^2
+;)  Each ref pixel is byte-averaged with its right neighbour before src is subtracted.
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  difference (error) accumulator, 8 words
+        pxor            xmm7,           xmm7                ;  squared-error accumulator, 4 dwords
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;  xmm0 = 0, zero source for byte->word unpacking
+
+vp8_half_horiz_variance16x_h_1:
+        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        movdqa          xmm1,           xmm5                ;  copy so the high 8 bytes can be widened separately
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = pixels 0..7 as words
+        punpckhbw       xmm1,           xmm0                ;  xmm1 = pixels 8..15 as words
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9..d15
+        punpcklbw       xmm2,           xmm0                ;  xmm2 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        psubw           xmm1,           xmm2                ;  xmm1 -= xmm2
+        paddw           xmm6,           xmm5                ;  xmm6 += differences of columns 0..7
+        paddw           xmm6,           xmm1                ;  xmm6 += differences of columns 8..15
+        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs -> 4 dwords
+        pmaddwd         xmm1,           xmm1
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm1
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             vp8_half_horiz_variance16x_h_1        ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6     ; xmm0 is still 0: the low 4 word sums land in the upper halves of 4 dwords
+        punpckhwd   xmm1,           xmm6     ; same for the high 4 word sums
+        psrad       xmm0,           16       ; arithmetic shift sign-extends each word sum to a dword
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7     ; xmm6 = {sq0+sq2, 0, sq1+sq3, 0}
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1     ; same folding for the difference sums
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7     ; low dword of xmm6 = total of squared differences
+        paddd       xmm0,           xmm1     ; low dword of xmm0 = total of differences
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
 SECTION_RODATA
 ;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
new file mode 100644
index 000000000..b1976328d
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -0,0 +1,348 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift            7
+
+
+;void vp8_filter_block2d_bil_var_ssse3
+;(
+;    unsigned char *ref_ptr,          rows to be filtered; 16 bytes per row (17 when xoffset != 0), one extra row when yoffset != 0
+;    int ref_pixels_per_line,         byte stride between ref rows
+;    unsigned char *src_ptr,          16 bytes per row, subtracted from the filtered ref row
+;    int src_pixels_per_line,         byte stride between src rows
+;    unsigned int Height,             number of rows; must be nonzero (counters are decremented before the test)
+;    int  xoffset,                    index into vp8_bilinear_filters_ssse3 (8 entries); 0 skips the horizontal pass
+;    int  yoffset,                    index into vp8_bilinear_filters_ssse3 (8 entries); 0 skips the vertical pass
+;    int *sum,                        out: sum of (filtered ref - src)
+;    unsigned int *sumsquared         out: sum of (filtered ref - src)^2
+;
+;)
+;Note: The filter coefficient at offset=0 is 128. Since the second operand
+;of pmaddubsw is read as signed bytes, the zero offset must be handled separately.
+global sym(vp8_filter_block2d_bil_var_ssse3)
+sym(vp8_filter_block2d_bil_var_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    push rbx                                                 ; rbx is used as a scratch stride register below
+    ; end prolog
+
+        pxor            xmm6,           xmm6                 ; difference accumulator, 8 words
+        pxor            xmm7,           xmm7                 ; squared-difference accumulator, 4 dwords
+
+        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]   ; rcx = filter table base
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
+
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              filter_block2d_bil_var_ssse3_sp_only
+
+        shl             rax,            4                    ; point to filter coeff with xoffset (16 bytes per entry)
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              filter_block2d_bil_var_ssse3_fp_only
+
+        shl             rdx,            4                    ; 16 bytes per entry
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+
+        movdqu          xmm0,           XMMWORD PTR [rsi]        ; ref row 0, bytes 0..15
+        movdqu          xmm1,           XMMWORD PTR [rsi+1]      ; ref row 0, bytes 1..16
+        movdqa          xmm2,           xmm0
+
+        punpcklbw       xmm0,           xmm1                 ; interleave (p[i], p[i+1]) for pixels 0..7
+        punpckhbw       xmm2,           xmm1                 ; ... and for pixels 8..15
+        pmaddubsw       xmm0,           [rax]                ; p[i]*h0 + p[i+1]*h1 as words
+        pmaddubsw       xmm2,           [rax]
+
+        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]  ; + 64 for rounding
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm0,           xmm_filter_shift     ; >> 7
+        psraw           xmm2,           xmm_filter_shift
+
+        packuswb        xmm0,           xmm2                 ; xmm0 = horizontally filtered row 0 as 16 bytes
+
+        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line -- NOTE(review): rbx is the register given to GET_GOT; verify GLOBAL() in the loop does not need it on PIC builds
+        lea             rsi,            [rsi + rbx]
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_ssse3_loop:                           ; on entry xmm0 = previous horizontally filtered row (bytes)
+        movdqu          xmm1,           XMMWORD PTR [rsi]        ; next ref row, bytes 0..15
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]      ; next ref row, bytes 1..16
+        movdqa          xmm3,           xmm1
+
+        punpcklbw       xmm1,           xmm2                 ; horizontal pass, same steps as for row 0 above
+        punpckhbw       xmm3,           xmm2
+        pmaddubsw       xmm1,           [rax]
+        pmaddubsw       xmm3,           [rax]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+        packuswb        xmm1,           xmm3                 ; xmm1 = horizontally filtered current row as 16 bytes
+
+        movdqa          xmm2,           xmm0                 ; xmm2 = previous filtered row
+        movdqa          xmm0,           xmm1                 ; keep current filtered row for the next iteration
+        movdqa          xmm3,           xmm2
+
+        punpcklbw       xmm2,           xmm1                 ; interleave (prev[i], cur[i]) for pixels 0..7
+        punpckhbw       xmm3,           xmm1                 ; ... and for pixels 8..15
+        pmaddubsw       xmm2,           [rdx]                ; prev[i]*v0 + cur[i]*v1 as words
+        pmaddubsw       xmm3,           [rdx]
+
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm2,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+
+        movq            xmm1,           QWORD PTR [rdi]      ; src bytes 0..7
+        pxor            xmm4,           xmm4                 ; xmm4 = 0 for unpacking
+        punpcklbw       xmm1,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]    ; src bytes 8..15
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm2,           xmm1                 ; filtered ref - src
+        psubw           xmm3,           xmm5
+        paddw           xmm6,           xmm2                 ; accumulate differences
+        paddw           xmm6,           xmm3
+        pmaddwd         xmm2,           xmm2                 ; square and add adjacent pairs -> 4 dwords
+        pmaddwd         xmm3,           xmm3
+        paddd           xmm7,           xmm2                 ; accumulate squared differences
+        paddd           xmm7,           xmm3
+
+        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1
+        jnz             filter_block2d_bil_var_ssse3_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_sp_only:                        ; xoffset == 0: vertical (second-pass) filter only
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
+        je              filter_block2d_bil_var_ssse3_full_pixel
+
+        shl             rdx,            4                    ; 16 bytes per entry
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        movdqu          xmm1,           XMMWORD PTR [rsi]        ; ref row 0
+        movdqa          xmm0,           xmm1                 ; xmm0 is overwritten at the top of the loop before it is read
+
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line -- NOTE(review): overwrites the GET_GOT register while GLOBAL() is used in the loop
+        lea             rsi,            [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:                             ; on entry xmm1 = previous ref row
+        movdqu          xmm3,           XMMWORD PTR [rsi]        ; next ref row
+        movdqa          xmm2,           xmm1
+        movdqa          xmm0,           xmm3                 ; saved copy, moved into xmm1 at the bottom of the loop
+
+        punpcklbw       xmm1,           xmm3                 ; interleave (prev[i], cur[i]) for pixels 0..7
+        punpckhbw       xmm2,           xmm3                 ; ... and for pixels 8..15
+        pmaddubsw       xmm1,           [rdx]                ; prev[i]*v0 + cur[i]*v1 as words
+        pmaddubsw       xmm2,           [rdx]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm2,           xmm_filter_shift
+
+        movq            xmm3,           QWORD PTR [rdi]      ; src bytes 0..7
+        pxor            xmm4,           xmm4                 ; xmm4 = 0 for unpacking
+        punpcklbw       xmm3,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]    ; src bytes 8..15
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm1,           xmm3                 ; filtered ref - src
+        psubw           xmm2,           xmm5
+        paddw           xmm6,           xmm1                 ; accumulate differences
+        paddw           xmm6,           xmm2
+        pmaddwd         xmm1,           xmm1                 ; square and add adjacent pairs -> 4 dwords
+        pmaddwd         xmm2,           xmm2
+        paddd           xmm7,           xmm1                 ; accumulate squared differences
+        paddd           xmm7,           xmm2
+
+        movdqa          xmm1,           xmm0                 ; current row becomes the previous row
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1
+        jnz             filter_block2d_bil_sp_only_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_full_pixel:                     ; xoffset == 0 and yoffset == 0: no filtering at all
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        pxor            xmm0,           xmm0                 ; xmm0 = 0 for unpacking
+
+filter_block2d_bil_full_pixel_loop:
+        movq            xmm1,           QWORD PTR [rsi]      ; ref bytes 0..7
+        punpcklbw       xmm1,           xmm0
+        movq            xmm2,           QWORD PTR [rsi+8]    ; ref bytes 8..15
+        punpcklbw       xmm2,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]      ; src bytes 0..7
+        punpcklbw       xmm3,           xmm0
+        movq            xmm4,           QWORD PTR [rdi+8]    ; src bytes 8..15
+        punpcklbw       xmm4,           xmm0
+
+        psubw           xmm1,           xmm3                 ; ref - src
+        psubw           xmm2,           xmm4
+        paddw           xmm6,           xmm1                 ; accumulate differences
+        paddw           xmm6,           xmm2
+        pmaddwd         xmm1,           xmm1                 ; square and add adjacent pairs -> 4 dwords
+        pmaddwd         xmm2,           xmm2
+        paddd           xmm7,           xmm1                 ; accumulate squared differences
+        paddd           xmm7,           xmm2
+
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+        sub             rcx,            1
+        jnz             filter_block2d_bil_full_pixel_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_fp_only:                        ; yoffset == 0: horizontal (first-pass) filter only; rax = HFilter from above
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ; not read on this path (the loop zeroes xmm4 for unpacking)
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line -- NOTE(review): overwrites the GET_GOT register while GLOBAL() is used in the loop
+
+filter_block2d_bil_fp_only_loop:
+        movdqu          xmm1,           XMMWORD PTR [rsi]        ; ref row, bytes 0..15
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]      ; ref row, bytes 1..16
+        movdqa          xmm3,           xmm1
+
+        punpcklbw       xmm1,           xmm2                 ; interleave (p[i], p[i+1]) for pixels 0..7
+        punpckhbw       xmm3,           xmm2                 ; ... and for pixels 8..15
+        pmaddubsw       xmm1,           [rax]                ; p[i]*h0 + p[i+1]*h1 as words
+        pmaddubsw       xmm3,           [rax]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+
+        movq            xmm2,           XMMWORD PTR [rdi]    ; src bytes 0..7 -- NOTE(review): XMMWORD PTR looks like a typo for QWORD PTR, which every sibling movq here uses
+        pxor            xmm4,           xmm4                 ; xmm4 = 0 for unpacking
+        punpcklbw       xmm2,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]    ; src bytes 8..15
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm1,           xmm2                 ; filtered ref - src
+        psubw           xmm3,           xmm5
+        paddw           xmm6,           xmm1                 ; accumulate differences
+        paddw           xmm6,           xmm3
+        pmaddwd         xmm1,           xmm1                 ; square and add adjacent pairs -> 4 dwords
+        pmaddwd         xmm3,           xmm3
+        paddd           xmm7,           xmm1                 ; accumulate squared differences
+        paddd           xmm7,           xmm3
+
+        lea             rsi,            [rsi + rdx]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1
+        jnz             filter_block2d_bil_fp_only_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_variance:                                 ; common tail: fold xmm6 / xmm7 into two scalars and store them
+        pxor        xmm0,           xmm0
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6     ; low 4 word sums land in the upper halves of 4 dwords
+        punpckhwd   xmm1,           xmm6     ; same for the high 4 word sums
+        psrad       xmm0,           16       ; arithmetic shift sign-extends each word sum to a dword
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7     ; xmm6 = {sq0+sq2, 0, sq1+sq3, 0}
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1     ; same folding for the difference sums
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7     ; low dword of xmm6 = total of squared differences
+        paddd       xmm0,           xmm1     ; low dword of xmm0 = total of differences
+
+        mov         rsi,            arg(7) ;[Sum]
+        mov         rdi,            arg(8) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rbx
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:                           ; rounding term added before the shift by xmm_filter_shift (7): 64 = 1 << 6
+    times 8 dw 64
+align 16
+vp8_bilinear_filters_ssse3:          ; 8 entries of 16 bytes, indexed by offset << 4; each entry repeats one byte pair summing to 128
+    times 8 db 128, 0                ; offset 0: not loaded by the routine above, which branches around a pass whose offset is 0
+    times 8 db 112, 16               ; offset 1
+    times 8 db 96,  32               ; offset 2
+    times 8 db 80,  48               ; offset 3
+    times 8 db 64,  64               ; offset 4
+    times 8 db 48,  80               ; offset 5
+    times 8 db 32,  96               ; offset 6
+    times 8 db 16,  112              ; offset 7
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 6eed98e07..07358c0c7 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -456,146 +456,6 @@ unsigned int vp8_sub_pixel_variance8x16_mmx
     return (xxsum - ((xsum * xsum) >> 7));
 }
 
-unsigned int vp8_i_variance16x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-    *sse = var;
-    return (var - ((avg * avg) >> 8));
-
-}
-
-unsigned int vp8_i_variance8x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-
-    *sse = var;
-    return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp8_i_sub_pixel_variance16x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-    int f2soffset = (src_pixels_per_line >> 1);
-    int f2doffset = (dst_pixels_per_line >> 1);
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + f2soffset, src_pixels_per_line,
-        dst_ptr + f2doffset, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + f2soffset + 8, src_pixels_per_line,
-        dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-    *sse = xxsum0;
-    return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_i_sub_pixel_variance8x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-    int f2soffset = (src_pixels_per_line >> 1);
-    int f2doffset = (dst_pixels_per_line >> 1);
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + f2soffset, src_pixels_per_line,
-        dst_ptr + f2doffset, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-    *sse = xxsum0;
-    return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
 
 unsigned int vp8_variance_halfpixvar16x16_h_mmx(
     const unsigned char *src_ptr,
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index 7cf6a6308..0edda3062 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -81,6 +81,16 @@ void vp8_filter_block2d_bil_var_sse2
     int *sum,
     unsigned int *sumsquared
 );
+void vp8_half_horiz_vert_variance8x_h_sse2 /* called by the 8x8 and 8x16 sub-pixel wrappers when xoffset == 4 && yoffset == 4 */
+(
+    const unsigned char *ref_ptr,           /* those callers pass their src_ptr here */
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,           /* those callers pass their dst_ptr here */
+    int src_pixels_per_line,
+    unsigned int Height,                    /* those callers pass 8 (8x8) or 16 (8x16) */
+    int *sum,                               /* receives the caller's xsum */
+    unsigned int *sumsquared                /* receives the caller's xxsum */
+);
 void vp8_half_horiz_vert_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
@@ -91,6 +101,16 @@ void vp8_half_horiz_vert_variance16x_h_sse2
     int *sum,
     unsigned int *sumsquared
 );
+void vp8_half_horiz_variance8x_h_sse2 /* called by the 8x8 and 8x16 sub-pixel wrappers when xoffset == 4 && yoffset == 0 */
+(
+    const unsigned char *ref_ptr,      /* those callers pass their src_ptr here */
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,      /* those callers pass their dst_ptr here */
+    int src_pixels_per_line,
+    unsigned int Height,               /* those callers pass 8 (8x8) or 16 (8x16) */
+    int *sum,                          /* receives the caller's xsum */
+    unsigned int *sumsquared           /* receives the caller's xxsum */
+);
 void vp8_half_horiz_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
@@ -101,6 +121,16 @@ void vp8_half_horiz_variance16x_h_sse2
     int *sum,
     unsigned int *sumsquared
 );
+void vp8_half_vert_variance8x_h_sse2 /* called by the 8x8 and 8x16 sub-pixel wrappers when xoffset == 0 && yoffset == 4 */
+(
+    const unsigned char *ref_ptr,     /* those callers pass their src_ptr here */
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,     /* those callers pass their dst_ptr here */
+    int src_pixels_per_line,
+    unsigned int Height,              /* those callers pass 8 (8x8) or 16 (8x16) */
+    int *sum,                         /* receives the caller's xsum */
+    unsigned int *sumsquared          /* receives the caller's xxsum */
+);
 void vp8_half_vert_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
@@ -262,21 +292,21 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
 
     if (xoffset == 4 && yoffset == 0)
     {
-        vp8_half_horiz_variance16x_h_sse2(
+        vp8_half_horiz_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum, &xxsum);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
-        vp8_half_vert_variance16x_h_sse2(
+        vp8_half_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum, &xxsum);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
-        vp8_half_horiz_vert_variance16x_h_sse2(
+        vp8_half_horiz_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum, &xxsum);
@@ -317,11 +347,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
@@ -329,11 +354,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
@@ -341,11 +361,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
     }
     else
     {
@@ -356,17 +371,16 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
             &xsum0, &xxsum0
         );
 
-
         vp8_filter_block2d_bil_var_sse2(
             src_ptr + 8, src_pixels_per_line,
             dst_ptr + 8, dst_pixels_per_line, 16,
             xoffset, yoffset,
             &xsum1, &xxsum1
         );
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
     }
 
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
@@ -406,11 +420,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
@@ -418,11 +427,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum0, &xxsum0);
-
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
@@ -430,11 +434,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            &xsum1, &xxsum1);
     }
     else
     {
@@ -449,11 +448,10 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
             dst_ptr + 8, dst_pixels_per_line, 8,
             xoffset, yoffset,
             &xsum1, &xxsum1);
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
     }
 
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 7));
 }
@@ -474,21 +472,21 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
 
     if (xoffset == 4 && yoffset == 0)
     {
-        vp8_half_horiz_variance16x_h_sse2(
+        vp8_half_horiz_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum, &xxsum);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
-        vp8_half_vert_variance16x_h_sse2(
+        vp8_half_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum, &xxsum);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
-        vp8_half_horiz_vert_variance16x_h_sse2(
+        vp8_half_horiz_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum, &xxsum);
@@ -506,81 +504,6 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
     return (xxsum - ((xsum * xsum) >> 7));
 }
 
-unsigned int vp8_i_variance16x16_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-
-    *sse = var;
-    return (var - ((avg * avg) >> 8));
-
-}
-
-unsigned int vp8_i_variance8x16_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-
-    *sse = var;
-    return (var - ((avg * avg) >> 7));
-
-}
-
-
-unsigned int vp8_i_sub_pixel_variance16x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
-}
-
-
-unsigned int vp8_i_sub_pixel_variance8x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-
-    return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
-}
-
 
 unsigned int vp8_variance_halfpixvar16x16_h_wmt(
     const unsigned char *src_ptr,
@@ -589,21 +512,14 @@ unsigned int vp8_variance_halfpixvar16x16_h_wmt(
     int  dst_pixels_per_line,
     unsigned int *sse)
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
+    int xsum0;
+    unsigned int xxsum0;
 
     vp8_half_horiz_variance16x_h_sse2(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 16,
         &xsum0, &xxsum0);
 
-    vp8_half_horiz_variance16x_h_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
@@ -616,21 +532,13 @@ unsigned int vp8_variance_halfpixvar16x16_v_wmt(
     int  dst_pixels_per_line,
     unsigned int *sse)
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
+    int xsum0;
+    unsigned int xxsum0;
     vp8_half_vert_variance16x_h_sse2(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 16,
         &xsum0, &xxsum0);
 
-    vp8_half_vert_variance16x_h_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
@@ -643,21 +551,14 @@ unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
     int  dst_pixels_per_line,
     unsigned int *sse)
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
+    int xsum0;
+    unsigned int xxsum0;
 
     vp8_half_horiz_vert_variance16x_h_sse2(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 16,
         &xsum0, &xxsum0);
 
-    vp8_half_horiz_vert_variance16x_h_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c
new file mode 100644
index 000000000..eb5d486bf
--- /dev/null
+++ b/vp8/encoder/x86/variance_ssse3.c
@@ -0,0 +1,165 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern unsigned int vp8_get16x16var_sse2 /* NOTE(review): declared but not referenced anywhere in this file */
+(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *SSE,
+    int *Sum
+);
+extern void vp8_half_horiz_vert_variance16x_h_sse2 /* used below when xoffset == 4 && yoffset == 4 */
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_half_horiz_variance16x_h_sse2 /* used below when xoffset == 4 && yoffset == 0 */
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_half_vert_variance16x_h_sse2 /* used below when xoffset == 0 && yoffset == 4 */
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_ssse3 /* used below for every other (xoffset, yoffset) pair */
+(
+    const unsigned char *ref_ptr,            /* callers below pass their src_ptr here */
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,            /* callers below pass their dst_ptr here */
+    int src_pixels_per_line,
+    unsigned int Height,                     /* callers below pass 16 (16x16) or 8 (16x8) */
+    int  xoffset,
+    int  yoffset,
+    int *sum,
+    unsigned int *sumsquared
+);
+
+unsigned int vp8_sub_pixel_variance16x16_ssse3
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum0;
+    unsigned int xxsum0;
+    /* 16x16 sub-pixel variance: stores the helper's sum of squares in *sse and returns it minus sum^2/256. */
+    // note we could avoid these if statements if the calling function
+    // just called the appropriate functions inside.
+    if (xoffset == 4 && yoffset == 0)      /* horizontal half-pel helper */
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+    }
+    else if (xoffset == 0 && yoffset == 4) /* vertical half-pel helper */
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+    }
+    else if (xoffset == 4 && yoffset == 4) /* half-pel in both directions */
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+    }
+    else                                   /* any other offset pair: general bilinear helper */
+    {
+        vp8_filter_block2d_bil_var_ssse3(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
+    }
+    /* Multiply as unsigned: over 256 8-bit differences |sum| can reach 255*256, and that square overflows a signed int. */
+    *sse = xxsum0;
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp8_sub_pixel_variance16x8_ssse3
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+
+)
+{
+    int xsum0;
+    unsigned int xxsum0;
+    /* 16x8 sub-pixel variance: stores the helper's sum of squares in *sse and returns it minus sum^2/128. */
+    if (xoffset == 4 && yoffset == 0)      /* horizontal half-pel helper */
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
+    }
+    else if (xoffset == 0 && yoffset == 4) /* vertical half-pel helper */
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
+    }
+    else if (xoffset == 4 && yoffset == 4) /* half-pel in both directions */
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
+    }
+    else                                   /* any other offset pair: general bilinear helper */
+    {
+        vp8_filter_block2d_bil_var_ssse3(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
+    }
+    /* >> 7 divides by the 16*8 = 128 pixels; assuming 8-bit differences, |sum| <= 255*128 so the square fits in int. */
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 6bea15ebc..3560f7413 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -286,6 +286,8 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
 #if HAVE_SSSE3
 extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
 extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_ssse3);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_variance_sad16x16x3
@@ -294,6 +296,12 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
 #undef  vp8_variance_sad16x8x3
 #define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
 
+#undef  vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_ssse3
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3
+
 #endif
 #endif
 
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 61c603229..5ab364147 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -176,6 +176,25 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
                     d->dqcoeff
                );
 }
+#if CONFIG_PSNR
+#if ARCH_X86_64
+typedef void ssimpf /* function type shared by the two vp8_ssim_parms_*_sse3 externs below */
+(
+    unsigned char *s,
+    int sp,
+    unsigned char *r,
+    int rp,
+    unsigned long *sum_s,    /* NOTE(review): unsigned long is 32-bit on LLP64 (Win64) - confirm it matches the asm's store width */
+    unsigned long *sum_r,
+    unsigned long *sum_sq_s,
+    unsigned long *sum_sq_r,
+    unsigned long *sum_sxr
+);
+
+extern ssimpf vp8_ssim_parms_16x16_sse3; /* assigned to rtcd.variance.ssimpf later in this patch */
+extern ssimpf vp8_ssim_parms_8x8_sse3;   /* assigned to rtcd.variance.ssimpf_8x8 later in this patch */
+#endif
+#endif
 #endif
 
 
@@ -280,6 +299,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_sse2;
         cpi->rtcd.variance.get8x8var             = vp8_get8x8var_sse2;
         cpi->rtcd.variance.get16x16var           = vp8_get16x16var_sse2;
+
+
         /* cpi->rtcd.variance.get4x4sse_cs  not implemented for wmt */;
 
         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_sse2;
@@ -334,11 +355,23 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;
         cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;
 
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_ssse3;
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_ssse3;
+
         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;
 
+#if CONFIG_PSNR
+#if ARCH_X86_64
+        cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_sse3;
+        cpi->rtcd.variance.ssimpf                = vp8_ssim_parms_16x16_sse3;
+#endif
+#endif
+
     }
 #endif
 
+
+
 #if HAVE_SSE4_1
     if (SSE4_1Enabled)
     {
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 4daadee32..ba9caa7ce 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -24,6 +24,7 @@ VP8_COMMON_SRCS-yes += common/entropymode.c
 VP8_COMMON_SRCS-yes += common/entropymv.c
 VP8_COMMON_SRCS-yes += common/extend.c
 VP8_COMMON_SRCS-yes += common/filter.c
+VP8_COMMON_SRCS-yes += common/filter.h
 VP8_COMMON_SRCS-yes += common/findnearmv.c
 VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
 VP8_COMMON_SRCS-yes += common/idctllm.c
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index a45a37912..2622738ec 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -199,7 +199,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
     {
         int              mb_r = (cfg->g_h + 15) / 16;
         int              mb_c = (cfg->g_w + 15) / 16;
-        size_t           packet_sz = vp8_firstpass_stats_sz(mb_r * mb_c);
+        size_t           packet_sz = sizeof(FIRSTPASS_STATS);
         int              n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
         FIRSTPASS_STATS *stats;
 
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index ed2feddae..8f0681fb9 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -110,10 +110,13 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
+VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
diff --git a/vpxenc.c b/vpxenc.c
index d0f4c2348..7b8de1171 100755
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -163,8 +163,8 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass)
 
         if (!stats->buf.buf)
         {
-            fprintf(stderr, "Failed to allocate first-pass stats buffer (%d bytes)\n",
-                    stats->buf_alloc_sz);
+            fprintf(stderr, "Failed to allocate first-pass stats buffer (%lu bytes)\n",
+                    (unsigned long)stats->buf_alloc_sz);
             exit(EXIT_FAILURE);
         }