55 files changed, 1560 insertions, 1607 deletions
diff --git a/README b/README
index 29072b919..460ad7390 100644
--- a/README
+++ b/README
@@ -79,9 +79,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
     x86-os2-gcc
     x86-solaris-gcc
     x86-win32-gcc
-    x86-win32-vs7
-    x86-win32-vs8
-    x86-win32-vs9
     x86-win32-vs10
     x86-win32-vs11
     x86-win32-vs12
@@ -98,8 +95,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
     x86_64-linux-icc
     x86_64-solaris-gcc
     x86_64-win64-gcc
-    x86_64-win64-vs8
-    x86_64-win64-vs9
     x86_64-win64-vs10
     x86_64-win64-vs11
     x86_64-win64-vs12
diff --git a/build/make/Makefile b/build/make/Makefile
index 3e8c02490..dfb7e4b7a 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -418,7 +418,6 @@ ifneq ($(call enabled,DIST-SRCS),)
     DIST-SRCS-yes            += build/make/gen_asm_deps.sh
     DIST-SRCS-yes            += build/make/Makefile
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_def.sh
-    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_proj.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_vcxproj.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/msvs_common.sh
diff --git a/build/make/configure.sh b/build/make/configure.sh
index f2b8b3765..2e1597779 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1396,10 +1396,6 @@ EOF
     fi
   fi
 
-  if [ "${tgt_isa}" = "x86_64" ] || [ "${tgt_isa}" = "x86" ]; then
-    soft_enable use_x86inc
-  fi
-
   # Position Independent Code (PIC) support, for building relocatable
   # shared objects
   enabled gcc && enabled pic && check_add_cflags -fPIC
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
deleted file mode 100755
index 2b91fbfbc..000000000
--- a/build/make/gen_msvs_proj.sh
+++ /dev/null
@@ -1,490 +0,0 @@
-#!/bin/bash
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-self=$0
-self_basename=${self##*/}
-self_dirname=$(dirname "$0")
-
-. "$self_dirname/msvs_common.sh"|| exit 127
-
-show_help() {
-    cat <<EOF
-Usage: ${self_basename} --name=projname [options] file1 [file2 ...]
-
-This script generates a Visual Studio project file from a list of source
-code files.
-
-Options:
-    --help                      Print this message
-    --exe                       Generate a project for building an Application
-    --lib                       Generate a project for creating a static library
-    --dll                       Generate a project for creating a dll
-    --static-crt                Use the static C runtime (/MT)
-    --target=isa-os-cc          Target specifier (required)
-    --out=filename              Write output to a file [stdout]
-    --name=project_name         Name of the project (required)
-    --proj-guid=GUID            GUID to use for the project
-    --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (7,8,9) of visual studio to generate for
-    --src-path-bare=dir         Path to root of source tree
-    -Ipath/to/include           Additional include directories
-    -DFLAG[=value]              Preprocessor macros to define
-    -Lpath/to/lib               Additional library search paths
-    -llibname                   Library to link against
-EOF
-    exit 1
-}
-
-generate_filter() {
-    local var=$1
-    local name=$2
-    local pats=$3
-    local file_list_sz
-    local i
-    local f
-    local saveIFS="$IFS"
-    local pack
-    echo "generating filter '$name' from ${#file_list[@]} files" >&2
-    IFS=*
-
-    open_tag Filter \
-        Name=$name \
-        Filter=$pats \
-        UniqueIdentifier=`generate_uuid` \
-
-    file_list_sz=${#file_list[@]}
-    for i in ${!file_list[@]}; do
-        f=${file_list[i]}
-        for pat in ${pats//;/$IFS}; do
-            if [ "${f##*.}" == "$pat" ]; then
-                unset file_list[i]
-
-                objf=$(echo ${f%.*}.obj \
-                       | sed -e "s,$src_path_bare,," \
-                             -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
-                open_tag File RelativePath="$f"
-
-                if [ "$pat" == "asm" ] && $asm_use_custom_step; then
-                    # Avoid object file name collisions, i.e. vpx_config.c and
-                    # vpx_config.asm produce the same object file without
-                    # this additional suffix.
-                    objf=${objf%.obj}_asm.obj
-                    for plat in "${platforms[@]}"; do
-                        for cfg in Debug Release; do
-                            open_tag FileConfiguration \
-                                Name="${cfg}|${plat}" \
-
-                            tag Tool \
-                                Name="VCCustomBuildTool" \
-                                Description="Assembling \$(InputFileName)" \
-                                CommandLine="$(eval echo \$asm_${cfg}_cmdline) -o \$(IntDir)\\$objf" \
-                                Outputs="\$(IntDir)\\$objf" \
-
-                            close_tag FileConfiguration
-                        done
-                    done
-                fi
-                if [ "$pat" == "c" ] || \
-                   [ "$pat" == "cc" ] || [ "$pat" == "cpp" ]; then
-                    for plat in "${platforms[@]}"; do
-                        for cfg in Debug Release; do
-                            open_tag FileConfiguration \
-                                Name="${cfg}|${plat}" \
-
-                            tag Tool \
-                                Name="VCCLCompilerTool" \
-                                ObjectFile="\$(IntDir)\\$objf" \
-
-                            close_tag FileConfiguration
-                        done
-                    done
-                fi
-                close_tag File
-
-                break
-            fi
-        done
-    done
-
-    close_tag Filter
-    IFS="$saveIFS"
-}
-
-# Process command line
-unset target
-for opt in "$@"; do
-    optval="${opt#*=}"
-    case "$opt" in
-        --help|-h) show_help
-        ;;
-        --target=*) target="${optval}"
-        ;;
-        --out=*) outfile="$optval"
-        ;;
-        --name=*) name="${optval}"
-        ;;
-        --proj-guid=*) guid="${optval}"
-        ;;
-        --module-def=*) link_opts="${link_opts} ModuleDefinitionFile=${optval}"
-        ;;
-        --exe) proj_kind="exe"
-        ;;
-        --dll) proj_kind="dll"
-        ;;
-        --lib) proj_kind="lib"
-        ;;
-        --src-path-bare=*)
-            src_path_bare=$(fix_path "$optval")
-            src_path_bare=${src_path_bare%/}
-        ;;
-        --static-crt) use_static_runtime=true
-        ;;
-        --ver=*)
-            vs_ver="$optval"
-            case "$optval" in
-                [789])
-                ;;
-                *) die Unrecognized Visual Studio Version in $opt
-                ;;
-            esac
-        ;;
-        -I*)
-            opt=${opt##-I}
-            opt=$(fix_path "$opt")
-            opt="${opt%/}"
-            incs="${incs}${incs:+;}&quot;${opt}&quot;"
-            yasmincs="${yasmincs} -I&quot;${opt}&quot;"
-        ;;
-        -D*) defines="${defines}${defines:+;}${opt##-D}"
-        ;;
-        -L*) # fudge . to $(OutDir)
-            if [ "${opt##-L}" == "." ]; then
-                libdirs="${libdirs}${libdirs:+;}&quot;\$(OutDir)&quot;"
-            else
-                 # Also try directories for this platform/configuration
-                 opt=${opt##-L}
-                 opt=$(fix_path "$opt")
-                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}&quot;"
-                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)/\$(ConfigurationName)&quot;"
-                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)&quot;"
-            fi
-        ;;
-        -l*) libs="${libs}${libs:+ }${opt##-l}.lib"
-        ;;
-        -*) die_unknown $opt
-        ;;
-        *)
-            # The paths in file_list are fixed outside of the loop.
-            file_list[${#file_list[@]}]="$opt"
-            case "$opt" in
-                 *.asm) uses_asm=true
-                 ;;
-            esac
-        ;;
-    esac
-done
-
-# Make one call to fix_path for file_list to improve performance.
-fix_file_list file_list
-
-outfile=${outfile:-/dev/stdout}
-guid=${guid:-`generate_uuid`}
-asm_use_custom_step=false
-uses_asm=${uses_asm:-false}
-case "${vs_ver:-8}" in
-    7) vs_ver_id="7.10"
-       asm_use_custom_step=$uses_asm
-       warn_64bit='Detect64BitPortabilityProblems=true'
-    ;;
-    8) vs_ver_id="8.00"
-       asm_use_custom_step=$uses_asm
-       warn_64bit='Detect64BitPortabilityProblems=true'
-    ;;
-    9) vs_ver_id="9.00"
-       asm_use_custom_step=$uses_asm
-       warn_64bit='Detect64BitPortabilityProblems=false'
-    ;;
-esac
-
-[ -n "$name" ] || die "Project name (--name) must be specified!"
-[ -n "$target" ] || die "Target (--target) must be specified!"
-
-if ${use_static_runtime:-false}; then
-    release_runtime=0
-    debug_runtime=1
-    lib_sfx=mt
-else
-    release_runtime=2
-    debug_runtime=3
-    lib_sfx=md
-fi
-
-# Calculate debug lib names: If a lib ends in ${lib_sfx}.lib, then rename
-# it to ${lib_sfx}d.lib. This precludes linking to release libs from a
-# debug exe, so this may need to be refactored later.
-for lib in ${libs}; do
-    if [ "$lib" != "${lib%${lib_sfx}.lib}" ]; then
-        lib=${lib%.lib}d.lib
-    fi
-    debug_libs="${debug_libs}${debug_libs:+ }${lib}"
-done
-
-
-# List Keyword for this target
-case "$target" in
-    x86*) keyword="ManagedCProj"
-    ;;
-    *) die "Unsupported target $target!"
-esac
-
-# List of all platforms supported for this target
-case "$target" in
-    x86_64*)
-        platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
-    ;;
-    x86*)
-        platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
-    ;;
-    *) die "Unsupported target $target!"
-    ;;
-esac
-
-generate_vcproj() {
-    case "$proj_kind" in
-        exe) vs_ConfigurationType=1
-        ;;
-        dll) vs_ConfigurationType=2
-        ;;
-        *)   vs_ConfigurationType=4
-        ;;
-    esac
-
-    echo "<?xml version=\"1.0\" encoding=\"Windows-1252\"?>"
-    open_tag VisualStudioProject \
-        ProjectType="Visual C++" \
-        Version="${vs_ver_id}" \
-        Name="${name}" \
-        ProjectGUID="{${guid}}" \
-        RootNamespace="${name}" \
-        Keyword="${keyword}" \
-
-    open_tag Platforms
-    for plat in "${platforms[@]}"; do
-        tag Platform Name="$plat"
-    done
-    close_tag Platforms
-
-    open_tag Configurations
-    for plat in "${platforms[@]}"; do
-        plat_no_ws=`echo $plat | sed 's/[^A-Za-z0-9_]/_/g'`
-        open_tag Configuration \
-            Name="Debug|$plat" \
-            OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
-            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
-            ConfigurationType="$vs_ConfigurationType" \
-            CharacterSet="1" \
-
-        case "$target" in
-            x86*)
-                case "$name" in
-                    vpx)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            Optimization="0" \
-                            AdditionalIncludeDirectories="$incs" \
-                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                            RuntimeLibrary="$debug_runtime" \
-                            UsePrecompiledHeader="0" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="2" \
-                            $warn_64bit \
-
-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
-                    ;;
-                    *)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            Optimization="0" \
-                            AdditionalIncludeDirectories="$incs" \
-                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                            RuntimeLibrary="$debug_runtime" \
-                            UsePrecompiledHeader="0" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="2" \
-                            $warn_64bit \
-
-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
-                    ;;
-                esac
-            ;;
-        esac
-
-        case "$proj_kind" in
-            exe)
-                case "$target" in
-                    x86*)
-                        case "$name" in
-                            *)
-                                tag Tool \
-                                    Name="VCLinkerTool" \
-                                    AdditionalDependencies="$debug_libs \$(NoInherit)" \
-                                    AdditionalLibraryDirectories="$libdirs" \
-                                    GenerateDebugInformation="true" \
-                                    ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
-                            ;;
-                        esac
-                    ;;
-                 esac
-            ;;
-            lib)
-                case "$target" in
-                    x86*)
-                        tag Tool \
-                            Name="VCLibrarianTool" \
-                            OutputFile="\$(OutDir)/${name}${lib_sfx}d.lib" \
-
-                    ;;
-                esac
-            ;;
-            dll)
-                tag Tool \
-                    Name="VCLinkerTool" \
-                    AdditionalDependencies="\$(NoInherit)" \
-                    LinkIncremental="2" \
-                    GenerateDebugInformation="true" \
-                    AssemblyDebug="1" \
-                    TargetMachine="1" \
-                    $link_opts \
-
-            ;;
-        esac
-
-        close_tag Configuration
-
-        open_tag Configuration \
-            Name="Release|$plat" \
-            OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
-            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
-            ConfigurationType="$vs_ConfigurationType" \
-            CharacterSet="1" \
-            WholeProgramOptimization="0" \
-
-        case "$target" in
-            x86*)
-                case "$name" in
-                    vpx)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            Optimization="2" \
-                            FavorSizeorSpeed="1" \
-                            AdditionalIncludeDirectories="$incs" \
-                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                            RuntimeLibrary="$release_runtime" \
-                            UsePrecompiledHeader="0" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="0" \
-                            $warn_64bit \
-
-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
-                    ;;
-                    *)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            AdditionalIncludeDirectories="$incs" \
-                            Optimization="2" \
-                            FavorSizeorSpeed="1" \
-                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                            RuntimeLibrary="$release_runtime" \
-                            UsePrecompiledHeader="0" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="0" \
-                            $warn_64bit \
-
-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
-                    ;;
-                esac
-            ;;
-        esac
-
-        case "$proj_kind" in
-            exe)
-                case "$target" in
-                    x86*)
-                        case "$name" in
-                            *)
-                                tag Tool \
-                                    Name="VCLinkerTool" \
-                                    AdditionalDependencies="$libs \$(NoInherit)" \
-                                    AdditionalLibraryDirectories="$libdirs" \
-
-                            ;;
-                        esac
-                    ;;
-                 esac
-            ;;
-            lib)
-                case "$target" in
-                    x86*)
-                        tag Tool \
-                            Name="VCLibrarianTool" \
-                            OutputFile="\$(OutDir)/${name}${lib_sfx}.lib" \
-
-                    ;;
-                esac
-            ;;
-            dll) # note differences to debug version: LinkIncremental, AssemblyDebug
-                tag Tool \
-                    Name="VCLinkerTool" \
-                    AdditionalDependencies="\$(NoInherit)" \
-                    LinkIncremental="1" \
-                    GenerateDebugInformation="true" \
-                    TargetMachine="1" \
-                    $link_opts \
-
-            ;;
-        esac
-
-        close_tag Configuration
-    done
-    close_tag Configurations
-
-    open_tag Files
-    generate_filter srcs   "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx"
-    generate_filter hdrs   "Header Files"   "h;hm;inl;inc;xsd"
-    generate_filter resrcs "Resource Files" "rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
-    generate_filter resrcs "Build Files"    "mk"
-    close_tag Files
-
-    tag       Globals
-    close_tag VisualStudioProject
-
-    # This must be done from within the {} subshell
-    echo "Ignored files list (${#file_list[@]} items) is:" >&2
-    for f in "${file_list[@]}"; do
-        echo "    $f" >&2
-    done
-}
-
-generate_vcproj |
-    sed  -e '/"/s;\([^ "]\)/;\1\\;g' > ${outfile}
-
-exit
-<!--
-TODO: Add any files not captured by filters.
-                <File
-                        RelativePath=".\ReadMe.txt"
-                        >
-                </File>
--->
diff --git a/build/make/gen_msvs_sln.sh b/build/make/gen_msvs_sln.sh
index 664b404c9..7d5f46810 100755
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh
@@ -55,16 +55,11 @@ indent_pop() {
 
 parse_project() {
     local file=$1
-    if [ "$sfx" = "vcproj" ]; then
-        local name=`grep Name "$file" | awk 'BEGIN {FS="\""}{if (NR==1) print $2}'`
-        local guid=`grep ProjectGUID "$file" | awk 'BEGIN {FS="\""}{if (NR==1) print $2}'`
-    else
-        local name=`grep RootNamespace "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
-        local guid=`grep ProjectGuid "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
-    fi
+    local name=`grep RootNamespace "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
+    local guid=`grep ProjectGuid "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
 
     # save the project GUID to a varaible, normalizing to the basename of the
-    # vcproj file without the extension
+    # vcxproj file without the extension
     local var
     var=${file##*/}
     var=${var%%.${sfx}}
@@ -72,13 +67,8 @@ parse_project() {
     eval "${var}_name=$name"
     eval "${var}_guid=$guid"
 
-    if [ "$sfx" = "vcproj" ]; then
-        cur_config_list=`grep -A1 '<Configuration' $file |
-            grep Name | cut -d\" -f2`
-    else
-        cur_config_list=`grep -B1 'Label="Configuration"' $file |
-            grep Condition | cut -d\' -f4`
-    fi
+    cur_config_list=`grep -B1 'Label="Configuration"' $file |
+        grep Condition | cut -d\' -f4`
     new_config_list=$(for i in $config_list $cur_config_list; do
         echo $i
     done | sort | uniq)
@@ -103,25 +93,6 @@ process_project() {
     eval "${var}_guid=$guid"
 
     echo "Project(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"$name\", \"$file\", \"$guid\""
-    indent_push
-
-    eval "local deps=\"\${${var}_deps}\""
-    if [ -n "$deps" ] && [ "$sfx" = "vcproj" ]; then
-        echo "${indent}ProjectSection(ProjectDependencies) = postProject"
-        indent_push
-
-        for dep in $deps; do
-            eval "local dep_guid=\${${dep}_guid}"
-            [ -z "${dep_guid}" ] && die "Unknown GUID for $dep (dependency of $var)"
-            echo "${indent}$dep_guid = $dep_guid"
-        done
-
-        indent_pop
-        echo "${indent}EndProjectSection"
-
-    fi
-
-    indent_pop
     echo "EndProject"
 }
 
@@ -191,11 +162,7 @@ process_makefile() {
     IFS=$'\r'$'\n'
     local TAB=$'\t'
     cat <<EOF
-ifeq (\$(CONFIG_VS_VERSION),7)
-MSBUILD_TOOL := devenv.com
-else
 MSBUILD_TOOL := msbuild.exe
-endif
 found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes)
 .nodevenv.once:
 ${TAB}@echo "  * \$(MSBUILD_TOOL) not found in path."
@@ -204,7 +171,7 @@ ${TAB}@echo "  * You will have to build all configurations manually using the"
 ${TAB}@echo "  * Visual Studio IDE. To allow make to build them automatically,"
 ${TAB}@echo "  * add the Common7/IDE directory of your Visual Studio"
 ${TAB}@echo "  * installation to your path, eg:"
-${TAB}@echo "  *   C:\Program Files\Microsoft Visual Studio 8\Common7\IDE"
+${TAB}@echo "  *   C:\Program Files\Microsoft Visual Studio 10.0\Common7\IDE"
 ${TAB}@echo "  * "
 ${TAB}@touch \$@
 CLEAN-OBJS += \$(if \$(found_devenv),,.nodevenv.once)
@@ -221,16 +188,9 @@ clean::
 ${TAB}rm -rf "$platform"/"$config"
 .PHONY: $nows_sln_config
 ifneq (\$(found_devenv),)
-  ifeq (\$(CONFIG_VS_VERSION),7)
-$nows_sln_config: $outfile
-${TAB}\$(MSBUILD_TOOL) $outfile -build "$config"
-
-  else
 $nows_sln_config: $outfile
 ${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\
 ${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform"
-
-  endif
 else
 $nows_sln_config: $outfile .nodevenv.once
 ${TAB}@echo "  * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)."
@@ -255,23 +215,12 @@ for opt in "$@"; do
     ;;
     --ver=*) vs_ver="$optval"
              case $optval in
-             [789]|10|11|12|14)
+             10|11|12|14)
              ;;
              *) die Unrecognized Visual Studio Version in $opt
              ;;
              esac
     ;;
-    --ver=*) vs_ver="$optval"
-             case $optval in
-             7) sln_vers="8.00"
-                sln_vers_str="Visual Studio .NET 2003"
-             ;;
-             [89])
-             ;;
-             *) die "Unrecognized Visual Studio Version '$optval' in $opt"
-             ;;
-             esac
-    ;;
     --target=*) target="${optval}"
     ;;
     -*) die_unknown $opt
@@ -281,16 +230,7 @@ for opt in "$@"; do
 done
 outfile=${outfile:-/dev/stdout}
 mkoutfile=${mkoutfile:-/dev/stdout}
-case "${vs_ver:-8}" in
-    7) sln_vers="8.00"
-       sln_vers_str="Visual Studio .NET 2003"
-    ;;
-    8) sln_vers="9.00"
-       sln_vers_str="Visual Studio 2005"
-    ;;
-    9) sln_vers="10.00"
-       sln_vers_str="Visual Studio 2008"
-    ;;
+case "${vs_ver:-10}" in
     10) sln_vers="11.00"
        sln_vers_str="Visual Studio 2010"
     ;;
@@ -304,14 +244,7 @@ case "${vs_ver:-8}" in
        sln_vers_str="Visual Studio 2015"
     ;;
 esac
-case "${vs_ver:-8}" in
-    [789])
-    sfx=vcproj
-    ;;
-    10|11|12|14)
-    sfx=vcxproj
-    ;;
-esac
+sfx=vcxproj
 
 for f in "${file_list[@]}"; do
     parse_project $f
diff --git a/configure b/configure
index f82ee046b..c1ad24bfb 100755
--- a/configure
+++ b/configure
@@ -132,9 +132,6 @@ all_platforms="${all_platforms} x86-linux-icc"
 all_platforms="${all_platforms} x86-os2-gcc"
 all_platforms="${all_platforms} x86-solaris-gcc"
 all_platforms="${all_platforms} x86-win32-gcc"
-all_platforms="${all_platforms} x86-win32-vs7"
-all_platforms="${all_platforms} x86-win32-vs8"
-all_platforms="${all_platforms} x86-win32-vs9"
 all_platforms="${all_platforms} x86-win32-vs10"
 all_platforms="${all_platforms} x86-win32-vs11"
 all_platforms="${all_platforms} x86-win32-vs12"
@@ -152,8 +149,6 @@ all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
 all_platforms="${all_platforms} x86_64-win64-gcc"
-all_platforms="${all_platforms} x86_64-win64-vs8"
-all_platforms="${all_platforms} x86_64-win64-vs9"
 all_platforms="${all_platforms} x86_64-win64-vs10"
 all_platforms="${all_platforms} x86_64-win64-vs11"
 all_platforms="${all_platforms} x86_64-win64-vs12"
@@ -272,7 +267,6 @@ CONFIG_LIST="
     install_bins
     install_libs
     install_srcs
-    use_x86inc
     debug
     gprof
     gcov
@@ -334,7 +328,6 @@ CMDLINE_SELECT="
     gprof
     gcov
     pic
-    use_x86inc
     optimizations
     ccache
     runtime_cpu_detect
@@ -646,17 +639,9 @@ process_toolchain() {
         vs*) enable_feature msvs
              enable_feature solution
              vs_version=${tgt_cc##vs}
-             case $vs_version in
-             [789])
-                 VCPROJ_SFX=vcproj
-                 gen_vcproj_cmd=${source_path}/build/make/gen_msvs_proj.sh
-                 ;;
-             10|11|12|14)
-                 VCPROJ_SFX=vcxproj
-                 gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
-                 enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
-                 ;;
-             esac
+             VCPROJ_SFX=vcxproj
+             gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
+             enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
              all_targets="${all_targets} solution"
              INLINE="__forceinline"
         ;;
diff --git a/examples.mk b/examples.mk
index c891a5496..cc7fb1ddc 100644
--- a/examples.mk
+++ b/examples.mk
@@ -215,6 +215,17 @@ vp8cx_set_ref.SRCS                 += vpx_ports/msvc.h
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
 
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+ifeq ($(CONFIG_DECODERS),yes)
+EXAMPLES-yes                       += vp9cx_set_ref.c
+vp9cx_set_ref.SRCS                 += ivfenc.h ivfenc.c
+vp9cx_set_ref.SRCS                 += tools_common.h tools_common.c
+vp9cx_set_ref.SRCS                 += video_common.h
+vp9cx_set_ref.SRCS                 += video_writer.h video_writer.c
+vp9cx_set_ref.GUID                  = 65D7F14A-2EE6-4293-B958-AB5107A03B55
+vp9cx_set_ref.DESCRIPTION           = VP9 set encoder reference frame
+endif
+endif
 
 ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
 ifeq ($(CONFIG_LIBYUV),yes)
diff --git a/examples/vp9cx_set_ref.c b/examples/vp9cx_set_ref.c
new file mode 100644
index 000000000..acf79dbca
--- /dev/null
+++ b/examples/vp9cx_set_ref.c
@@ -0,0 +1,453 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+// VP9 Set Reference Frame
+// ============================
+//
+// This is an example demonstrating how to overwrite the VP9 encoder's
+// internal reference frame. In the sample we set the last frame to the
+// current frame. This technique could be used to bounce between two cameras.
+//
+// The decoder would also have to set the reference frame to the same value
+// on the same frame, or the video will become corrupt. The 'test_decode'
+// variable is set to 1 in this example that tests if the encoder and decoder
+// results are matching.
+//
+// Usage
+// -----
+// This example encodes a raw video. And the last argument passed in specifies
+// the frame number to update the reference frame on. For example, run
+// examples/vp9cx_set_ref 352 288 in.yuv out.ivf 4 30
+// The parameter is parsed as follows:
+//
+//
+// Extra Variables
+// ---------------
+// This example maintains the frame number passed on the command line
+// in the `update_frame_num` variable.
+//
+//
+// Configuration
+// -------------
+//
+// The reference frame is updated on the frame specified on the command
+// line.
+//
+// Observing The Effects
+// ---------------------
+// The encoder and decoder results should be matching when the same reference
+// frame setting operation is done in both encoder and decoder. Otherwise,
+// the encoder/decoder mismatch would be seen.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx/vpx_encoder.h"
+
+#include "./tools_common.h"
+#include "./video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit() {
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile> "
+          "<frame> <limit(optional)>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int compare_img(const vpx_image_t *const img1,
+                       const vpx_image_t *const img2) {
+  uint32_t l_w = img1->d_w;
+  uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  uint32_t i;
+  int match = 1;
+
+  match &= (img1->fmt == img2->fmt);
+  match &= (img1->d_w == img2->d_w);
+  match &= (img1->d_h == img2->d_h);
+
+  for (i = 0; i < img1->d_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                     l_w) == 0);
+
+  for (i = 0; i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                     c_w) == 0);
+
+  for (i = 0; i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                     c_w) == 0);
+
+  return match;
+}
+
+#define mmin(a, b)  ((a) < (b) ? (a) : (b))
+static void find_mismatch(const vpx_image_t *const img1,
+                          const vpx_image_t *const img2,
+                          int yloc[4], int uloc[4], int vloc[4]) {
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_Y] +
+                (i + k) * img1->stride[VPX_PLANE_Y] + j + l) !=
+              *(img2->planes[VPX_PLANE_Y] +
+                (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(img1->planes[VPX_PLANE_Y] +
+                        (i + k) * img1->stride[VPX_PLANE_Y] + j + l);
+            yloc[3] = *(img2->planes[VPX_PLANE_Y] +
+                        (i + k) * img2->stride[VPX_PLANE_Y] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_U] +
+                (i + k) * img1->stride[VPX_PLANE_U] + j + l) !=
+              *(img2->planes[VPX_PLANE_U] +
+                (i + k) * img2->stride[VPX_PLANE_U] + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(img1->planes[VPX_PLANE_U] +
+                        (i + k) * img1->stride[VPX_PLANE_U] + j + l);
+            uloc[3] = *(img2->planes[VPX_PLANE_U] +
+                        (i + k) * img2->stride[VPX_PLANE_U] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_V] +
+                (i + k) * img1->stride[VPX_PLANE_V] + j + l) !=
+              *(img2->planes[VPX_PLANE_V] +
+                (i + k) * img2->stride[VPX_PLANE_V] + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(img1->planes[VPX_PLANE_V] +
+                        (i + k) * img1->stride[VPX_PLANE_V] + j + l);
+            vloc[3] = *(img2->planes[VPX_PLANE_V] +
+                        (i + k) * img2->stride[VPX_PLANE_V] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+
+static void testing_decode(vpx_codec_ctx_t *encoder,
+                           vpx_codec_ctx_t *decoder,
+                           vpx_codec_enc_cfg_t *cfg,
+                           unsigned int frame_out,
+                           int *mismatch_seen) {
+  vpx_image_t enc_img, dec_img;
+  struct vp9_ref_frame ref_enc, ref_dec;
+
+  if (*mismatch_seen)
+    return;
+
+  ref_enc.idx = 0;
+  ref_dec.idx = 0;
+  if (vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc))
+    die_codec(encoder,  "Failed to get encoder reference frame");
+  enc_img = ref_enc.img;
+  if (vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec))
+    die_codec(decoder, "Failed to get decoder reference frame");
+  dec_img = ref_dec.img;
+
+  if (!compare_img(&enc_img, &dec_img)) {
+    int y[4], u[4], v[4];
+
+    *mismatch_seen = 1;
+
+    find_mismatch(&enc_img, &dec_img, y, u, v);
+    printf("Encode/decode mismatch on frame %d at"
+           " Y[%d, %d] {%d/%d},"
+           " U[%d, %d] {%d/%d},"
+           " V[%d, %d] {%d/%d}",
+           frame_out,
+           y[0], y[1], y[2], y[3],
+           u[0], u[1], u[2], u[3],
+           v[0], v[1], v[2], v[3]);
+  }
+
+  vpx_img_free(&enc_img);
+  vpx_img_free(&dec_img);
+}
+
+static int encode_frame(vpx_codec_ctx_t *ecodec,
+                        vpx_codec_enc_cfg_t *cfg,
+                        vpx_image_t *img,
+                        unsigned int frame_in,
+                        VpxVideoWriter *writer,
+                        int test_decode,
+                        vpx_codec_ctx_t *dcodec,
+                        unsigned int *frame_out,
+                        int *mismatch_seen) {
+  int got_pkts = 0;
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  int got_data;
+  const vpx_codec_err_t res = vpx_codec_encode(ecodec, img, frame_in, 1,
+                                               0, VPX_DL_GOOD_QUALITY);
+  if (res != VPX_CODEC_OK)
+    die_codec(ecodec, "Failed to encode frame");
+
+  got_data = 0;
+
+  while ((pkt = vpx_codec_get_cx_data(ecodec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+
+      if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
+                *frame_out += 1;
+        }
+
+      if (!vpx_video_writer_write_frame(writer,
+                                        pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(ecodec, "Failed to write compressed frame");
+      }
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+      got_data = 1;
+
+      // Decode 1 frame.
+      if (test_decode) {
+        if (vpx_codec_decode(dcodec, pkt->data.frame.buf,
+                             (unsigned int)pkt->data.frame.sz, NULL, 0))
+          die_codec(dcodec, "Failed to decode frame.");
+      }
+    }
+  }
+
+  // Mismatch checking
+  if (got_data && test_decode) {
+    testing_decode(ecodec, dcodec, cfg, *frame_out, mismatch_seen);
+  }
+
+  return got_pkts;
+}
+
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  // Encoder
+  vpx_codec_ctx_t ecodec = {0};
+  vpx_codec_enc_cfg_t cfg = {0};
+  unsigned int frame_in = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+
+  // Test encoder/decoder mismatch.
+  int test_decode = 1;
+  // Decoder
+  vpx_codec_ctx_t dcodec;
+  unsigned int frame_out = 0;
+
+  // The frame number to set reference frame on
+  int update_frame_num = 0;
+  int mismatch_seen = 0;
+
+  const int fps = 30;
+  const int bitrate = 500;
+
+  const char *width_arg = NULL;
+  const char *height_arg = NULL;
+  const char *infile_arg = NULL;
+  const char *outfile_arg = NULL;
+  unsigned int limit = 0;
+  exec_name = argv[0];
+
+  if (argc < 6)
+    die("Invalid number of arguments");
+
+  width_arg = argv[1];
+  height_arg = argv[2];
+  infile_arg = argv[3];
+  outfile_arg = argv[4];
+
+  encoder = get_vpx_encoder_by_name("vp9");
+  if (!encoder)
+    die("Unsupported codec.");
+
+  update_frame_num = atoi(argv[5]);
+  // In VP9, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are
+  // allocated while calling vpx_codec_encode(), thus, setting reference for
+  // 1st frame isn't supported.
+  if (update_frame_num <= 1)
+    die("Couldn't parse frame number '%s'\n", argv[5]);
+
+  if (argc > 6) {
+    limit = atoi(argv[6]);
+    if (update_frame_num > limit)
+      die("Update frame number couldn't larger than limit\n");
+  }
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(width_arg, NULL, 0);
+  info.frame_height = strtol(height_arg, NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                                             info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)
+    die_codec(&ecodec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_lag_in_frames = 3;
+
+  writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", outfile_arg);
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading.", infile_arg);
+
+  if (vpx_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&ecodec, "Failed to initialize encoder");
+
+  // Disable alt_ref.
+  if (vpx_codec_control(&ecodec, VP8E_SET_ENABLEAUTOALTREF, 0))
+    die_codec(&ecodec, "Failed to set enable auto alt ref");
+
+  if (test_decode) {
+      const VpxInterface *decoder = get_vpx_decoder_by_name("vp9");
+      if (vpx_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0))
+        die_codec(&dcodec, "Failed to initialize decoder.");
+  }
+
+  // Encode frames.
+  while (vpx_img_read(&raw, infile)) {
+    if (limit && frame_in >= limit)
+      break;
+    if (update_frame_num > 1 && frame_out + 1 == update_frame_num) {
+      vpx_ref_frame_t ref;
+      ref.frame_type = VP8_LAST_FRAME;
+      ref.img = raw;
+      // Set reference frame in encoder.
+      if (vpx_codec_control(&ecodec, VP8_SET_REFERENCE, &ref))
+        die_codec(&ecodec, "Failed to set reference frame");
+      printf(" <SET_REF>");
+
+      // If set_reference in decoder is commented out, the enc/dec mismatch
+      // would be seen.
+      if (test_decode) {
+        if (vpx_codec_control(&dcodec, VP8_SET_REFERENCE, &ref))
+          die_codec(&dcodec, "Failed to set reference frame");
+      }
+    }
+
+    encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode,
+                 &dcodec, &frame_out, &mismatch_seen);
+    frame_in++;
+    if (mismatch_seen)
+      break;
+  }
+
+  // Flush encoder.
+  if (!mismatch_seen)
+    while (encode_frame(&ecodec, &cfg, NULL, frame_in, writer, test_decode,
+                        &dcodec, &frame_out, &mismatch_seen)) {}
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_out);
+
+  if (test_decode) {
+    if (!mismatch_seen)
+      printf("Encoder/decoder results are matching.\n");
+    else
+      printf("Encoder/decoder results are NOT matching.\n");
+  }
+
+  if (test_decode)
+    if (vpx_codec_destroy(&dcodec))
+      die_codec(&dcodec, "Failed to destroy decoder");
+
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&ecodec))
+    die_codec(&ecodec, "Failed to destroy encoder.");
+
+  vpx_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 73b0edb99..5933a62b0 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -431,7 +431,8 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
   void CopyOutputToRef() {
     memcpy(output_ref_, output_, kOutputBufferSize);
 #if CONFIG_VP9_HIGHBITDEPTH
-    memcpy(output16_ref_, output16_, kOutputBufferSize);
+    memcpy(output16_ref_, output16_,
+           kOutputBufferSize * sizeof(output16_ref_[0]));
 #endif
   }
 
@@ -443,41 +444,41 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
   }
 
   uint8_t *input() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return input_ + offset;
     } else {
-      return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(input16_) + offset;
     }
 #else
-    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return input_ + offset;
 #endif
   }
 
   uint8_t *output() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ + offset;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_) + offset;
     }
 #else
-    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ + offset;
 #endif
   }
 
   uint8_t *output_ref() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ref_ + offset;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
     }
 #else
-    return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ref_ + offset;
 #endif
   }
 
@@ -989,14 +990,12 @@ void wrap_ ## func ## _ ## bd(const uint8_t *src, ptrdiff_t src_stride, \
                       w, h, bd); \
 }
 #if HAVE_SSE2 && ARCH_X86_64
-#if CONFIG_USE_X86INC
 WRAP(convolve_copy_sse2, 8)
 WRAP(convolve_avg_sse2, 8)
 WRAP(convolve_copy_sse2, 10)
 WRAP(convolve_avg_sse2, 10)
 WRAP(convolve_copy_sse2, 12)
 WRAP(convolve_avg_sse2, 12)
-#endif  // CONFIG_USE_X86INC
 WRAP(convolve8_horiz_sse2, 8)
 WRAP(convolve8_avg_horiz_sse2, 8)
 WRAP(convolve8_vert_sse2, 8)
@@ -1090,11 +1089,7 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest,
 #if HAVE_SSE2 && ARCH_X86_64
 #if CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_sse2(
-#if CONFIG_USE_X86INC
     wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
-#else
-    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
-#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
     wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
     wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
@@ -1102,11 +1097,7 @@ const ConvolveFunctions convolve8_sse2(
     wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
     wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
 const ConvolveFunctions convolve10_sse2(
-#if CONFIG_USE_X86INC
     wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
-#else
-    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
-#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
     wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
     wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
@@ -1114,11 +1105,7 @@ const ConvolveFunctions convolve10_sse2(
     wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
     wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
 const ConvolveFunctions convolve12_sse2(
-#if CONFIG_USE_X86INC
     wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
-#else
-    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
-#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
@@ -1132,11 +1119,7 @@ const ConvolveParam kArrayConvolve_sse2[] = {
 };
 #else
 const ConvolveFunctions convolve8_sse2(
-#if CONFIG_USE_X86INC
     vpx_convolve_copy_sse2, vpx_convolve_avg_sse2,
-#else
-    vpx_convolve_copy_c, vpx_convolve_avg_c,
-#endif  // CONFIG_USE_X86INC
     vpx_convolve8_horiz_sse2, vpx_convolve8_avg_horiz_sse2,
     vpx_convolve8_vert_sse2, vpx_convolve8_avg_vert_sse2,
     vpx_convolve8_sse2, vpx_convolve8_avg_sse2,
diff --git a/test/vp8cx_set_ref.sh b/test/cx_set_ref.sh
index 5d760bcde..0a58dc187 100755
--- a/test/vp8cx_set_ref.sh
+++ b/test/cx_set_ref.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 ##
-##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
 ##  that can be found in the LICENSE file in the root of the source
@@ -8,30 +8,27 @@
 ##  in the file PATENTS.  All contributing project authors may
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
-##  This file tests the libvpx vp8cx_set_ref example. To add new tests to this
+##  This file tests the libvpx cx_set_ref example. To add new tests to this
 ##  file, do the following:
 ##    1. Write a shell function (this is your test).
-##    2. Add the function to vp8cx_set_ref_tests (on a new line).
+##    2. Add the function to cx_set_ref_tests (on a new line).
 ##
 . $(dirname $0)/tools_common.sh
 
 # Environment check: $YUV_RAW_INPUT is required.
-vp8cx_set_ref_verify_environment() {
+cx_set_ref_verify_environment() {
   if [ ! -e "${YUV_RAW_INPUT}" ]; then
     echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
     return 1
   fi
 }
 
-# Runs vp8cx_set_ref and updates the reference frame before encoding frame 90.
-# $1 is the codec name, which vp8cx_set_ref does not support at present: It's
-# currently used only to name the output file.
-# TODO(tomfinegan): Pass the codec param once the example is updated to support
-# VP9.
+# Runs cx_set_ref and updates the reference frame before encoding frame 90.
+# $1 is the codec name.
 vpx_set_ref() {
-  local encoder="${LIBVPX_BIN_PATH}/vp8cx_set_ref${VPX_TEST_EXE_SUFFIX}"
   local codec="$1"
-  local output_file="${VPX_TEST_OUTPUT_DIR}/vp8cx_set_ref_${codec}.ivf"
+  local encoder="${LIBVPX_BIN_PATH}/${codec}cx_set_ref${VPX_TEST_EXE_SUFFIX}"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/${codec}cx_set_ref_${codec}.ivf"
   local ref_frame_num=90
 
   if [ ! -x "${encoder}" ]; then
@@ -46,12 +43,18 @@ vpx_set_ref() {
   [ -e "${output_file}" ] || return 1
 }
 
-vp8cx_set_ref_vp8() {
+cx_set_ref_vp8() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
     vpx_set_ref vp8 || return 1
   fi
 }
 
-vp8cx_set_ref_tests="vp8cx_set_ref_vp8"
+cx_set_ref_vp9() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_set_ref vp9 || return 1
+  fi
+}
+
+cx_set_ref_tests="cx_set_ref_vp8 cx_set_ref_vp9"
 
-run_tests vp8cx_set_ref_verify_environment "${vp8cx_set_ref_tests}"
+run_tests cx_set_ref_verify_environment "${cx_set_ref_tests}"
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index ddaf9395b..e6224b21a 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -25,20 +25,12 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/msvc.h"  // for round()
 
 using libvpx_test::ACMRandom;
 
 namespace {
 
-#ifdef _MSC_VER
-static int round(double x) {
-  if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
-  else
-    return static_cast<int>(floor(x + 0.5));
-}
-#endif
-
 const int kNumCoeffs = 256;
 const double C1 = 0.995184726672197;
 const double C2 = 0.98078528040323;
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 16d88255e..278d72dfa 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -25,18 +25,11 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/msvc.h"  // for round()
 
 using libvpx_test::ACMRandom;
 
 namespace {
-#ifdef _MSC_VER
-static int round(double x) {
-  if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
-  else
-    return static_cast<int>(floor(x + 0.5));
-}
-#endif
 
 const int kNumCoeffs = 1024;
 const double kPi = 3.141592653589793238462643383279502884;
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 735cccf8d..a24085606 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -487,7 +487,7 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4WHT,
     ::testing::Values(
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 29f215817..083ee6628 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -766,7 +766,7 @@ INSTANTIATE_TEST_CASE_P(
                    &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && \
+#if HAVE_SSSE3 && ARCH_X86_64 && \
     !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3, FwdTrans8x8DCT,
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 7a5bd5b4c..b8eec523f 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -152,10 +152,10 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_sse2));
 #endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+#if HAVE_SSSE3 && ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_ssse3));
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+#endif  // HAVE_SSSE3 && ARCH_X86_64
 
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index 7f9d751d6..04487c452 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -17,20 +17,12 @@
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/msvc.h"  // for round()
 
 using libvpx_test::ACMRandom;
 
 namespace {
 
-#ifdef _MSC_VER
-static int round(double x) {
-  if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
-  else
-    return static_cast<int>(floor(x + 0.5));
-}
-#endif
-
 void reference_dct_1d(double input[8], double output[8]) {
   const double kPi = 3.141592653589793238462643383279502884;
   const double kInvSqrt2 = 0.707106781186547524400844362104;
@@ -86,7 +78,7 @@ TEST(VP9Idct8x8Test, AccuracyCheck) {
 
     reference_dct_2d(input, output_r);
     for (int j = 0; j < 64; ++j)
-      coeff[j] = round(output_r[j]);
+      coeff[j] = static_cast<tran_low_t>(round(output_r[j]));
     vpx_idct8x8_64_add_c(coeff, dst, 8);
     for (int j = 0; j < 64; ++j) {
       const int diff = dst[j] - src[j];
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 6c824128b..1efb1a4eb 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -295,7 +295,7 @@ INSTANTIATE_TEST_CASE_P(
                    TX_4X4, 1)));
 #endif
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && \
+#if HAVE_SSSE3 && ARCH_X86_64 && \
     !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3_64, PartialIDctTest,
diff --git a/test/sad_test.cc b/test/sad_test.cc
index e6bd0d793..fa7b6c840 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -690,7 +690,6 @@ INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
 //------------------------------------------------------------------------------
 // x86 functions
 #if HAVE_SSE2
-#if CONFIG_USE_X86INC
 const SadMxNParam sse2_tests[] = {
   make_tuple(64, 64, &vpx_sad64x64_sse2, -1),
   make_tuple(64, 32, &vpx_sad64x32_sse2, -1),
@@ -852,7 +851,6 @@ const SadMxNx4Param x4d_sse2_tests[] = {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
-#endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE3
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 2acf744d5..8928bf87c 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -187,21 +187,21 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
                 vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
                 vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
 
-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
                 vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
                 vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
                 vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
                 NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
                 vpx_tm_predictor_4x4_sse2)
-#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC
+#if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
                 NULL, NULL, NULL, NULL,
                 vpx_d153_predictor_4x4_ssse3, NULL,
                 vpx_d63_predictor_4x4_ssse3, NULL)
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
 INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
@@ -237,20 +237,20 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
                 vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
                 vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)
 
-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
                 vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
                 vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
                 vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
                 NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
-#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC
+#if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
                 NULL, NULL, NULL, NULL,
                 vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
                 vpx_d63_predictor_8x8_ssse3, NULL)
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
 INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
@@ -286,22 +286,22 @@ INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
                 vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c,
                 vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c)
 
-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
                 vpx_dc_left_predictor_16x16_sse2,
                 vpx_dc_top_predictor_16x16_sse2,
                 vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
                 vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_16x16_sse2)
-#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC
+#if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL,
                 NULL, vpx_d45_predictor_16x16_ssse3,
                 NULL, NULL, vpx_d153_predictor_16x16_ssse3,
                 vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3,
                 NULL)
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
 INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL,
@@ -337,21 +337,21 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
                 vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
                 vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
 
-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
                 vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
                 NULL, vpx_tm_predictor_32x32_sse2)
-#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC
+#if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
                 NULL, vpx_d45_predictor_32x32_ssse3, NULL, NULL,
                 vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
                 vpx_d63_predictor_32x32_ssse3, NULL)
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_NEON
 INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
diff --git a/test/variance_test.cc b/test/variance_test.cc
index cb6339041..08c84a613 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1002,7 +1002,6 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(2, 3, &vpx_variance4x8_sse2, 0),
                       make_tuple(2, 2, &vpx_variance4x4_sse2, 0)));
 
-#if CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelVarianceTest,
     ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
@@ -1035,7 +1034,6 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
         make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
         make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
-#endif  // CONFIG_USE_X86INC
 
 #if CONFIG_VP9_HIGHBITDEPTH
 /* TODO(debargha): This test does not support the highbd version
@@ -1088,7 +1086,6 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(3, 4, &vpx_highbd_8_variance8x16_sse2, 8),
                       make_tuple(3, 3, &vpx_highbd_8_variance8x8_sse2, 8)));
 
-#if CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxHBDSubpelVarianceTest,
     ::testing::Values(
@@ -1162,12 +1159,10 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8),
         make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8),
         make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)));
-#endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-#if CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VpxSubpelVarianceTest,
     ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
@@ -1200,7 +1195,6 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
         make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
         make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0)));
-#endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
index 23a249e2b..341cc19cb 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -157,9 +157,9 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
       << "First failed at test case " << first_failure;
 }
 
+#if HAVE_SSE2 || HAVE_AVX
 using std::tr1::make_tuple;
 
-#if CONFIG_USE_X86INC
 int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
                                            const tran_low_t *dqcoeff,
                                            intptr_t block_size,
@@ -167,6 +167,7 @@ int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
   EXPECT_EQ(8, bps);
   return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
 }
+#endif  // HAVE_SSE2 || HAVE_AVX
 
 #if HAVE_SSE2
 int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
@@ -206,6 +207,5 @@ INSTANTIATE_TEST_CASE_P(
                    &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
 #endif  // HAVE_AVX
 
-#endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 416f3c322..ea8abfb77 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -131,7 +131,6 @@ using std::tr1::make_tuple;
 
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
-#if CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -225,7 +224,6 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
 
-#endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 }  // namespace
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index 3cad4d7e6..4793a9716 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -93,7 +93,7 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
 INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
                         ::testing::Values(vpx_subtract_block_c));
 
-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
                         ::testing::Values(vpx_subtract_block_sse2));
 #endif
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index 35ad891ef..cd7c24821 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -14,6 +14,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vp8_rtcd.h"
 #include "blockd.h"
+#include "reconintra4x4.h"
 
 typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 7da3d71ad..96d00cbf7 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1593,7 +1593,7 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
     if (Q < thresh_qp &&
         cpi->projected_frame_size > thresh_rate &&
         pred_err_mb > thresh_pred_err_mb) {
-      double new_correction_factor = cpi->rate_correction_factor;
+      double new_correction_factor;
       const int target_size = cpi->av_per_frame_bandwidth;
       int target_bits_per_mb;
       // Drop this frame: advance frame counters, and set force_maxqp flag.
diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c
index 7bab27d4f..88320584c 100644
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -66,7 +66,7 @@ void vp9_foreach_transformed_block_in_plane(
   for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
     // Skip visiting the sub blocks that are wholly within the UMV.
     for (c = 0; c < max_blocks_wide; c += (1 << tx_size)) {
-      visit(plane, i, plane_bsize, tx_size, arg);
+      visit(plane, i, r, c, plane_bsize, tx_size, arg);
       i += step;
     }
     i += extra_step;
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 3d26fb2b5..85b99c4bc 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -270,6 +270,7 @@ static INLINE const vpx_prob *get_y_mode_probs(const MODE_INFO *mi,
 }
 
 typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+                                                  int row, int col,
                                                   BLOCK_SIZE plane_bsize,
                                                   TX_SIZE tx_size,
                                                   void *arg);
@@ -283,17 +284,6 @@ void vp9_foreach_transformed_block(
     const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
     foreach_transformed_block_visitor visit, void *arg);
 
-static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
-                                            TX_SIZE tx_size, int block,
-                                            int *x, int *y) {
-  const int bwl = b_width_log2_lookup[plane_bsize];
-  const int tx_cols_log2 = bwl - tx_size;
-  const int tx_cols = 1 << tx_cols_log2;
-  const int raster_mb = block >> (tx_size << 1);
-  *x = (raster_mb & (tx_cols - 1)) << tx_size;
-  *y = (raster_mb >> tx_cols_log2) << tx_size;
-}
-
 void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
                       int aoff, int loff);
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index c04cc8f05..5dad81d64 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -618,15 +618,16 @@ int vp9_post_proc_frame(struct VP9Common *cm,
 
   // Alloc memory for prev_mip in the first frame.
   if (cm->current_video_frame == 1) {
-    cm->postproc_state.last_base_qindex = cm->base_qindex;
-    cm->postproc_state.last_frame_valid = 1;
+    ppstate->last_base_qindex = cm->base_qindex;
+    ppstate->last_frame_valid = 1;
+  }
+
+  if ((flags & VP9D_MFQE) && ppstate->prev_mip == NULL) {
     ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
     if (!ppstate->prev_mip) {
       return 1;
     }
     ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
-    memset(ppstate->prev_mip, 0,
-           cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
   }
 
   // Allocate post_proc_buffer_int if needed.
@@ -664,9 +665,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
                        "Failed to allocate post-processing buffer");
 
   if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
-      cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
-      cm->postproc_state.last_base_qindex <= last_q_thresh &&
-      cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
+      ppstate->last_frame_valid && cm->bit_depth == 8 &&
+      ppstate->last_base_qindex <= last_q_thresh &&
+      cm->base_qindex - ppstate->last_base_qindex >= q_diff_thresh) {
     vp9_mfqe(cm);
     // TODO(jackychen): Consider whether enable deblocking by default
     // if mfqe is enabled. Need to take both the quality and the speed
@@ -692,8 +693,8 @@ int vp9_post_proc_frame(struct VP9Common *cm,
     vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
   }
 
-  cm->postproc_state.last_base_qindex = cm->base_qindex;
-  cm->postproc_state.last_frame_valid = 1;
+  ppstate->last_base_qindex = cm->base_qindex;
+  ppstate->last_frame_valid = 1;
 
   if (flags & VP9D_ADDNOISE) {
     const int noise_level = ppflags->noise_level;
@@ -714,7 +715,8 @@ int vp9_post_proc_frame(struct VP9Common *cm,
   dest->uv_width = dest->y_width >> cm->subsampling_x;
   dest->uv_height = dest->y_height >> cm->subsampling_y;
 
-  swap_mi_and_prev_mi(cm);
+  if (flags & VP9D_MFQE)
+    swap_mi_and_prev_mi(cm);
   return 0;
 }
 #endif  // CONFIG_VP9_POSTPROC
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 84718e970..67fe18c7e 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -178,7 +178,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
       // Co-ordinate of containing block to pixel precision.
       const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
       const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-#if CONFIG_BETTER_HW_COMPATIBILITY
+#if 0  // CONFIG_BETTER_HW_COMPATIBILITY
       assert(xd->mi[0]->sb_type != BLOCK_4X8 &&
              xd->mi[0]->sb_type != BLOCK_8X4);
       assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) &&
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 846133674..276f14554 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -21,29 +21,6 @@ EOF
 }
 forward_decls qw/vp9_common_forward_decls/;
 
-# x86inc.asm had specific constraints. break it out so it's easy to disable.
-# zero all the variables to avoid tricky else conditions.
-$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
-  $avx2_x86inc = '';
-$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
-  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
-if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
-  $mmx_x86inc = 'mmx';
-  $sse_x86inc = 'sse';
-  $sse2_x86inc = 'sse2';
-  $ssse3_x86inc = 'ssse3';
-  $avx_x86inc = 'avx';
-  $avx2_x86inc = 'avx2';
-  if ($opts{arch} eq "x86_64") {
-    $mmx_x86_64_x86inc = 'mmx';
-    $sse_x86_64_x86inc = 'sse';
-    $sse2_x86_64_x86inc = 'sse2';
-    $ssse3_x86_64_x86inc = 'ssse3';
-    $avx_x86_64_x86inc = 'avx';
-    $avx2_x86_64_x86inc = 'avx2';
-  }
-}
-
 # functions that are 64 bit only.
 $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
 if ($opts{arch} eq "x86_64") {
@@ -202,10 +179,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_block_error/;
 
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";
+  specialize qw/vp9_highbd_block_error sse2/;
 
   add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc", "$avx_x86inc";
+  specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
 
   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp/;
@@ -217,16 +194,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_fdct8x8_quant/;
 } else {
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_block_error avx2 msa/, "$sse2_x86inc";
+  specialize qw/vp9_block_error avx2 msa sse2/;
 
   add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
-  specialize qw/vp9_block_error_fp neon/, "$sse2_x86inc";
+  specialize qw/vp9_block_error_fp neon sse2/;
 
   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64_x86inc";
+  specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
 
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
+  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
 
   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
@@ -245,7 +222,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_fht16x16 sse2/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4/, "$sse2_x86inc";
+  specialize qw/vp9_fwht4x4 sse2/;
 } else {
   add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp9_fht4x4 sse2 msa/;
@@ -257,7 +234,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_fht16x16 sse2 msa/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4 msa/, "$sse2_x86inc";
+  specialize qw/vp9_fwht4x4 msa sse2/;
 }
 
 #
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index d63912932..3199b98aa 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -589,7 +589,7 @@ static void dec_build_inter_predictors(VPxWorker *const worker, MACROBLOCKD *xd,
     // Co-ordinate of containing block to pixel precision.
     int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
     int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-#if CONFIG_BETTER_HW_COMPATIBILITY
+#if 0  // CONFIG_BETTER_HW_COMPATIBILITY
     assert(xd->mi[0]->sb_type != BLOCK_4X8 &&
            xd->mi[0]->sb_type != BLOCK_8X4);
     assert(mv_q4.row == mv->row * (1 << (1 - pd->subsampling_y)) &&
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index ffc6839ad..6869036bc 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -902,10 +902,10 @@ void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
       frame_mvs += cm->mi_cols;
     }
   }
-#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
-    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
-        (xd->above_mi == NULL || xd->left_mi == NULL) &&
-        !is_inter_block(mi) && need_top_left[mi->uv_mode])
-      assert(0);
+#if 0  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+  if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+      (xd->above_mi == NULL || xd->left_mi == NULL) &&
+      !is_inter_block(mi) && need_top_left[mi->uv_mode])
+    assert(0);
 #endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 }
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 935c04f3a..9ed980081 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -187,47 +187,45 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi,
 vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
                                       VP9_REFFRAME ref_frame_flag,
                                       YV12_BUFFER_CONFIG *sd) {
-  RefBuffer *ref_buf = NULL;
-  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  int idx;
+  YV12_BUFFER_CONFIG *ref_buf = NULL;
 
   // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
   // encoder is using the frame buffers for. This is just a stub to keep the
   // vpxenc --test-decode functionality working, and will be replaced in a
   // later commit that adds VP9-specific controls for this functionality.
+  // (Yunqing) The set_reference control depends on the following setting in
+  // encoder.
+  // cpi->lst_fb_idx = 0;
+  // cpi->gld_fb_idx = 1;
+  // cpi->alt_fb_idx = 2;
   if (ref_frame_flag == VP9_LAST_FLAG) {
-    ref_buf = &cm->frame_refs[0];
+    idx = cm->ref_frame_map[0];
   } else if (ref_frame_flag == VP9_GOLD_FLAG) {
-    ref_buf = &cm->frame_refs[1];
+    idx = cm->ref_frame_map[1];
   } else if (ref_frame_flag == VP9_ALT_FLAG) {
-    ref_buf = &cm->frame_refs[2];
+    idx = cm->ref_frame_map[2];
   } else {
     vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                        "Invalid reference frame");
     return cm->error.error_code;
   }
 
-  if (!equal_dimensions(ref_buf->buf, sd)) {
+  if (idx < 0 || idx >= FRAME_BUFFERS) {
     vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
-                       "Incorrect buffer dimensions");
-  } else {
-    int *ref_fb_ptr = &ref_buf->idx;
-
-    // Find an empty frame buffer.
-    const int free_fb = get_free_fb(cm);
-    if (cm->new_fb_idx == INVALID_IDX) {
-      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                         "Unable to find free frame buffer");
-      return cm->error.error_code;
-    }
+                       "Invalid reference frame map");
+    return cm->error.error_code;
+  }
 
-    // Decrease ref_count since it will be increased again in
-    // ref_cnt_fb() below.
-    --frame_bufs[free_fb].ref_count;
+  // Get the destination reference buffer.
+  ref_buf = &cm->buffer_pool->frame_bufs[idx].buf;
 
-    // Manage the reference counters and copy image.
-    ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
-    ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
-    vp8_yv12_copy_frame(sd, ref_buf->buf);
+  if (!equal_dimensions(ref_buf, sd)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  } else {
+    // Overwrite the reference frame buffer.
+    vp8_yv12_copy_frame(sd, ref_buf);
   }
 
   return cm->error.error_code;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 169943c10..05fbc4194 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -31,6 +31,9 @@ struct optimize_ctx {
   ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
 };
 
+#define HETEROMULT 12
+#define HETEROCOEF 4
+
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -67,6 +70,48 @@ static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] ={ {10, 6}, {8, 7}, };
   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
 }
 
+// This function eliminates isolated small nonzero coefficients.
+static void eliminate_small_coeff(const tran_low_t *const coeff_ptr,
+                                  const TX_SIZE tx_size,
+                                  const int16_t *const zbin_ptr,
+                                  tran_low_t *const qcoeff_ptr,
+                                  tran_low_t *const dqcoeff_ptr,
+                                  uint16_t *const eob_ptr,
+                                  const int16_t *const scan) {
+  const int zbins[2] =
+      {tx_size == TX_32X32 ? ROUND_POWER_OF_TWO(zbin_ptr[0], 1) : zbin_ptr[0],
+       tx_size == TX_32X32 ? ROUND_POWER_OF_TWO(zbin_ptr[1], 1) : zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  const int hetero_zbins[2] = {(HETEROCOEF + 1) * zbins[0] / HETEROCOEF,
+                               (HETEROCOEF + 1) * zbins[1] / HETEROCOEF};
+  const int hetero_nzbins[2] = {hetero_zbins[0] * -1, hetero_zbins[1] * -1};
+  int eob = *eob_ptr, i = eob - 1, rc, tail_count = 0;
+
+  assert(i >= 0);
+  rc = scan[i];
+  while (i > 0 && coeff_ptr[rc] <= hetero_zbins[rc != 0] &&
+      coeff_ptr[rc] >= hetero_nzbins[rc != 0]) {
+    if (coeff_ptr[rc] > zbins[rc != 0] || coeff_ptr[rc] < nzbins[rc != 0])
+      ++tail_count;
+    if ((eob - i) * HETEROMULT >= tail_count * zbins[1]) {
+      eob = i;
+      tail_count = 0;
+    }
+    --i;
+    rc = scan[i];
+  }
+
+  for (i = eob; i < (*eob_ptr); ++i) {
+    rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
+
+  while (eob > 0 && qcoeff_ptr[scan[eob - 1]] == 0) --eob;
+
+  *eob_ptr = eob;
+}
+
 // This function is a place holder for now but may ultimately need
 // to scan previous tokens to work out the correct context.
 static int trellis_get_coeff_context(const int16_t *scan,
@@ -335,7 +380,7 @@ static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
@@ -346,10 +391,8 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  int i, j;
   const int16_t *src_diff;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+  src_diff = &p->src_diff[4 * (row * diff_stride + col)];
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -425,7 +468,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
   }
 }
 
-void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
@@ -435,12 +478,8 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  int i, j;
   const int16_t *src_diff;
-
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
-
+  src_diff = &p->src_diff[4 * (row * diff_stride + col)];
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     switch (tx_size) {
@@ -506,7 +545,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
   }
 }
 
-void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
+void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
@@ -517,10 +556,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  int i, j;
   const int16_t *src_diff;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+  src_diff = &p->src_diff[4 * (row * diff_stride + col)];
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -593,9 +630,14 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
       assert(0);
       break;
   }
+  if (!x->skip_block && *eob > 0) {
+    eliminate_small_coeff(coeff, tx_size, p->zbin, qcoeff, dqcoeff, eob,
+                          scan_order->scan);
+  }
 }
 
-static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
+static void encode_block(int plane, int block, int row, int col,
+                         BLOCK_SIZE plane_bsize,
                          TX_SIZE tx_size, void *arg) {
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
@@ -604,13 +646,11 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  int i, j;
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
-  a = &ctx->ta[plane][i];
-  l = &ctx->tl[plane][j];
+  dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+  a = &ctx->ta[plane][col];
+  l = &ctx->tl[plane][row];
 
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
@@ -629,17 +669,17 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
         *a = *l = 0;
         return;
       } else {
-        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+        vp9_xform_quant_fp(x, plane, block, row, col, plane_bsize, tx_size);
       }
     } else {
       if (max_txsize_lookup[plane_bsize] == tx_size) {
         int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
         if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
           // full forward transform and quantization
-          vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+          vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
         } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
           // fast path forward transform and quantization
-          vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+          vp9_xform_quant_dc(x, plane, block, row, col, plane_bsize, tx_size);
         } else {
           // skip forward transform
           p->eobs[block] = 0;
@@ -647,7 +687,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
           return;
         }
       } else {
-        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+        vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
       }
     }
   }
@@ -715,19 +755,18 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
   }
 }
 
-static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
+static void encode_block_pass1(int plane, int block, int row, int col,
+                               BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
   MACROBLOCK *const x = (MACROBLOCK *)arg;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  int i, j;
   uint8_t *dst;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
+  dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
 
-  vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+  vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
 
   if (p->eobs[block] > 0) {
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -774,7 +813,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   }
 }
 
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+void vp9_encode_block_intra(int plane, int block, int row, int col,
+                            BLOCK_SIZE plane_bsize,
                             TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
@@ -795,18 +835,16 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   uint16_t *eob = &p->eobs[block];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  int i, j;
   struct optimize_ctx *const ctx = args->ctx;
   ENTROPY_CONTEXT *a = NULL;
   ENTROPY_CONTEXT *l = NULL;
   int entropy_ctx = 0;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * (j * dst_stride + i)];
-  src = &p->src.buf[4 * (j * src_stride + i)];
-  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+  dst = &pd->dst.buf[4 * (row * dst_stride + col)];
+  src = &p->src.buf[4 * (row * src_stride + col)];
+  src_diff = &p->src_diff[4 * (row * diff_stride + col)];
   if (args->ctx != NULL) {
-    a = &ctx->ta[plane][i];
-    l = &ctx->tl[plane][j];
+    a = &ctx->ta[plane][col];
+    l = &ctx->tl[plane][row];
     entropy_ctx = combine_entropy_contexts(*a, *l);
   }
 
@@ -826,7 +864,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
 
   vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
                           x->skip_encode ? src_stride : dst_stride,
-                          dst, dst_stride, i, j, plane);
+                          dst, dst_stride, col, row, plane);
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -926,6 +964,10 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                              p->quant, p->quant_shift, qcoeff, dqcoeff,
                              pd->dequant, eob, scan_order->scan,
                              scan_order->iscan);
+        if (!x->skip_block && *eob > 0) {
+          eliminate_small_coeff(coeff, tx_size, p->zbin, qcoeff, dqcoeff,
+                                eob, scan_order->scan);
+        }
       }
       if (args->ctx != NULL && !x->skip_recode) {
        *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -942,6 +984,10 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                        p->quant, p->quant_shift, qcoeff, dqcoeff,
                        pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
+        if (!x->skip_block && *eob > 0) {
+          eliminate_small_coeff(coeff, tx_size, p->zbin, qcoeff, dqcoeff,
+                                eob, scan_order->scan);
+        }
       }
       if (args->ctx != NULL && !x->skip_recode) {
         *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -958,6 +1004,10 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                        p->quant_shift, qcoeff, dqcoeff,
                        pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
+        if (!x->skip_block && *eob > 0) {
+          eliminate_small_coeff(coeff, tx_size, p->zbin, qcoeff, dqcoeff,
+                                eob, scan_order->scan);
+        }
       }
       if (args->ctx != NULL && !x->skip_recode) {
         *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -977,6 +1027,10 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                        p->quant_shift, qcoeff, dqcoeff,
                        pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
+        if (!x->skip_block && *eob > 0) {
+          eliminate_small_coeff(coeff, tx_size, p->zbin, qcoeff, dqcoeff,
+                                eob, scan_order->scan);
+        }
       }
       if (args->ctx != NULL && !x->skip_recode) {
         *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 25b0b23e0..62d75a129 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -25,16 +25,17 @@ struct encode_b_args {
 };
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
-void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
-void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
+void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+void vp9_encode_block_intra(int plane, int block, int row, int col,
+                            BLOCK_SIZE plane_bsize,
                             TX_SIZE tx_size, void *arg);
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 53a3ec7de..66ccc92c4 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -506,10 +506,10 @@ static int get_ul_intra_threshold(VP9_COMMON *cm) {
         ret_val = UL_INTRA_THRESH;
         break;
       case VPX_BITS_10:
-        ret_val = UL_INTRA_THRESH >> 2;
+        ret_val = UL_INTRA_THRESH << 2;
         break;
       case VPX_BITS_12:
-        ret_val = UL_INTRA_THRESH >> 4;
+        ret_val = UL_INTRA_THRESH << 4;
         break;
       default:
         assert(0 && "cm->bit_depth should be VPX_BITS_8, "
@@ -532,10 +532,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) {
         ret_val = SMOOTH_INTRA_THRESH;
         break;
       case VPX_BITS_10:
-        ret_val = SMOOTH_INTRA_THRESH >> 2;
+        ret_val = SMOOTH_INTRA_THRESH << 4;
         break;
       case VPX_BITS_12:
-        ret_val = SMOOTH_INTRA_THRESH >> 4;
+        ret_val = SMOOTH_INTRA_THRESH << 8;
         break;
       default:
         assert(0 && "cm->bit_depth should be VPX_BITS_8, "
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 7db6ef2b0..5ccadd005 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -932,7 +932,8 @@ struct estimate_block_intra_args {
   RD_COST *rdc;
 };
 
-static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+static void estimate_block_intra(int plane, int block, int row, int col,
+                                 BLOCK_SIZE plane_bsize,
                                  TX_SIZE tx_size, void *arg) {
   struct estimate_block_intra_args* const args = arg;
   VP9_COMP *const cpi = args->cpi;
@@ -945,20 +946,19 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   uint8_t *const dst_buf_base = pd->dst.buf;
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  int i, j;
   RD_COST this_rdc;
 
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  (void)block;
 
-  p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
-  pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
+  p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
+  pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
   // Use source buffer as an approximation for the fully reconstructed buffer.
   vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize],
                           tx_size, args->mode,
                           x->skip_encode ? p->src.buf : pd->dst.buf,
                           x->skip_encode ? src_stride : dst_stride,
                           pd->dst.buf, dst_stride,
-                          i, j, plane);
+                          col, row, plane);
 
   if (plane == 0) {
     int64_t this_sse = INT64_MAX;
@@ -1744,7 +1744,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_VP9_HIGHBITDEPTH
       const int large_block = bsize > BLOCK_32X32;
 #else
-      const int large_block = bsize >= BLOCK_32X32;
+      const int large_block =
+          x->sb_is_skin ? bsize > BLOCK_32X32 : bsize >= BLOCK_32X32;
 #endif
       mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 6e4ffee92..28530386c 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -498,18 +498,16 @@ static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
   }
 }
 
-static int rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
+static int rate_block(int plane, int block, int row, int col,
                       TX_SIZE tx_size, struct rdcost_block_args* args) {
-  int x_idx, y_idx;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
-
-  return cost_coeffs(args->x, plane, block, args->t_above + x_idx,
-                     args->t_left + y_idx, tx_size,
+  return cost_coeffs(args->x, plane, block, args->t_above + col,
+                     args->t_left + row, tx_size,
                      args->so->scan, args->so->neighbors,
                      args->use_fast_coef_costing);
 }
 
-static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
+static void block_rd_txfm(int plane, int block, int row, int col,
+                          BLOCK_SIZE plane_bsize,
                           TX_SIZE tx_size, void *arg) {
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
@@ -525,20 +523,20 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
 
   if (!is_inter_block(mi)) {
     struct encode_b_args arg = {x, NULL, &mi->skip};
-    vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
+    vp9_encode_block_intra(plane, block, row, col, plane_bsize, tx_size, &arg);
     dist_block(x, plane, block, tx_size, &dist, &sse);
   } else if (max_txsize_lookup[plane_bsize] == tx_size) {
     if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
         SKIP_TXFM_NONE) {
       // full forward transform and quantization
-      vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+      vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
       dist_block(x, plane, block, tx_size, &dist, &sse);
     } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
                SKIP_TXFM_AC_ONLY) {
       // compute DC coefficient
       tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
       tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
-      vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+      vp9_xform_quant_dc(x, plane, block, row, col, plane_bsize, tx_size);
       sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
       dist = sse;
       if (x->plane[plane].eobs[block]) {
@@ -562,7 +560,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
     }
   } else {
     // full forward transform and quantization
-    vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
     dist_block(x, plane, block, tx_size, &dist, &sse);
   }
 
@@ -572,7 +570,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
     return;
   }
 
-  rate = rate_block(plane, block, plane_bsize, tx_size, args);
+  rate = rate_block(plane, block, row, col, tx_size, args);
   rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
   rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
 
@@ -2465,9 +2463,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (pred_filter_search) {
     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
-    if (xd->above_mi)
+    if (xd->above_mi && is_inter_block(xd->above_mi))
       af = xd->above_mi->interp_filter;
-    if (xd->left_mi)
+    if (xd->left_mi && is_inter_block(xd->left_mi))
       lf = xd->left_mi->interp_filter;
 
     if ((this_mode != NEWMV) || (af == lf))
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index edec755dd..4400da42d 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -319,7 +319,8 @@ struct tokenize_b_args {
   TOKENEXTRA **tp;
 };
 
-static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
+static void set_entropy_context_b(int plane, int block, int row, int col,
+                                  BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
   ThreadData *const td = args->td;
@@ -327,10 +328,8 @@ static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  int aoff, loff;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
   vp9_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0,
-                   aoff, loff);
+                   col, row);
 }
 
 static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
@@ -353,7 +352,8 @@ static INLINE void add_token_no_extra(TOKENEXTRA **t,
   ++counts[token];
 }
 
-static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
+static void tokenize_b(int plane, int block, int row, int col,
+                       BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
   VP9_COMP *cpi = args->cpi;
@@ -384,11 +384,8 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   const int tx_eob = 16 << (tx_size << 1);
   int16_t token;
   EXTRABIT extra;
-  int aoff, loff;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
-
-  pt = get_entropy_context(tx_size, pd->above_context + aoff,
-                           pd->left_context + loff);
+  pt = get_entropy_context(tx_size, pd->above_context + col,
+                           pd->left_context + row);
   so = get_scan(xd, tx_size, type, block);
   scan = so->scan;
   nb = so->neighbors;
@@ -426,20 +423,23 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
 
   *tp = t;
 
-  vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff);
+  vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, col, row);
 }
 
 struct is_skippable_args {
   uint16_t *eobs;
   int *skippable;
 };
-static void is_skippable(int plane, int block,
+
+static void is_skippable(int plane, int block, int row, int col,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                          void *argv) {
   struct is_skippable_args *args = argv;
   (void)plane;
   (void)plane_bsize;
   (void)tx_size;
+  (void)row;
+  (void)col;
   args->skippable[0] &= (!args->eobs[block]);
 }
 
@@ -453,14 +453,15 @@ int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   return result;
 }
 
-static void has_high_freq_coeff(int plane, int block,
+static void has_high_freq_coeff(int plane, int block, int row, int col,
                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                 void *argv) {
   struct is_skippable_args *args = argv;
   int eobs = (tx_size == TX_4X4) ? 3 : 10;
   (void) plane;
   (void) plane_bsize;
-
+  (void) row;
+  (void) col;
   *(args->skippable) |= (args->eobs[block] > eobs);
 }
 
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index b09eac0d1..1a1d4eabc 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -9,11 +9,6 @@
  */
 
 #include <assert.h>
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-// Need to include math.h before calling tmmintrin.h/intrin.h
-// in certain versions of MSVS.
-#include <math.h>
-#endif
 #include <tmmintrin.h>  // SSSE3
 
 #include "./vp9_rtcd.h"
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 38af3b13a..23325d63b 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -8,11 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-// Need to include math.h before calling tmmintrin.h/intrin.h
-// in certain versions of MSVS.
-#include <math.h>
-#endif
 #include <tmmintrin.h>  // SSSE3
 
 #include "./vp9_rtcd.h"
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 9ad86cbf8..51c6fbb02 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -105,19 +105,6 @@ struct vpx_codec_alg_priv {
   BufferPool              *buffer_pool;
 };
 
-static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
-  switch (frame) {
-    case VP8_LAST_FRAME:
-      return VP9_LAST_FLAG;
-    case VP8_GOLD_FRAME:
-      return VP9_GOLD_FLAG;
-    case VP8_ALTR_FRAME:
-      return VP9_ALT_FLAG;
-  }
-  assert(0 && "Invalid Reference Frame");
-  return VP9_LAST_FLAG;
-}
-
 static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
     const struct vpx_internal_error_info *error) {
   const vpx_codec_err_t res = error->error_code;
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 6531e2c61..08adea026 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -785,7 +785,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
     return vp9_set_reference_dec(&frame_worker_data->pbi->common,
-                                 (VP9_REFFRAME)frame->frame_type, &sd);
+                                 ref_frame_to_vp9_reframe(frame->frame_type),
+                                 &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
   }
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index 938d4224b..44a5e8157 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -133,4 +133,16 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   return VPX_CODEC_OK;
 }
 
+static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+  switch (frame) {
+    case VP8_LAST_FRAME:
+      return VP9_LAST_FLAG;
+    case VP8_GOLD_FRAME:
+      return VP9_GOLD_FLAG;
+    case VP8_ALTR_FRAME:
+      return VP9_ALT_FLAG;
+  }
+  assert(0 && "Invalid Reference Frame");
+  return VP9_LAST_FLAG;
+}
 #endif  // VP9_VP9_IFACE_COMMON_H_
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 5f3de8f8a..b8342b9e1 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -101,7 +101,6 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
 
-ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
@@ -109,13 +108,10 @@ VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
 else
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 endif
-endif
 
 ifeq ($(ARCH_X86_64),yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
 endif
-endif
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h
index 829c9d132..2945c87ca 100644
--- a/vpx/vpx_integer.h
+++ b/vpx/vpx_integer.h
@@ -24,7 +24,7 @@
 #define VPX_INLINE inline
 #endif
 
-#if (defined(_MSC_VER) && (_MSC_VER < 1600)) || defined(VPX_EMULATE_INTTYPES)
+#if defined(VPX_EMULATE_INTTYPES)
 typedef signed char  int8_t;
 typedef signed short int16_t;
 typedef signed int   int32_t;
@@ -33,16 +33,6 @@ typedef unsigned char  uint8_t;
 typedef unsigned short uint16_t;
 typedef unsigned int   uint32_t;
 
-#if (defined(_MSC_VER) && (_MSC_VER < 1600))
-typedef signed __int64   int64_t;
-typedef unsigned __int64 uint64_t;
-#define INT64_MAX _I64_MAX
-#define INT32_MAX _I32_MAX
-#define INT32_MIN _I32_MIN
-#define INT16_MAX _I16_MAX
-#define INT16_MIN _I16_MIN
-#endif
-
 #ifndef _UINTPTR_T_DEFINED
 typedef size_t uintptr_t;
 #endif
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 018126d4b..93855c5ad 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -40,18 +40,14 @@ endif
 # intra predictions
 DSP_SRCS-yes += intrapred.c
 
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
-endif  # CONFIG_USE_X86INC
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
-endif  # CONFIG_USE_X86INC
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
 ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
@@ -87,9 +83,8 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_8t_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_bilinear_sse2.asm
 endif
-ifeq ($(CONFIG_USE_X86INC),yes)
+
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_convolve_copy_sse2.asm
-endif
 
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
@@ -179,10 +174,8 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
 ifeq ($(ARCH_X86_64),yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
 endif
-endif
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
 DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
@@ -197,12 +190,10 @@ DSP_SRCS-yes            += inv_txfm.h
 DSP_SRCS-yes            += inv_txfm.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
 ifeq ($(ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3_x86_64.asm
 endif  # ARCH_X86_64
-endif  # CONFIG_USE_X86INC
 
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes  += arm/save_reg_neon$(ASM)
@@ -254,11 +245,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
 endif
 ifeq ($(ARCH_X86_64),yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
 DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
 endif
-endif
 
 # avg
 DSP_SRCS-yes           += avg.c
@@ -267,10 +256,8 @@ DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
 DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
 DSP_SRCS-$(HAVE_NEON)  += arm/hadamard_neon.c
 ifeq ($(ARCH_X86_64),yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
 endif
-endif
 
 endif  # CONFIG_VP9_ENCODER
 
@@ -292,7 +279,6 @@ DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
 
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
@@ -303,7 +289,6 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
 endif  # CONFIG_VP9_HIGHBITDEPTH
-endif  # CONFIG_USE_X86INC
 
 endif  # CONFIG_ENCODERS
 
@@ -334,17 +319,13 @@ ifeq ($(ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/ssim_opt_x86_64.asm
 endif  # ARCH_X86_64
 
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE)    += x86/subpel_variance_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
-endif  # CONFIG_USE_X86INC
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
-endif  # CONFIG_USE_X86INC
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 37239a195..7b61415b6 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -11,29 +11,6 @@ EOF
 }
 forward_decls qw/vpx_dsp_forward_decls/;
 
-# x86inc.asm had specific constraints. break it out so it's easy to disable.
-# zero all the variables to avoid tricky else conditions.
-$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
-  $avx2_x86inc = '';
-$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
-  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
-if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
-  $mmx_x86inc = 'mmx';
-  $sse_x86inc = 'sse';
-  $sse2_x86inc = 'sse2';
-  $ssse3_x86inc = 'ssse3';
-  $avx_x86inc = 'avx';
-  $avx2_x86inc = 'avx2';
-  if ($opts{arch} eq "x86_64") {
-    $mmx_x86_64_x86inc = 'mmx';
-    $sse_x86_64_x86inc = 'sse';
-    $sse2_x86_64_x86inc = 'sse2';
-    $ssse3_x86_64_x86inc = 'ssse3';
-    $avx_x86_64_x86inc = 'avx';
-    $avx2_x86_64_x86inc = 'avx2';
-  }
-}
-
 # optimizations which depend on multiple features
 $avx2_ssse3 = '';
 if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
@@ -55,19 +32,19 @@ if ($opts{arch} eq "x86_64") {
 #
 
 add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_4x4/, "$sse2_x86inc";
+specialize qw/vpx_d207_predictor_4x4 sse2/;
 
 add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207e_predictor_4x4/;
 
 add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_4x4 neon/, "$sse2_x86inc";
+specialize qw/vpx_d45_predictor_4x4 neon sse2/;
 
 add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_4x4/;
 
 add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vpx_d63_predictor_4x4 ssse3/;
 
 add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63e_predictor_4x4/;
@@ -76,7 +53,7 @@ add_proto qw/void vpx_d63f_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vpx_d63f_predictor_4x4/;
 
 add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_he_predictor_4x4/;
@@ -88,49 +65,49 @@ add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vpx_d135_predictor_4x4 neon/;
 
 add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vpx_d153_predictor_4x4 ssse3/;
 
 add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_v_predictor_4x4 neon msa sse2/;
 
 add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_ve_predictor_4x4/;
 
 add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse2_x86inc";
+specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/;
 
 add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse2_x86inc";
+specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/;
 
 add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse2_x86inc";
+specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/;
 
 add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse2_x86inc";
+specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/;
 
 add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_8x8/, "$ssse3_x86inc";
+specialize qw/vpx_d207_predictor_8x8 ssse3/;
 
 add_proto qw/void vpx_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207e_predictor_8x8/;
 
 add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_8x8 neon/, "$sse2_x86inc";
+specialize qw/vpx_d45_predictor_8x8 neon sse2/;
 
 add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_8x8/;
 
 add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_8x8/, "$ssse3_x86inc";
+specialize qw/vpx_d63_predictor_8x8 ssse3/;
 
 add_proto qw/void vpx_d63e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63e_predictor_8x8/;
 
 add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_8x8/;
@@ -139,46 +116,46 @@ add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vpx_d135_predictor_8x8/;
 
 add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_8x8/, "$ssse3_x86inc";
+specialize qw/vpx_d153_predictor_8x8 ssse3/;
 
 add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_8x8 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_v_predictor_8x8 neon msa sse2/;
 
 add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/;
 
 add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_8x8 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;
 
 add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_8x8 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/;
 
 add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_8x8 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/;
 
 add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_16x16/, "$ssse3_x86inc";
+specialize qw/vpx_d207_predictor_16x16 ssse3/;
 
 add_proto qw/void vpx_d207e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207e_predictor_16x16/;
 
 add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_16x16 neon/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_16x16 neon ssse3/;
 
 add_proto qw/void vpx_d45e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_16x16/;
 
 add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_16x16/, "$ssse3_x86inc";
+specialize qw/vpx_d63_predictor_16x16 ssse3/;
 
 add_proto qw/void vpx_d63e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63e_predictor_16x16/;
 
 add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_16x16 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_16x16/;
@@ -187,46 +164,46 @@ add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride,
 specialize qw/vpx_d135_predictor_16x16/;
 
 add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_16x16/, "$ssse3_x86inc";
+specialize qw/vpx_d153_predictor_16x16 ssse3/;
 
 add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_v_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_tm_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2/;
 
 add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_32x32/, "$ssse3_x86inc";
+specialize qw/vpx_d207_predictor_32x32 ssse3/;
 
 add_proto qw/void vpx_d207e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207e_predictor_32x32/;
 
 add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_32x32/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_32x32 ssse3/;
 
 add_proto qw/void vpx_d45e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_32x32/;
 
 add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_32x32/, "$ssse3_x86inc";
+specialize qw/vpx_d63_predictor_32x32 ssse3/;
 
 add_proto qw/void vpx_d63e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63e_predictor_32x32/;
 
 add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_32x32 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_h_predictor_32x32 neon msa sse2/;
 
 add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_32x32/;
@@ -235,25 +212,25 @@ add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride,
 specialize qw/vpx_d135_predictor_32x32/;
 
 add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_32x32/, "$ssse3_x86inc";
+specialize qw/vpx_d153_predictor_32x32 ssse3/;
 
 add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_v_predictor_32x32 neon msa sse2/;
 
 add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_tm_predictor_32x32 neon msa sse2/;
 
 add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
+specialize qw/vpx_dc_predictor_32x32 msa neon sse2/;
 
 add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";
+specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2/;
 
 add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";
+specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2/;
 
 add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
+specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2/;
 
 # High bitdepth functions
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -288,13 +265,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d153_predictor_4x4/;
 
   add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_v_predictor_4x4 sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_4x4 sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_4x4 sse2/;
 
   add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_4x4/;
@@ -336,13 +313,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d153_predictor_8x8/;
 
   add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_8x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_v_predictor_8x8 sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_8x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_8x8 sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_8x8/, "$sse2_x86inc";;
+  specialize qw/vpx_highbd_dc_predictor_8x8 sse2/;;
 
   add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_8x8/;
@@ -384,13 +361,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d153_predictor_16x16/;
 
   add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_v_predictor_16x16 sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_16x16 sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_16x16 sse2/;
 
   add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_16x16/;
@@ -432,13 +409,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d153_predictor_32x32/;
 
   add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_v_predictor_32x32 sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_32x32 sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_32x32 sse2/;
 
   add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_32x32/;
@@ -454,10 +431,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 # Sub Pixel Filters
 #
 add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/vpx_convolve_copy neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/vpx_convolve_avg neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
@@ -500,10 +477,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Sub Pixel Filters
   #
   add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_convolve_copy sse2/;
 
   add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_convolve_avg sse2/;
 
   add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
@@ -674,7 +651,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_fdct4x4_1 sse2/;
 
   add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64_x86inc";
+  specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
@@ -706,7 +683,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_iwht4x4_1_add/;
 
   add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_iwht4x4_16_add/, "$sse2_x86inc";
+  specialize qw/vpx_iwht4x4_16_add sse2/;
 
   add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
   specialize qw/vpx_highbd_idct4x4_1_add/;
@@ -792,10 +769,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct4x4_1_add sse2/;
 
     add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct8x8_1_add sse2/;
@@ -810,15 +787,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct16x16_1_add sse2/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64";
     # Need to add 135 eob idct32x32 implementations.
     $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
 
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1_add sse2/;
@@ -893,10 +870,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/;
 
     add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/;
@@ -908,10 +885,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
     # Need to add 135 eob idct32x32 implementations.
     $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
     $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon;
@@ -919,7 +896,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
 
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
     # Need to add 34 eob idct32x32 neon implementation.
     $vpx_idct32x32_34_add_neon=vpx_idct32x32_1024_add_neon;
 
@@ -930,7 +907,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_iwht4x4_1_add msa/;
 
     add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_iwht4x4_16_add msa/, "$sse2_x86inc";
+    specialize qw/vpx_iwht4x4_16_add msa sse2/;
   }  # CONFIG_EMULATE_HARDWARE
 }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9
@@ -940,10 +917,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
+  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
 
   add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
+  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
@@ -959,49 +936,49 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 # Block subtraction
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+specialize qw/vpx_subtract_block neon msa sse2/;
 
 #
 # Single block SAD
 #
 add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 avx2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x64 avx2 neon msa sse2/;
 
 add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x32 avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x32 avx2 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x64 avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64 avx2 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x32 avx2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x32 avx2 neon msa sse2/;
 
 add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x16 avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16 avx2 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 media neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16 media neon msa sse2/;
 
 add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8 neon msa sse2/;
 
 add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16 neon msa sse2/;
 
 add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8 neon msa sse2/;
 
 add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4 neon msa sse2/;
 
 #
 # Avg
@@ -1017,7 +994,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   specialize qw/vpx_minmax_8x8 sse2 neon/;
 
   add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64_x86inc";
+  specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
   specialize qw/vpx_hadamard_16x16 sse2 neon/;
@@ -1036,43 +1013,43 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 }  # CONFIG_VP9_ENCODER
 
 add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x64_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x32_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x32_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x64_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x32_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x32_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x16_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x32_avg msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32_avg msa sse2/;
 
 add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x16_avg msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16_avg msa sse2/;
 
 add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x8_avg msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8_avg msa sse2/;
 
 add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x16_avg msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16_avg msa sse2/;
 
 add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x8_avg msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8_avg msa sse2/;
 
 add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4_avg msa sse2/;
 
 add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8_avg msa sse2/;
 
 add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4_avg msa sse2/;
 
 #
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
@@ -1131,43 +1108,43 @@ specialize qw/vpx_sad4x4x8 sse4_1 msa/;
 # Multi-block SAD, comparing a reference to N independent blocks
 #
 add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x4d avx2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x64x4d avx2 neon msa sse2/;
 
 add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x32x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x32x4d msa sse2/;
 
 add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x64x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64x4d msa sse2/;
 
 add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x4d avx2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x32x4d avx2 neon msa sse2/;
 
 add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x16x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16x4d msa sse2/;
 
 add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x32x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32x4d msa sse2/;
 
 add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x4d neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16x4d neon msa sse2/;
 
 add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8x4d msa sse2/;
 
 add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16x4d msa sse2/;
 
 add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8x4d msa sse2/;
 
 add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4x4d msa sse2/;
 
 add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8x4d msa sse2/;
 
 add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4x4d msa sse2/;
 
 #
 # Structured Similarity (SSIM)
@@ -1191,37 +1168,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Single block SAD
   #
   add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad64x64 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad64x32 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad32x64 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad32x32 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad32x16 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad16x32 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad16x16 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad16x8 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad8x16 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad8x8 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad8x4 sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
   specialize qw/vpx_highbd_sad4x8/;
@@ -1240,37 +1217,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_minmax_8x8/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad64x64_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x32_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad64x32_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x64_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad32x64_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x32_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad32x32_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x16_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad32x16_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x32_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad16x32_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x16_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad16x16_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x8_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad16x8_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x16_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad8x16_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x8_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad8x8_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x4_avg/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad8x4_avg sse2/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
   specialize qw/vpx_highbd_sad4x8_avg/;
@@ -1335,43 +1312,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Multi-block SAD, comparing a reference to N independent blocks
   #
   add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad64x64x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x32x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad64x32x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x64x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad32x64x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad32x32x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x16x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad32x16x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x32x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad16x32x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad16x16x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad16x8x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad8x16x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad8x8x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x4x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad8x4x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x8x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad4x8x4d sse2/;
 
   add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_sad4x4x4d sse2/;
 
   #
   # Structured Similarity (SSIM)
@@ -1460,82 +1437,82 @@ add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred,
 # Subpixel Variance
 #
 add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance64x32 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance32x64 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance32x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance16x32 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x16 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance16x16 media neon msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance16x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance8x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x8 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance8x8 media neon msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance8x4 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance4x4 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
 
 #
 # Specialty Subpixel
@@ -1691,217 +1668,217 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Subpixel Variance
   #
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index 14d029c9a..0d62adf8b 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -143,24 +143,28 @@ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
     const uint8_t *src8, int src_stride, \
     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
   highbd_10_variance_sse2( \
       src, src_stride, ref, ref_stride, w, h, sse, &sum, \
       vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-  return *sse - (((int64_t)sum * sum) >> shift); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 } \
 \
 uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
     const uint8_t *src8, int src_stride, \
     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
   highbd_12_variance_sse2( \
       src, src_stride, ref, ref_stride, w, h, sse, &sum, \
       vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-  return *sse - (((int64_t)sum * sum) >> shift); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 }
 
 VAR_FN(64, 64, 16, 12);
@@ -242,7 +246,6 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
   return *sse;
 }
 
-#if CONFIG_USE_X86INC
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in
 // highbd_subpel_variance_impl_sse2.asm
@@ -589,4 +592,3 @@ FNS(sse2);
 
 #undef FNS
 #undef FN
-#endif  // CONFIG_USE_X86INC
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 6987c2e24..92dc752c0 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -308,7 +308,6 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
   return *sse;
 }
 
-#if CONFIG_USE_X86INC
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in subpel_variance.asm
 #define DECL(w, opt) \
@@ -474,4 +473,3 @@ FNS(ssse3, ssse3);
 
 #undef FNS
 #undef FN
-#endif  // CONFIG_USE_X86INC
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index b71867853..cbd22dcd0 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -8,10 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-// Due to a header conflict between math.h and intrinsics includes with ceil()
-// in certain configurations under vs9 this include needs to precede
-// immintrin.h.
-
 #include <immintrin.h>
 
 #include "./vpx_dsp_rtcd.h"
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 6fd52087c..2bb0f4a24 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -8,10 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-// Due to a header conflict between math.h and intrinsics includes with ceil()
-// in certain configurations under vs9 this include needs to precede
-// tmmintrin.h.
-
 #include <tmmintrin.h>
 
 #include "./vpx_dsp_rtcd.h"
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index 9cc4f9d7d..c1a6f23ab 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -23,11 +23,7 @@ pw_64:    times 8 dw 64
 ; z = signed SAT(x + y)
 
 SECTION .text
-%if ARCH_X86_64
-  %define LOCAL_VARS_SIZE 16*4
-%else
-  %define LOCAL_VARS_SIZE 16*6
-%endif
+%define LOCAL_VARS_SIZE 16*6
 
 %macro SETUP_LOCAL_VARS 0
     ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
@@ -54,11 +50,11 @@ SECTION .text
     mova       k6k7, m3
 %if ARCH_X86_64
     %define     krd  m12
-    %define     tmp  m13
+    %define    tmp0  [rsp + 16*4]
+    %define    tmp1  [rsp + 16*5]
     mova        krd, [GLOBAL(pw_64)]
 %else
-    %define     tmp  [rsp + 16*4]
-    %define     krd  [rsp + 16*5]
+    %define     krd  [rsp + 16*4]
 %if CONFIG_PIC=0
     mova         m6, [GLOBAL(pw_64)]
 %else
@@ -71,50 +67,31 @@ SECTION .text
 %endif
 %endm
 
-%macro HORIZx4_ROW 2
-    mova      %2, %1
-    punpcklbw %1, %1
-    punpckhbw %2, %2
-
-    mova      m3, %2
-    palignr   %2, %1, 1
-    palignr   m3, %1, 5
-
-    pmaddubsw %2, k0k1k4k5
-    pmaddubsw m3, k2k3k6k7
-    mova      m4, %2        ;k0k1
-    mova      m5, m3        ;k2k3
-    psrldq    %2, 8         ;k4k5
-    psrldq    m3, 8         ;k6k7
-    paddsw    %2, m4
-    paddsw    m5, m3
-    paddsw    %2, m5
-    paddsw    %2, krd
-    psraw     %2, 7
-    packuswb  %2, %2
-%endm
-
 ;-------------------------------------------------------------------------------
+%if ARCH_X86_64
+  %define LOCAL_VARS_SIZE_H4 0
+%else
+  %define LOCAL_VARS_SIZE_H4 16*4
+%endif
+
 %macro SUBPIX_HFILTER4 1
-cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
+cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
                             src, sstride, dst, dstride, height, filter
     mova                m4, [filterq]
     packsswb            m4, m4
 %if ARCH_X86_64
-    %define       k0k1k4k5 m8
-    %define       k2k3k6k7 m9
-    %define            krd m10
-    %define    orig_height r7d
+    %define       k0k1k4k5  m8
+    %define       k2k3k6k7  m9
+    %define            krd  m10
     mova               krd, [GLOBAL(pw_64)]
     pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
     pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
     pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
     pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
 %else
-    %define       k0k1k4k5 [rsp + 16*0]
-    %define       k2k3k6k7 [rsp + 16*1]
-    %define            krd [rsp + 16*2]
-    %define    orig_height [rsp + 16*3]
+    %define       k0k1k4k5  [rsp + 16*0]
+    %define       k2k3k6k7  [rsp + 16*1]
+    %define            krd  [rsp + 16*2]
     pshuflw             m6, m4, 0b              ;k0_k1
     pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
     pshuflw             m7, m4, 01010101b       ;k2_k3
@@ -131,61 +108,46 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
     mova          k2k3k6k7, m7
     mova               krd, m1
 %endif
-    mov        orig_height, heightd
-    shr            heightd, 1
+    dec            heightd
+
 .loop:
     ;Do two rows at once
-    movh                m0, [srcq - 3]
-    movh                m1, [srcq + 5]
-    punpcklqdq          m0, m1
-    mova                m1, m0
-    movh                m2, [srcq + sstrideq - 3]
-    movh                m3, [srcq + sstrideq + 5]
-    punpcklqdq          m2, m3
-    mova                m3, m2
-    punpcklbw           m0, m0
-    punpckhbw           m1, m1
-    punpcklbw           m2, m2
-    punpckhbw           m3, m3
-    mova                m4, m1
-    palignr             m4, m0,  1
-    pmaddubsw           m4, k0k1k4k5
-    palignr             m1, m0,  5
+    movu                m4, [srcq - 3]
+    movu                m5, [srcq + sstrideq - 3]
+    punpckhbw           m1, m4, m4
+    punpcklbw           m4, m4
+    punpckhbw           m3, m5, m5
+    punpcklbw           m5, m5
+    palignr             m0, m1, m4, 1
+    pmaddubsw           m0, k0k1k4k5
+    palignr             m1, m4, 5
     pmaddubsw           m1, k2k3k6k7
-    mova                m7, m3
-    palignr             m7, m2,  1
-    pmaddubsw           m7, k0k1k4k5
-    palignr             m3, m2,  5
+    palignr             m2, m3, m5, 1
+    pmaddubsw           m2, k0k1k4k5
+    palignr             m3, m5, 5
     pmaddubsw           m3, k2k3k6k7
-    mova                m0, m4                  ;k0k1
-    mova                m5, m1                  ;k2k3
-    mova                m2, m7                  ;k0k1 upper
-    psrldq              m4, 8                   ;k4k5
-    psrldq              m1, 8                   ;k6k7
-    paddsw              m4, m0
-    paddsw              m5, m1
-    mova                m1, m3                  ;k2k3 upper
-    psrldq              m7, 8                   ;k4k5 upper
-    psrldq              m3, 8                   ;k6k7 upper
-    paddsw              m7, m2
-    paddsw              m4, m5
-    paddsw              m1, m3
-    paddsw              m7, m1
-    paddsw              m4, krd
-    psraw               m4, 7
-    packuswb            m4, m4
-    paddsw              m7, krd
-    psraw               m7, 7
-    packuswb            m7, m7
+    punpckhqdq          m4, m0, m2
+    punpcklqdq          m0, m2
+    punpckhqdq          m5, m1, m3
+    punpcklqdq          m1, m3
+    paddsw              m0, m4
+    paddsw              m1, m5
+%ifidn %1, h8_avg
+    movd                m4, [dstq]
+    movd                m5, [dstq + dstrideq]
+%endif
+    paddsw              m0, m1
+    paddsw              m0, krd
+    psraw               m0, 7
+    packuswb            m0, m0
+    psrldq              m1, m0, 4
 
 %ifidn %1, h8_avg
-    movd                m0, [dstq]
-    pavgb               m4, m0
-    movd                m2, [dstq + dstrideq]
-    pavgb               m7, m2
+    pavgb               m0, m4
+    pavgb               m1, m5
 %endif
-    movd            [dstq], m4
-    movd [dstq + dstrideq], m7
+    movd            [dstq], m0
+    movd [dstq + dstrideq], m1
 
     lea               srcq, [srcq + sstrideq        ]
     prefetcht0              [srcq + 4 * sstrideq - 3]
@@ -193,205 +155,156 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
     lea               dstq, [dstq + 2 * dstrideq    ]
     prefetcht0              [srcq + 2 * sstrideq - 3]
 
-    dec            heightd
-    jnz              .loop
+    sub            heightd, 2
+    jg               .loop
 
     ; Do last row if output_height is odd
-    mov            heightd, orig_height
-    and            heightd, 1
-    je               .done
-
-    movh                m0, [srcq - 3]    ; load src
-    movh                m1, [srcq + 5]
-    punpcklqdq          m0, m1
-
-    HORIZx4_ROW         m0, m1
+    jne              .done
+
+    movu                m4, [srcq - 3]
+    punpckhbw           m1, m4, m4
+    punpcklbw           m4, m4
+    palignr             m0, m1, m4, 1
+    palignr             m1, m4, 5
+    pmaddubsw           m0, k0k1k4k5
+    pmaddubsw           m1, k2k3k6k7
+    psrldq              m2, m0, 8
+    psrldq              m3, m1, 8
+    paddsw              m0, m2
+    paddsw              m1, m3
+    paddsw              m0, m1
+    paddsw              m0, krd
+    psraw               m0, 7
+    packuswb            m0, m0
 %ifidn %1, h8_avg
-    movd                m0, [dstq]
-    pavgb               m1, m0
+    movd                m4, [dstq]
+    pavgb               m0, m4
 %endif
-    movd            [dstq], m1
+    movd            [dstq], m0
 .done:
-    RET
-%endm
-
-%macro HORIZx8_ROW 5
-    mova        %2, %1
-    punpcklbw   %1, %1
-    punpckhbw   %2, %2
-
-    mova        %3, %2
-    mova        %4, %2
-    mova        %5, %2
-
-    palignr     %2, %1, 1
-    palignr     %3, %1, 5
-    palignr     %4, %1, 9
-    palignr     %5, %1, 13
-
-    pmaddubsw   %2, k0k1
-    pmaddubsw   %3, k2k3
-    pmaddubsw   %4, k4k5
-    pmaddubsw   %5, k6k7
-    paddsw      %2, %4
-    paddsw      %5, %3
-    paddsw      %2, %5
-    paddsw      %2, krd
-    psraw       %2, 7
-    packuswb    %2, %2
-    SWAP        %1, %2
+    REP_RET
 %endm
 
 ;-------------------------------------------------------------------------------
 %macro SUBPIX_HFILTER8 1
-cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
     mova                 m4, [filterq]
     SETUP_LOCAL_VARS
-%if ARCH_X86_64
-    %define     orig_height r7d
-%else
-    %define     orig_height heightmp
-%endif
-    mov         orig_height, heightd
-    shr             heightd, 1
+    dec             heightd
 
 .loop:
-    movh                 m0, [srcq - 3]
-    movh                 m3, [srcq + 5]
-    movh                 m4, [srcq + sstrideq - 3]
-    movh                 m7, [srcq + sstrideq + 5]
-    punpcklqdq           m0, m3
-    mova                 m1, m0
+    ;Do two rows at once
+    movu                 m0, [srcq - 3]
+    movu                 m4, [srcq + sstrideq - 3]
+    punpckhbw            m1, m0, m0
     punpcklbw            m0, m0
-    punpckhbw            m1, m1
-    mova                 m5, m1
-    palignr              m5, m0, 13
+    palignr              m5, m1, m0, 13
     pmaddubsw            m5, k6k7
-    mova                 m2, m1
-    mova                 m3, m1
+    palignr              m2, m1, m0, 5
+    palignr              m3, m1, m0, 9
     palignr              m1, m0, 1
     pmaddubsw            m1, k0k1
-    punpcklqdq           m4, m7
-    mova                 m6, m4
+    punpckhbw            m6, m4, m4
     punpcklbw            m4, m4
-    palignr              m2, m0, 5
-    punpckhbw            m6, m6
-    palignr              m3, m0, 9
-    mova                 m7, m6
     pmaddubsw            m2, k2k3
     pmaddubsw            m3, k4k5
 
-    palignr              m7, m4, 13
-    mova                 m0, m6
-    palignr              m0, m4, 5
+    palignr              m7, m6, m4, 13
+    palignr              m0, m6, m4, 5
     pmaddubsw            m7, k6k7
     paddsw               m1, m3
     paddsw               m2, m5
     paddsw               m1, m2
-    mova                 m5, m6
+%ifidn %1, h8_avg
+    movh                 m2, [dstq]
+    movhps               m2, [dstq + dstrideq]
+%endif
+    palignr              m5, m6, m4, 9
     palignr              m6, m4, 1
     pmaddubsw            m0, k2k3
     pmaddubsw            m6, k0k1
-    palignr              m5, m4, 9
     paddsw               m1, krd
     pmaddubsw            m5, k4k5
     psraw                m1, 7
     paddsw               m0, m7
-%ifidn %1, h8_avg
-    movh                 m7, [dstq]
-    movh                 m2, [dstq + dstrideq]
-%endif
-    packuswb             m1, m1
     paddsw               m6, m5
     paddsw               m6, m0
     paddsw               m6, krd
     psraw                m6, 7
-    packuswb             m6, m6
+    packuswb             m1, m6
 %ifidn %1, h8_avg
-    pavgb                m1, m7
-    pavgb                m6, m2
+    pavgb                m1, m2
 %endif
-    movh             [dstq], m1
-    movh  [dstq + dstrideq], m6
+    movh              [dstq], m1
+    movhps [dstq + dstrideq], m1
 
     lea                srcq, [srcq + sstrideq        ]
     prefetcht0               [srcq + 4 * sstrideq - 3]
     lea                srcq, [srcq + sstrideq        ]
     lea                dstq, [dstq + 2 * dstrideq    ]
     prefetcht0               [srcq + 2 * sstrideq - 3]
-    dec             heightd
-    jnz             .loop
-
-    ;Do last row if output_height is odd
-    mov             heightd, orig_height
-    and             heightd, 1
-    je                .done
+    sub             heightd, 2
+    jg                .loop
 
-    movh                 m0, [srcq - 3]
-    movh                 m3, [srcq + 5]
-    punpcklqdq           m0, m3
-
-    HORIZx8_ROW          m0, m1, m2, m3, m4
+    ; Do last row if output_height is odd
+    jne               .done
 
+    movu                 m0, [srcq - 3]
+    punpckhbw            m3, m0, m0
+    punpcklbw            m0, m0
+    palignr              m1, m3, m0, 1
+    palignr              m2, m3, m0, 5
+    palignr              m4, m3, m0, 13
+    palignr              m3, m0, 9
+    pmaddubsw            m1, k0k1
+    pmaddubsw            m2, k2k3
+    pmaddubsw            m3, k4k5
+    pmaddubsw            m4, k6k7
+    paddsw               m1, m3
+    paddsw               m4, m2
+    paddsw               m1, m4
+    paddsw               m1, krd
+    psraw                m1, 7
+    packuswb             m1, m1
 %ifidn %1, h8_avg
-    movh                 m1, [dstq]
-    pavgb                m0, m1
+    movh                 m0, [dstq]
+    pavgb                m1, m0
 %endif
-    movh             [dstq], m0
+    movh             [dstq], m1
 .done:
-    RET
+    REP_RET
 %endm
 
 ;-------------------------------------------------------------------------------
 %macro SUBPIX_HFILTER16 1
-cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                              src, sstride, dst, dstride, height, filter
     mova          m4, [filterq]
     SETUP_LOCAL_VARS
+
 .loop:
     prefetcht0        [srcq + 2 * sstrideq -3]
 
-    movh          m0, [srcq -  3]
-    movh          m4, [srcq +  5]
-    movh          m6, [srcq + 13]
-    punpcklqdq    m0, m4
-    mova          m7, m0
-    punpckhbw     m0, m0
-    mova          m1, m0
-    punpcklqdq    m4, m6
-    mova          m3, m0
-    punpcklbw     m7, m7
-
-    palignr       m3, m7, 13
-    mova          m2, m0
-    pmaddubsw     m3, k6k7
-    palignr       m0, m7, 1
+    movu          m0, [srcq - 3]
+    movu          m4, [srcq - 2]
     pmaddubsw     m0, k0k1
-    palignr       m1, m7, 5
-    pmaddubsw     m1, k2k3
-    palignr       m2, m7, 9
-    pmaddubsw     m2, k4k5
-    paddsw        m1, m3
-    mova          m3, m4
-    punpckhbw     m4, m4
-    mova          m5, m4
-    punpcklbw     m3, m3
-    mova          m7, m4
-    palignr       m5, m3, 5
-    mova          m6, m4
-    palignr       m4, m3, 1
     pmaddubsw     m4, k0k1
+    movu          m1, [srcq - 1]
+    movu          m5, [srcq + 0]
+    pmaddubsw     m1, k2k3
     pmaddubsw     m5, k2k3
-    palignr       m6, m3, 9
+    movu          m2, [srcq + 1]
+    movu          m6, [srcq + 2]
+    pmaddubsw     m2, k4k5
     pmaddubsw     m6, k4k5
-    palignr       m7, m3, 13
+    movu          m3, [srcq + 3]
+    movu          m7, [srcq + 4]
+    pmaddubsw     m3, k6k7
     pmaddubsw     m7, k6k7
     paddsw        m0, m2
+    paddsw        m1, m3
     paddsw        m0, m1
-%ifidn %1, h8_avg
-    mova          m1, [dstq]
-%endif
     paddsw        m4, m6
     paddsw        m5, m7
     paddsw        m4, m5
@@ -399,16 +312,18 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
     paddsw        m4, krd
     psraw         m0, 7
     psraw         m4, 7
-    packuswb      m0, m4
+    packuswb      m0, m0
+    packuswb      m4, m4
+    punpcklbw     m0, m4
 %ifidn %1, h8_avg
-    pavgb         m0, m1
+    pavgb         m0, [dstq]
 %endif
     lea         srcq, [srcq + sstrideq]
     mova      [dstq], m0
     lea         dstq, [dstq + dstrideq]
     dec      heightd
     jnz        .loop
-    RET
+    REP_RET
 %endm
 
 INIT_XMM ssse3
@@ -420,204 +335,463 @@ SUBPIX_HFILTER4  h8
 SUBPIX_HFILTER4  h8_avg
 
 ;-------------------------------------------------------------------------------
+
+; TODO(Linfeng): Detect cpu type and choose the code with better performance.
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
+
+%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+    %define NUM_GENERAL_REG_USED 9
+%else
+    %define NUM_GENERAL_REG_USED 6
+%endif
+
 %macro SUBPIX_VFILTER 2
-cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
                              src, sstride, dst, dstride, height, filter
     mova          m4, [filterq]
     SETUP_LOCAL_VARS
-%if ARCH_X86_64
-    %define      src1q r7
-    %define  sstride6q r8
-    %define dst_stride dstrideq
+
+%ifidn %2, 8
+    %define                movx  movh
 %else
-    %define      src1q filterq
-    %define  sstride6q dstrideq
-    %define dst_stride dstridemp
+    %define                movx  movd
 %endif
-    mov       src1q, srcq
-    add       src1q, sstrideq
-    lea   sstride6q, [sstrideq + sstrideq * 4]
-    add   sstride6q, sstrideq                   ;pitch * 6
 
-%ifidn %2, 8
-    %define movx movh
+    dec                 heightd
+
+%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if ARCH_X86_64
+    %define               src1q  r7
+    %define           sstride6q  r8
+    %define          dst_stride  dstrideq
 %else
-    %define movx movd
+    %define               src1q  filterq
+    %define           sstride6q  dstrideq
+    %define          dst_stride  dstridemp
 %endif
+    mov                   src1q, srcq
+    add                   src1q, sstrideq
+    lea               sstride6q, [sstrideq + sstrideq * 4]
+    add               sstride6q, sstrideq                   ;pitch * 6
+
 .loop:
-    movx         m0, [srcq                ]     ;A
-    movx         m1, [srcq + sstrideq     ]     ;B
-    punpcklbw    m0, m1                         ;A B
-    movx         m2, [srcq + sstrideq * 2 ]     ;C
-    pmaddubsw    m0, k0k1
-    mova         m6, m2
-    movx         m3, [src1q + sstrideq * 2]     ;D
-    punpcklbw    m2, m3                         ;C D
-    pmaddubsw    m2, k2k3
-    movx         m4, [srcq + sstrideq * 4 ]     ;E
-    mova         m7, m4
-    movx         m5, [src1q + sstrideq * 4]     ;F
-    punpcklbw    m4, m5                         ;E F
-    pmaddubsw    m4, k4k5
-    punpcklbw    m1, m6                         ;A B next iter
-    movx         m6, [srcq + sstride6q    ]     ;G
-    punpcklbw    m5, m6                         ;E F next iter
-    punpcklbw    m3, m7                         ;C D next iter
-    pmaddubsw    m5, k4k5
-    movx         m7, [src1q + sstride6q   ]     ;H
-    punpcklbw    m6, m7                         ;G H
-    pmaddubsw    m6, k6k7
-    pmaddubsw    m3, k2k3
-    pmaddubsw    m1, k0k1
-    paddsw       m0, m4
-    paddsw       m2, m6
-    movx         m6, [srcq + sstrideq * 8 ]     ;H next iter
-    punpcklbw    m7, m6
-    pmaddubsw    m7, k6k7
-    paddsw       m0, m2
-    paddsw       m0, krd
-    psraw        m0, 7
-    paddsw       m1, m5
-    packuswb     m0, m0
-
-    paddsw       m3, m7
-    paddsw       m1, m3
-    paddsw       m1, krd
-    psraw        m1, 7
-    lea        srcq, [srcq + sstrideq * 2 ]
-    lea       src1q, [src1q + sstrideq * 2]
-    packuswb     m1, m1
+    ;Do two rows at once
+    movx                     m0, [srcq                ]     ;A
+    movx                     m1, [src1q               ]     ;B
+    punpcklbw                m0, m1                         ;A B
+    movx                     m2, [srcq + sstrideq * 2 ]     ;C
+    pmaddubsw                m0, k0k1
+    mova                     m6, m2
+    movx                     m3, [src1q + sstrideq * 2]     ;D
+    punpcklbw                m2, m3                         ;C D
+    pmaddubsw                m2, k2k3
+    movx                     m4, [srcq + sstrideq * 4 ]     ;E
+    mova                     m7, m4
+    movx                     m5, [src1q + sstrideq * 4]     ;F
+    punpcklbw                m4, m5                         ;E F
+    pmaddubsw                m4, k4k5
+    punpcklbw                m1, m6                         ;A B next iter
+    movx                     m6, [srcq + sstride6q    ]     ;G
+    punpcklbw                m5, m6                         ;E F next iter
+    punpcklbw                m3, m7                         ;C D next iter
+    pmaddubsw                m5, k4k5
+    movx                     m7, [src1q + sstride6q   ]     ;H
+    punpcklbw                m6, m7                         ;G H
+    pmaddubsw                m6, k6k7
+    pmaddubsw                m3, k2k3
+    pmaddubsw                m1, k0k1
+    paddsw                   m0, m4
+    paddsw                   m2, m6
+    movx                     m6, [srcq + sstrideq * 8 ]     ;H next iter
+    punpcklbw                m7, m6
+    pmaddubsw                m7, k6k7
+    paddsw                   m0, m2
+    paddsw                   m0, krd
+    psraw                    m0, 7
+    paddsw                   m1, m5
+    packuswb                 m0, m0
+
+    paddsw                   m3, m7
+    paddsw                   m1, m3
+    paddsw                   m1, krd
+    psraw                    m1, 7
+    lea                    srcq, [srcq + sstrideq * 2 ]
+    lea                   src1q, [src1q + sstrideq * 2]
+    packuswb                 m1, m1
+
+%ifidn %1, v8_avg
+    movx                     m2, [dstq]
+    pavgb                    m0, m2
+%endif
+    movx                 [dstq], m0
+    add                    dstq, dst_stride
+%ifidn %1, v8_avg
+    movx                     m3, [dstq]
+    pavgb                    m1, m3
+%endif
+    movx                 [dstq], m1
+    add                    dstq, dst_stride
+    sub                 heightd, 2
+    jg                    .loop
 
+    ; Do last row if output_height is odd
+    jne                   .done
+
+    movx                     m0, [srcq                ]     ;A
+    movx                     m1, [srcq + sstrideq     ]     ;B
+    movx                     m6, [srcq + sstride6q    ]     ;G
+    punpcklbw                m0, m1                         ;A B
+    movx                     m7, [src1q + sstride6q   ]     ;H
+    pmaddubsw                m0, k0k1
+    movx                     m2, [srcq + sstrideq * 2 ]     ;C
+    punpcklbw                m6, m7                         ;G H
+    movx                     m3, [src1q + sstrideq * 2]     ;D
+    pmaddubsw                m6, k6k7
+    movx                     m4, [srcq + sstrideq * 4 ]     ;E
+    punpcklbw                m2, m3                         ;C D
+    movx                     m5, [src1q + sstrideq * 4]     ;F
+    punpcklbw                m4, m5                         ;E F
+    pmaddubsw                m2, k2k3
+    pmaddubsw                m4, k4k5
+    paddsw                   m2, m6
+    paddsw                   m0, m4
+    paddsw                   m0, m2
+    paddsw                   m0, krd
+    psraw                    m0, 7
+    packuswb                 m0, m0
 %ifidn %1, v8_avg
-    movx         m2, [dstq]
-    pavgb        m0, m2
+    movx                     m1, [dstq]
+    pavgb                    m0, m1
 %endif
-    movx     [dstq], m0
-    add        dstq, dst_stride
+    movx                 [dstq], m0
+
+%else
+    ; ARCH_X86_64
+
+    movx                     m0, [srcq                ]     ;A
+    movx                     m1, [srcq + sstrideq     ]     ;B
+    lea                    srcq, [srcq + sstrideq * 2 ]
+    movx                     m2, [srcq]                     ;C
+    movx                     m3, [srcq + sstrideq]          ;D
+    lea                    srcq, [srcq + sstrideq * 2 ]
+    movx                     m4, [srcq]                     ;E
+    movx                     m5, [srcq + sstrideq]          ;F
+    lea                    srcq, [srcq + sstrideq * 2 ]
+    movx                     m6, [srcq]                     ;G
+    punpcklbw                m0, m1                         ;A B
+    punpcklbw                m1, m2                         ;A B next iter
+    punpcklbw                m2, m3                         ;C D
+    punpcklbw                m3, m4                         ;C D next iter
+    punpcklbw                m4, m5                         ;E F
+    punpcklbw                m5, m6                         ;E F next iter
+
+.loop:
+    ;Do two rows at once
+    movx                     m7, [srcq + sstrideq]          ;H
+    lea                    srcq, [srcq + sstrideq * 2 ]
+    movx                    m14, [srcq]                     ;H next iter
+    punpcklbw                m6, m7                         ;G H
+    punpcklbw                m7, m14                        ;G H next iter
+    pmaddubsw                m8, m0, k0k1
+    pmaddubsw                m9, m1, k0k1
+    mova                     m0, m2
+    mova                     m1, m3
+    pmaddubsw               m10, m2, k2k3
+    pmaddubsw               m11, m3, k2k3
+    mova                     m2, m4
+    mova                     m3, m5
+    pmaddubsw                m4, k4k5
+    pmaddubsw                m5, k4k5
+    paddsw                   m8, m4
+    paddsw                   m9, m5
+    mova                     m4, m6
+    mova                     m5, m7
+    pmaddubsw                m6, k6k7
+    pmaddubsw                m7, k6k7
+    paddsw                  m10, m6
+    paddsw                  m11, m7
+    paddsw                   m8, m10
+    paddsw                   m9, m11
+    mova                     m6, m14
+    paddsw                   m8, krd
+    paddsw                   m9, krd
+    psraw                    m8, 7
+    psraw                    m9, 7
+%ifidn %2, 4
+    packuswb                 m8, m8
+    packuswb                 m9, m9
+%else
+    packuswb                 m8, m9
+%endif
+
 %ifidn %1, v8_avg
-    movx         m3, [dstq]
-    pavgb        m1, m3
+    movx                     m7, [dstq]
+%ifidn %2, 4
+    movx                    m10, [dstq + dstrideq]
+    pavgb                    m9, m10
+%else
+    movhpd                   m7, [dstq + dstrideq]
+%endif
+    pavgb                    m8, m7
 %endif
-    movx     [dstq], m1
-    add        dstq, dst_stride
-    sub     heightd, 2
-    cmp     heightd, 1
-    jg        .loop
-
-    cmp     heightd, 0
-    je        .done
-
-    movx         m0, [srcq                ]     ;A
-    movx         m1, [srcq + sstrideq     ]     ;B
-    movx         m6, [srcq + sstride6q    ]     ;G
-    punpcklbw    m0, m1                         ;A B
-    movx         m7, [src1q + sstride6q   ]     ;H
-    pmaddubsw    m0, k0k1
-    movx         m2, [srcq + sstrideq * 2 ]     ;C
-    punpcklbw    m6, m7                         ;G H
-    movx         m3, [src1q + sstrideq * 2]     ;D
-    pmaddubsw    m6, k6k7
-    movx         m4, [srcq + sstrideq * 4 ]     ;E
-    punpcklbw    m2, m3                         ;C D
-    movx         m5, [src1q + sstrideq * 4]     ;F
-    punpcklbw    m4, m5                         ;E F
-    pmaddubsw    m2, k2k3
-    pmaddubsw    m4, k4k5
-    paddsw       m2, m6
-    paddsw       m0, m4
-    paddsw       m0, m2
-    paddsw       m0, krd
-    psraw        m0, 7
-    packuswb     m0, m0
+    movx                 [dstq], m8
+%ifidn %2, 4
+    movx      [dstq + dstrideq], m9
+%else
+    movhpd    [dstq + dstrideq], m8
+%endif
+
+    lea                    dstq, [dstq + dstrideq * 2 ]
+    sub                 heightd, 2
+    jg                    .loop
+
+    ; Do last row if output_height is odd
+    jne                   .done
+
+    movx                     m7, [srcq + sstrideq]          ;H
+    punpcklbw                m6, m7                         ;G H
+    pmaddubsw                m0, k0k1
+    pmaddubsw                m2, k2k3
+    pmaddubsw                m4, k4k5
+    pmaddubsw                m6, k6k7
+    paddsw                   m0, m4
+    paddsw                   m2, m6
+    paddsw                   m0, m2
+    paddsw                   m0, krd
+    psraw                    m0, 7
+    packuswb                 m0, m0
 %ifidn %1, v8_avg
-    movx         m1, [dstq]
-    pavgb        m0, m1
+    movx                     m1, [dstq]
+    pavgb                    m0, m1
 %endif
-    movx     [dstq], m0
+    movx                 [dstq], m0
+
+%endif ; ARCH_X86_64
+
 .done:
-    RET
+    REP_RET
+
 %endm
 
 ;-------------------------------------------------------------------------------
 %macro SUBPIX_VFILTER16 1
-cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
                              src, sstride, dst, dstride, height, filter
-    mova          m4, [filterq]
+    mova                     m4, [filterq]
     SETUP_LOCAL_VARS
+
+%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
 %if ARCH_X86_64
-    %define      src1q r7
-    %define  sstride6q r8
-    %define dst_stride dstrideq
+    %define               src1q  r7
+    %define           sstride6q  r8
+    %define          dst_stride  dstrideq
 %else
-    %define      src1q filterq
-    %define  sstride6q dstrideq
-    %define dst_stride dstridemp
+    %define               src1q  filterq
+    %define           sstride6q  dstrideq
+    %define          dst_stride  dstridemp
 %endif
-    mov        src1q, srcq
-    add        src1q, sstrideq
-    lea    sstride6q, [sstrideq + sstrideq * 4]
-    add    sstride6q, sstrideq                   ;pitch * 6
+    lea                   src1q, [srcq + sstrideq]
+    lea               sstride6q, [sstrideq + sstrideq * 4]
+    add               sstride6q, sstrideq                   ;pitch * 6
 
 .loop:
-    movh          m0, [srcq                ]     ;A
-    movh          m1, [srcq + sstrideq     ]     ;B
-    movh          m2, [srcq + sstrideq * 2 ]     ;C
-    movh          m3, [src1q + sstrideq * 2]     ;D
-    movh          m4, [srcq + sstrideq * 4 ]     ;E
-    movh          m5, [src1q + sstrideq * 4]     ;F
-
-    punpcklbw     m0, m1                         ;A B
-    movh          m6, [srcq + sstride6q]         ;G
-    punpcklbw     m2, m3                         ;C D
-    movh          m7, [src1q + sstride6q]        ;H
-    punpcklbw     m4, m5                         ;E F
-    pmaddubsw     m0, k0k1
-    movh          m3, [srcq + 8]                 ;A
-    pmaddubsw     m2, k2k3
-    punpcklbw     m6, m7                         ;G H
-    movh          m5, [srcq + sstrideq + 8]      ;B
-    pmaddubsw     m4, k4k5
-    punpcklbw     m3, m5                         ;A B
-    movh          m7, [srcq + sstrideq * 2 + 8]  ;C
-    pmaddubsw     m6, k6k7
-    movh          m5, [src1q + sstrideq * 2 + 8] ;D
-    punpcklbw     m7, m5                         ;C D
-    paddsw        m2, m6
-    pmaddubsw     m3, k0k1
-    movh          m1, [srcq + sstrideq * 4 + 8]  ;E
-    paddsw        m0, m4
-    pmaddubsw     m7, k2k3
-    movh          m6, [src1q + sstrideq * 4 + 8] ;F
-    punpcklbw     m1, m6                         ;E F
-    paddsw        m0, m2
-    paddsw        m0, krd
-    movh          m2, [srcq + sstride6q + 8]     ;G
-    pmaddubsw     m1, k4k5
-    movh          m5, [src1q + sstride6q + 8]    ;H
-    psraw         m0, 7
-    punpcklbw     m2, m5                         ;G H
-    pmaddubsw     m2, k6k7
+    movh                     m0, [srcq                ]     ;A
+    movh                     m1, [src1q               ]     ;B
+    movh                     m2, [srcq + sstrideq * 2 ]     ;C
+    movh                     m3, [src1q + sstrideq * 2]     ;D
+    movh                     m4, [srcq + sstrideq * 4 ]     ;E
+    movh                     m5, [src1q + sstrideq * 4]     ;F
+
+    punpcklbw                m0, m1                         ;A B
+    movh                     m6, [srcq + sstride6q]         ;G
+    punpcklbw                m2, m3                         ;C D
+    movh                     m7, [src1q + sstride6q]        ;H
+    punpcklbw                m4, m5                         ;E F
+    pmaddubsw                m0, k0k1
+    movh                     m3, [srcq + 8]                 ;A
+    pmaddubsw                m2, k2k3
+    punpcklbw                m6, m7                         ;G H
+    movh                     m5, [srcq + sstrideq + 8]      ;B
+    pmaddubsw                m4, k4k5
+    punpcklbw                m3, m5                         ;A B
+    movh                     m7, [srcq + sstrideq * 2 + 8]  ;C
+    pmaddubsw                m6, k6k7
+    movh                     m5, [src1q + sstrideq * 2 + 8] ;D
+    punpcklbw                m7, m5                         ;C D
+    paddsw                   m2, m6
+    pmaddubsw                m3, k0k1
+    movh                     m1, [srcq + sstrideq * 4 + 8]  ;E
+    paddsw                   m0, m4
+    pmaddubsw                m7, k2k3
+    movh                     m6, [src1q + sstrideq * 4 + 8] ;F
+    punpcklbw                m1, m6                         ;E F
+    paddsw                   m0, m2
+    paddsw                   m0, krd
+    movh                     m2, [srcq + sstride6q + 8]     ;G
+    pmaddubsw                m1, k4k5
+    movh                     m5, [src1q + sstride6q + 8]    ;H
+    psraw                    m0, 7
+    punpcklbw                m2, m5                         ;G H
+    pmaddubsw                m2, k6k7
+    paddsw                   m7, m2
+    paddsw                   m3, m1
+    paddsw                   m3, m7
+    paddsw                   m3, krd
+    psraw                    m3, 7
+    packuswb                 m0, m3
+
+    add                    srcq, sstrideq
+    add                   src1q, sstrideq
+%ifidn %1, v8_avg
+    pavgb                    m0, [dstq]
+%endif
+    mova                 [dstq], m0
+    add                    dstq, dst_stride
+    dec                 heightd
+    jnz                   .loop
+    REP_RET
+
+%else
+    ; ARCH_X86_64
+    dec                 heightd
+
+    movu                     m1, [srcq                ]     ;A
+    movu                     m3, [srcq + sstrideq     ]     ;B
+    lea                    srcq, [srcq + sstrideq * 2]
+    punpcklbw                m0, m1, m3                     ;A B
+    punpckhbw                m1, m3                         ;A B
+    movu                     m5, [srcq]                     ;C
+    punpcklbw                m2, m3, m5                     ;A B next iter
+    punpckhbw                m3, m5                         ;A B next iter
+    mova                   tmp0, m2                         ;store to stack
+    mova                   tmp1, m3                         ;store to stack
+    movu                     m7, [srcq + sstrideq]          ;D
+    lea                    srcq, [srcq + sstrideq * 2]
+    punpcklbw                m4, m5, m7                     ;C D
+    punpckhbw                m5, m7                         ;C D
+    movu                     m9, [srcq]                     ;E
+    punpcklbw                m6, m7, m9                     ;C D next iter
+    punpckhbw                m7, m9                         ;C D next iter
+    movu                    m11, [srcq + sstrideq]          ;F
+    lea                    srcq, [srcq + sstrideq * 2]
+    punpcklbw                m8, m9, m11                    ;E F
+    punpckhbw                m9, m11                        ;E F
+    movu                     m2, [srcq]                     ;G
+    punpcklbw               m10, m11, m2                    ;E F next iter
+    punpckhbw               m11, m2                         ;E F next iter
+
+.loop:
+    ;Do two rows at once
+    pmaddubsw               m13, m0, k0k1
+    mova                     m0, m4
+    pmaddubsw               m14, m8, k4k5
+    pmaddubsw               m15, m4, k2k3
+    mova                     m4, m8
+    paddsw                  m13, m14
+    movu                     m3, [srcq + sstrideq]          ;H
+    lea                    srcq, [srcq + sstrideq * 2]
+    punpcklbw               m14, m2, m3                     ;G H
+    mova                     m8, m14
+    pmaddubsw               m14, k6k7
+    paddsw                  m15, m14
+    paddsw                  m13, m15
+    paddsw                  m13, krd
+    psraw                   m13, 7
+
+    pmaddubsw               m14, m1, k0k1
+    pmaddubsw                m1, m9, k4k5
+    pmaddubsw               m15, m5, k2k3
+    paddsw                  m14, m1
+    mova                     m1, m5
+    mova                     m5, m9
+    punpckhbw                m2, m3                         ;G H
+    mova                     m9, m2
+    pmaddubsw                m2, k6k7
+    paddsw                  m15, m2
+    paddsw                  m14, m15
+    paddsw                  m14, krd
+    psraw                   m14, 7
+    packuswb                m13, m14
 %ifidn %1, v8_avg
-    mova          m4, [dstq]
+    pavgb                   m13, [dstq]
 %endif
-    movh      [dstq], m0
-    paddsw        m7, m2
-    paddsw        m3, m1
-    paddsw        m3, m7
-    paddsw        m3, krd
-    psraw         m3, 7
-    packuswb      m0, m3
-
-    add         srcq, sstrideq
-    add        src1q, sstrideq
+    mova                 [dstq], m13
+
+    ; next iter
+    pmaddubsw               m15, tmp0, k0k1
+    pmaddubsw               m14, m10, k4k5
+    pmaddubsw               m13, m6, k2k3
+    paddsw                  m15, m14
+    mova                   tmp0, m6
+    mova                     m6, m10
+    movu                     m2, [srcq]                     ;G next iter
+    punpcklbw               m14, m3, m2                     ;G H next iter
+    mova                    m10, m14
+    pmaddubsw               m14, k6k7
+    paddsw                  m13, m14
+    paddsw                  m15, m13
+    paddsw                  m15, krd
+    psraw                   m15, 7
+
+    pmaddubsw               m14, tmp1, k0k1
+    mova                   tmp1, m7
+    pmaddubsw               m13, m7, k2k3
+    mova                     m7, m11
+    pmaddubsw               m11, k4k5
+    paddsw                  m14, m11
+    punpckhbw                m3, m2                         ;G H next iter
+    mova                    m11, m3
+    pmaddubsw                m3, k6k7
+    paddsw                  m13, m3
+    paddsw                  m14, m13
+    paddsw                  m14, krd
+    psraw                   m14, 7
+    packuswb                m15, m14
 %ifidn %1, v8_avg
-    pavgb         m0, m4
+    pavgb                   m15, [dstq + dstrideq]
 %endif
-    mova      [dstq], m0
-    add         dstq, dst_stride
-    dec      heightd
-    jnz        .loop
-    RET
+    mova      [dstq + dstrideq], m15
+    lea                    dstq, [dstq + dstrideq * 2]
+    sub                 heightd, 2
+    jg                    .loop
+
+    ; Do last row if output_height is odd
+    jne                   .done
+
+    movu                     m3, [srcq + sstrideq]          ;H
+    punpcklbw                m6, m2, m3                     ;G H
+    punpckhbw                m2, m3                         ;G H
+    pmaddubsw                m0, k0k1
+    pmaddubsw                m1, k0k1
+    pmaddubsw                m4, k2k3
+    pmaddubsw                m5, k2k3
+    pmaddubsw                m8, k4k5
+    pmaddubsw                m9, k4k5
+    pmaddubsw                m6, k6k7
+    pmaddubsw                m2, k6k7
+    paddsw                   m0, m8
+    paddsw                   m1, m9
+    paddsw                   m4, m6
+    paddsw                   m5, m2
+    paddsw                   m0, m4
+    paddsw                   m1, m5
+    paddsw                   m0, krd
+    paddsw                   m1, krd
+    psraw                    m0, 7
+    psraw                    m1, 7
+    packuswb                 m0, m1
+%ifidn %1, v8_avg
+    pavgb                    m0, [dstq]
+%endif
+    mova                 [dstq], m0
+
+.done:
+    REP_RET
+
+%endif ; ARCH_X86_64
+
 %endm
 
 INIT_XMM ssse3
diff --git a/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
index 3c8cfd225..538b2129d 100644
--- a/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -14,14 +14,14 @@
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
     mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
+    mov         ecx, 0x01000100
 
     movdqa      xmm3, [rdx]                 ;load filters
     psrldq      xmm3, 6
     packsswb    xmm3, xmm3
     pshuflw     xmm3, xmm3, 0b              ;k3_k4
 
-    movq        xmm2, rcx                   ;rounding
+    movd        xmm2, ecx                   ;rounding_shift
     pshufd      xmm2, xmm2, 0
 
     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
@@ -33,8 +33,7 @@
     punpcklbw   xmm0, xmm1
     pmaddubsw   xmm0, xmm3
 
-    paddsw      xmm0, xmm2                  ;rounding
-    psraw       xmm0, 7                     ;shift
+    pmulhrsw    xmm0, xmm2                  ;rounding(+64)+shift(>>7)
     packuswb    xmm0, xmm0                  ;pack to byte
 
 %if %1
@@ -51,7 +50,7 @@
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
     mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
+    mov         ecx, 0x01000100
 
     movdqa      xmm7, [rdx]                 ;load filters
     psrldq      xmm7, 6
@@ -59,7 +58,7 @@
     pshuflw     xmm7, xmm7, 0b              ;k3_k4
     punpcklwd   xmm7, xmm7
 
-    movq        xmm6, rcx                   ;rounding
+    movd        xmm6, ecx                   ;rounding_shift
     pshufd      xmm6, xmm6, 0
 
     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
@@ -71,8 +70,7 @@
     punpcklbw   xmm0, xmm1
     pmaddubsw   xmm0, xmm7
 
-    paddsw      xmm0, xmm6                  ;rounding
-    psraw       xmm0, 7                     ;shift
+    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
     packuswb    xmm0, xmm0                  ;pack back to byte
 
 %if %1
@@ -92,10 +90,8 @@
     pmaddubsw   xmm0, xmm7
     pmaddubsw   xmm2, xmm7
 
-    paddsw      xmm0, xmm6                  ;rounding
-    paddsw      xmm2, xmm6
-    psraw       xmm0, 7                     ;shift
-    psraw       xmm2, 7
+    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
+    pmulhrsw    xmm2, xmm6
     packuswb    xmm0, xmm2                  ;pack back to byte
 
 %if %1
diff --git a/vpx_ports/bitops.h b/vpx_ports/bitops.h
index 84ff3659f..19426faf0 100644
--- a/vpx_ports/bitops.h
+++ b/vpx_ports/bitops.h
@@ -16,8 +16,7 @@
 #include "vpx_ports/msvc.h"
 
 #ifdef _MSC_VER
-# include <math.h>  // the ceil() definition must precede intrin.h
-# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86))
+# if defined(_M_X64) || defined(_M_IX86)
 #  include <intrin.h>
 #  define USE_MSC_INTRINSICS
 # endif