-rw-r--r--  build/make/obj_int_extract.c             | 645
-rw-r--r--  vp8/encoder/firstpass.c                  | 188
-rw-r--r--  vp8/encoder/onyx_int.h                   |   1
-rw-r--r--  vp8/encoder/rdopt.c                      |   2
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm   |  33
-rw-r--r--  vp8/encoder/x86/variance_impl_ssse3.asm  | 348
-rw-r--r--  vp8/encoder/x86/variance_ssse3.c         | 140
-rw-r--r--  vp8/encoder/x86/variance_x86.h           |   4
-rw-r--r--  vp8/encoder/x86/x86_csystemdependent.c   |   2
-rw-r--r--  vp8/vp8cx.mk                             |   2
10 files changed, 1107 insertions(+), 258 deletions(-)
diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c
index 3c54b248f..26cf45782 100644
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -59,20 +59,47 @@ int parse_macho(uint8_t *base_buf, size_t sz)
struct mach_header header;
uint8_t *buf = base_buf;
int base_data_section = 0;
-
+ int bits = 0;
+
+ /* We can read in mach_header for 32 and 64 bit architectures
+ * because it's identical to mach_header_64 except for the last
+ * element (uint32_t reserved), which we don't use. Then, when
+ * we know which architecture we're looking at, increment buf
+ * appropriately.
+ */
memcpy(&header, buf, sizeof(struct mach_header));
- buf += sizeof(struct mach_header);
- if (header.magic != MH_MAGIC)
+ if (header.magic == MH_MAGIC)
{
- log_msg("Bad magic number for object file. 0x%x expected, 0x%x found.\n",
- header.magic, MH_MAGIC);
- goto bail;
+ if (header.cputype == CPU_TYPE_ARM
+ || header.cputype == CPU_TYPE_X86)
+ {
+ bits = 32;
+ buf += sizeof(struct mach_header);
+ }
+ else
+ {
+ log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n");
+ goto bail;
+ }
}
-
- if (header.cputype != CPU_TYPE_ARM)
+ else if (header.magic == MH_MAGIC_64)
{
- log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_ARM.\n");
+ if (header.cputype == CPU_TYPE_X86_64)
+ {
+ bits = 64;
+ buf += sizeof(struct mach_header_64);
+ }
+ else
+ {
+ log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n");
+ goto bail;
+ }
+ }
+ else
+ {
+ log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n",
+ MH_MAGIC, MH_MAGIC_64, header.magic);
goto bail;
}
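
The comment above leans on the fact that struct mach_header and struct mach_header_64 share a common prefix. For reference, a sketch of the two layouts as declared in <mach-o/loader.h> (shown for illustration only; the tool itself just includes the system header):

    /* Layouts mirrored from <mach-o/loader.h>, for illustration only. */
    #include <stdint.h>

    typedef int32_t cpu_type_t;
    typedef int32_t cpu_subtype_t;

    struct mach_header {            /* 32-bit objects, magic == MH_MAGIC */
        uint32_t      magic;
        cpu_type_t    cputype;
        cpu_subtype_t cpusubtype;
        uint32_t      filetype;
        uint32_t      ncmds;
        uint32_t      sizeofcmds;
        uint32_t      flags;
    };

    struct mach_header_64 {         /* 64-bit objects, magic == MH_MAGIC_64 */
        uint32_t      magic;
        cpu_type_t    cputype;
        cpu_subtype_t cpusubtype;
        uint32_t      filetype;
        uint32_t      ncmds;
        uint32_t      sizeofcmds;
        uint32_t      flags;
        uint32_t      reserved;     /* the only extra field; unused by this tool */
    };

This is why a single memcpy of sizeof(struct mach_header) is safe for both cases, and only the amount by which buf is advanced differs.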
@@ -85,8 +112,6 @@ int parse_macho(uint8_t *base_buf, size_t sz)
for (i = 0; i < header.ncmds; i++)
{
struct load_command lc;
- struct symtab_command sc;
- struct segment_command seg_c;
memcpy(&lc, buf, sizeof(struct load_command));
@@ -94,50 +119,99 @@ int parse_macho(uint8_t *base_buf, size_t sz)
{
uint8_t *seg_buf = buf;
struct section s;
+ struct segment_command seg_c;
- memcpy(&seg_c, buf, sizeof(struct segment_command));
-
+ memcpy(&seg_c, seg_buf, sizeof(struct segment_command));
seg_buf += sizeof(struct segment_command);
- for (j = 0; j < seg_c.nsects; j++)
+ /* Although each section is given its own offset, nlist.n_value
+ * references the offset of the first section. This isn't
+ * apparent without debug information because the offset of the
+ * data section is the same as the first section. However, with
+ * debug sections mixed in, the offset of the debug section
+ * increases but n_value still references the first section.
+ */
+ if (seg_c.nsects < 1)
{
- memcpy(&s, seg_buf + (j * sizeof(struct section)), sizeof(struct section));
+ log_msg("Not enough sections\n");
+ goto bail;
+ }
+
+ memcpy(&s, seg_buf, sizeof(struct section));
+ base_data_section = s.offset;
+ }
+ else if (lc.cmd == LC_SEGMENT_64)
+ {
+ uint8_t *seg_buf = buf;
+ struct section_64 s;
+ struct segment_command_64 seg_c;
+
+ memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64));
+ seg_buf += sizeof(struct segment_command_64);
- // Need to get this offset which is the start of the symbol table
- // before matching the strings up with symbols.
- base_data_section = s.offset;
+ /* Explanation in LC_SEGMENT */
+ if (seg_c.nsects < 1)
+ {
+ log_msg("Not enough sections\n");
+ goto bail;
}
+
+ memcpy(&s, seg_buf, sizeof(struct section_64));
+ base_data_section = s.offset;
}
else if (lc.cmd == LC_SYMTAB)
{
- uint8_t *sym_buf = base_buf;
- uint8_t *str_buf = base_buf;
-
if (base_data_section != 0)
{
+ struct symtab_command sc;
+ uint8_t *sym_buf = base_buf;
+ uint8_t *str_buf = base_buf;
+
memcpy(&sc, buf, sizeof(struct symtab_command));
if (sc.cmdsize != sizeof(struct symtab_command))
+ {
log_msg("Can't find symbol table!\n");
+ goto bail;
+ }
sym_buf += sc.symoff;
str_buf += sc.stroff;
for (j = 0; j < sc.nsyms; j++)
{
- struct nlist nl;
- int val;
+ /* Location of string is calculated each time from the
+ * start of the string buffer. On darwin the symbols
+ * are prefixed by "_", so we bump the pointer by 1.
+ * The target value is defined as an int in asm_*_offsets.c,
+ * which is 4 bytes on all targets we currently use.
+ */
+ if (bits == 32)
+ {
+ struct nlist nl;
+ int val;
- memcpy(&nl, sym_buf + (j * sizeof(struct nlist)), sizeof(struct nlist));
+ memcpy(&nl, sym_buf, sizeof(struct nlist));
+ sym_buf += sizeof(struct nlist);
- val = *((int *)(base_buf + base_data_section + nl.n_value));
+ memcpy(&val, base_buf + base_data_section + nl.n_value,
+ sizeof(val));
+ printf("%-40s EQU %5d\n",
+ str_buf + nl.n_un.n_strx + 1, val);
+ }
+ else /* if (bits == 64) */
+ {
+ struct nlist_64 nl;
+ int val;
- // Location of string is cacluated each time from the
- // start of the string buffer. On darwin the symbols
- // are prefixed by "_". On other platforms it is not
- // so it needs to be removed. That is the reason for
- // the +1.
- printf("%-40s EQU %5d\n", str_buf + nl.n_un.n_strx + 1, val);
+ memcpy(&nl, sym_buf, sizeof(struct nlist_64));
+ sym_buf += sizeof(struct nlist_64);
+
+ memcpy(&val, base_buf + base_data_section + nl.n_value,
+ sizeof(val));
+ printf("%-40s EQU %5d\n",
+ str_buf + nl.n_un.n_strx + 1, val);
+ }
}
}
}
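
Per symbol, the work in the loop above reduces to two lookups: the name comes from the string table (skipping the leading '_' that Darwin prepends), and the value is a 4-byte int read from the first section's data at n_value. A condensed 32-bit sketch (the helper function is ours, not part of the patch; Darwin-only because of the nlist header):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <mach-o/nlist.h>

    /* Hypothetical helper condensing the 32-bit branch above. */
    static void print_symbol_32(const uint8_t *base_buf, int base_data_section,
                                const struct nlist *nl, const char *str_buf)
    {
        int val;

        /* n_value is an offset into the first section's data. */
        memcpy(&val, base_buf + base_data_section + nl->n_value, sizeof(val));

        /* n_strx indexes the string table; +1 skips the leading '_'. */
        printf("%-40s EQU %5d\n", str_buf + nl->n_un.n_strx + 1, val);
    }

The 64-bit branch is identical apart from using struct nlist_64.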
@@ -218,7 +292,7 @@ bail:
return EXIT_FAILURE;
}
-#else
+#elif defined(__ELF__)
#include "elf.h"
#define COPY_STRUCT(dst, buf, ofst, sz) do {\
@@ -237,212 +311,420 @@ bail:
typedef struct
{
- uint8_t *buf; /* Buffer containing ELF data */
- size_t sz; /* Buffer size */
- int le_data; /* Data is little-endian */
- Elf32_Ehdr hdr;
+ uint8_t *buf; /* Buffer containing ELF data */
+ size_t sz; /* Buffer size */
+ int le_data; /* Data is little-endian */
+ unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
+ int bits; /* 32 or 64 */
+ Elf32_Ehdr hdr32;
+ Elf64_Ehdr hdr64;
} elf_obj_t;
-int parse_elf32_header(elf_obj_t *elf)
+int parse_elf_header(elf_obj_t *elf)
{
int res;
- /* Verify ELF32 header */
- COPY_STRUCT(&elf->hdr, elf->buf, 0, elf->sz);
- res = elf->hdr.e_ident[EI_MAG0] == ELFMAG0;
- res &= elf->hdr.e_ident[EI_MAG1] == ELFMAG1;
- res &= elf->hdr.e_ident[EI_MAG2] == ELFMAG2;
- res &= elf->hdr.e_ident[EI_MAG3] == ELFMAG3;
- res &= elf->hdr.e_ident[EI_CLASS] == ELFCLASS32;
- res &= elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB
- || elf->hdr.e_ident[EI_DATA] == ELFDATA2MSB;
+ /* Verify ELF Magic numbers */
+ COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz);
+ res = elf->e_ident[EI_MAG0] == ELFMAG0;
+ res &= elf->e_ident[EI_MAG1] == ELFMAG1;
+ res &= elf->e_ident[EI_MAG2] == ELFMAG2;
+ res &= elf->e_ident[EI_MAG3] == ELFMAG3;
+ res &= elf->e_ident[EI_CLASS] == ELFCLASS32
+ || elf->e_ident[EI_CLASS] == ELFCLASS64;
+ res &= elf->e_ident[EI_DATA] == ELFDATA2LSB;
if (!res) goto bail;
- elf->le_data = elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB;
-
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_type);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_machine);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_version);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_entry);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_flags);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_ehsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shstrndx);
+ elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB;
+
+ /* Read in relevant values */
+ if (elf->e_ident[EI_CLASS] == ELFCLASS32)
+ {
+ elf->bits = 32;
+ COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz);
+
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx);
+ }
+ else /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */
+ {
+ elf->bits = 64;
+ COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz);
+
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx);
+ }
+
return 0;
bail:
+ log_msg("Failed to parse ELF file header");
return 1;
}
-int parse_elf32_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr)
+int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64)
{
- if (idx >= elf->hdr.e_shnum)
- goto bail;
+ if (hdr32)
+ {
+ if (idx >= elf->hdr32.e_shnum)
+ goto bail;
+
+ COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize,
+ elf->sz);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize);
+ }
+ else /* if (hdr64) */
+ {
+ if (idx >= elf->hdr64.e_shnum)
+ goto bail;
+
+ COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize,
+ elf->sz);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize);
+ }
- COPY_STRUCT(hdr, elf->buf, elf->hdr.e_shoff + idx * elf->hdr.e_shentsize,
- elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_name);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_type);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_flags);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addr);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_offset);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_size);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_link);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_info);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addralign);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_entsize);
return 0;
bail:
return 1;
}
-char *parse_elf32_string_table(elf_obj_t *elf, int s_idx, int idx)
+char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx)
{
- Elf32_Shdr shdr;
-
- if (parse_elf32_section(elf, s_idx, &shdr))
+ if (elf->bits == 32)
{
- log_msg("Failed to parse ELF string table: section %d, index %d\n",
- s_idx, idx);
- return "";
+ Elf32_Shdr shdr;
+
+ if (parse_elf_section(elf, s_idx, &shdr, NULL))
+ {
+ log_msg("Failed to parse ELF string table: section %d, index %d\n",
+ s_idx, idx);
+ return "";
+ }
+
+ return (char *)(elf->buf + shdr.sh_offset + idx);
}
+ else /* if (elf->bits == 64) */
+ {
+ Elf64_Shdr shdr;
+
+ if (parse_elf_section(elf, s_idx, NULL, &shdr))
+ {
+ log_msg("Failed to parse ELF string table: section %d, index %d\n",
+ s_idx, idx);
+ return "";
+ }
- return (char *)(elf->buf + shdr.sh_offset + idx);
+ return (char *)(elf->buf + shdr.sh_offset + idx);
+ }
}
-int parse_elf32_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym)
+int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64)
{
- COPY_STRUCT(sym, elf->buf, ofst, elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_name);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_value);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_size);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_info);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_other);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_shndx);
+ if (sym32)
+ {
+ COPY_STRUCT(sym32, elf->buf, ofst, elf->sz);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_name);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_value);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_size);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_info);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_other);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx);
+ }
+ else /* if (sym64) */
+ {
+ COPY_STRUCT(sym64, elf->buf, ofst, elf->sz);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_name);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_value);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_size);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_info);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_other);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx);
+ }
return 0;
bail:
return 1;
}
-int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
+int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode)
{
- elf_obj_t elf;
- Elf32_Shdr shdr;
+ elf_obj_t elf;
unsigned int ofst;
- int i;
- Elf32_Off strtab_off; /* save String Table offset for later use */
+ int i;
+ Elf32_Off strtab_off32;
+ Elf64_Off strtab_off64; /* save String Table offset for later use */
memset(&elf, 0, sizeof(elf));
elf.buf = buf;
elf.sz = sz;
/* Parse Header */
- if (parse_elf32_header(&elf))
- {
- log_msg("Parse error: File does not appear to be valid ELF32\n");
- return 1;
- }
+ if (parse_elf_header(&elf))
+ goto bail;
- for (i = 0; i < elf.hdr.e_shnum; i++)
+ if (elf.bits == 32)
{
- parse_elf32_section(&elf, i, &shdr);
-
- if (shdr.sh_type == SHT_STRTAB)
+ Elf32_Shdr shdr;
+ for (i = 0; i < elf.hdr32.e_shnum; i++)
{
- char strtsb_name[128];
-
- strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+ parse_elf_section(&elf, i, &shdr, NULL);
- if (!(strcmp(strtsb_name, ".shstrtab")))
+ if (shdr.sh_type == SHT_STRTAB)
{
- log_msg("found section: %s\n", strtsb_name);
- strtab_off = shdr.sh_offset;
- break;
+ char strtsb_name[128];
+
+ strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+
+ if (!(strcmp(strtsb_name, ".shstrtab")))
+ {
+ /* log_msg("found section: %s\n", strtsb_name); */
+ strtab_off32 = shdr.sh_offset;
+ break;
+ }
}
}
}
-
- /* Parse all Symbol Tables */
- for (i = 0; i < elf.hdr.e_shnum; i++)
+ else /* if (elf.bits == 64) */
{
-
- parse_elf32_section(&elf, i, &shdr);
-
- if (shdr.sh_type == SHT_SYMTAB)
+ Elf64_Shdr shdr;
+ for (i = 0; i < elf.hdr64.e_shnum; i++)
{
- for (ofst = shdr.sh_offset;
- ofst < shdr.sh_offset + shdr.sh_size;
- ofst += shdr.sh_entsize)
- {
- Elf32_Sym sym;
+ parse_elf_section(&elf, i, NULL, &shdr);
- parse_elf32_symbol(&elf, ofst, &sym);
+ if (shdr.sh_type == SHT_STRTAB)
+ {
+ char strtsb_name[128];
- /* For all OBJECTS (data objects), extract the value from the
- * proper data segment.
- */
- if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
- log_msg("found data object %s\n",
- parse_elf32_string_table(&elf,
- shdr.sh_link,
- sym.st_name));
+ strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
- if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
- && sym.st_size == 4)
+ if (!(strcmp(strtsb_name, ".shstrtab")))
{
- Elf32_Shdr dhdr;
- int32_t val;
- char section_name[128];
-
- parse_elf32_section(&elf, sym.st_shndx, &dhdr);
+ /* log_msg("found section: %s\n", strtsb_name); */
+ strtab_off64 = shdr.sh_offset;
+ break;
+ }
+ }
+ }
+ }
- /* For explanition - refer to _MSC_VER version of code */
- strcpy(section_name, (char *)(elf.buf + strtab_off + dhdr.sh_name));
- log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type);
+ /* Parse all Symbol Tables */
+ if (elf.bits == 32)
+ {
+ Elf32_Shdr shdr;
+ for (i = 0; i < elf.hdr32.e_shnum; i++)
+ {
+ parse_elf_section(&elf, i, &shdr, NULL);
- if (!(strcmp(section_name, ".bss")))
+ if (shdr.sh_type == SHT_SYMTAB)
+ {
+ for (ofst = shdr.sh_offset;
+ ofst < shdr.sh_offset + shdr.sh_size;
+ ofst += shdr.sh_entsize)
+ {
+ Elf32_Sym sym;
+
+ parse_elf_symbol(&elf, ofst, &sym, NULL);
+
+ /* For all OBJECTS (data objects), extract the value from the
+ * proper data segment.
+ */
+ /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+ log_msg("found data object %s\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name));
+ */
+
+ if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
+ && sym.st_size == 4)
{
- val = 0;
+ Elf32_Shdr dhdr;
+ int val = 0;
+ char section_name[128];
+
+ parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL);
+
+ /* For explanation - refer to _MSC_VER version of code */
+ strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name));
+ /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
+
+ if (strcmp(section_name, ".bss"))
+ {
+ if (sizeof(val) != sym.st_size)
+ {
+ /* The target value is declared as an int in
+ * asm_*_offsets.c, which is 4 bytes on all
+ * targets we currently use. Complain loudly if
+ * this is not true.
+ */
+ log_msg("Symbol size is wrong\n");
+ goto bail;
+ }
+
+ memcpy(&val,
+ elf.buf + dhdr.sh_offset + sym.st_value,
+ sym.st_size);
+ }
+
+ if (!elf.le_data)
+ {
+ log_msg("Big Endian data not supported yet!\n");
+ goto bail;
+ }
+
+ switch (mode)
+ {
+ case OUTPUT_FMT_RVDS:
+ printf("%-40s EQU %5d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ break;
+ case OUTPUT_FMT_GAS:
+ printf(".equ %-40s, %5d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ break;
+ default:
+ printf("%s = %d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ }
}
- else
- {
- memcpy(&val,
- elf.buf + dhdr.sh_offset + sym.st_value,
- sizeof(val));
- }
-
- if (!elf.le_data)
- {
- log_msg("Big Endian data not supported yet!\n");
- goto bail;
- }\
+ }
+ }
+ }
+ }
+ else /* if (elf.bits == 64) */
+ {
+ Elf64_Shdr shdr;
+ for (i = 0; i < elf.hdr64.e_shnum; i++)
+ {
+ parse_elf_section(&elf, i, NULL, &shdr);
- switch (mode)
+ if (shdr.sh_type == SHT_SYMTAB)
+ {
+ for (ofst = shdr.sh_offset;
+ ofst < shdr.sh_offset + shdr.sh_size;
+ ofst += shdr.sh_entsize)
+ {
+ Elf64_Sym sym;
+
+ parse_elf_symbol(&elf, ofst, NULL, &sym);
+
+ /* For all OBJECTS (data objects), extract the value from the
+ * proper data segment.
+ */
+ /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+ log_msg("found data object %s\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name));
+ */
+
+ if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT
+ && sym.st_size == 4)
{
- case OUTPUT_FMT_RVDS:
- printf("%-40s EQU %5d\n",
- parse_elf32_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- case OUTPUT_FMT_GAS:
- printf(".equ %-40s, %5d\n",
- parse_elf32_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- default:
- printf("%s = %d\n",
- parse_elf32_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
+ Elf64_Shdr dhdr;
+ int val = 0;
+ char section_name[128];
+
+ parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr);
+
+ /* For explanation - refer to _MSC_VER version of code */
+ strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name));
+ /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
+
+ if ((strcmp(section_name, ".bss")))
+ {
+ if (sizeof(val) != sym.st_size)
+ {
+ /* The target value is declared as an int in
+ * asm_*_offsets.c, which is 4 bytes on all
+ * targets we currently use. Complain loudly if
+ * this is not true.
+ */
+ log_msg("Symbol size is wrong\n");
+ goto bail;
+ }
+
+ memcpy(&val,
+ elf.buf + dhdr.sh_offset + sym.st_value,
+ sym.st_size);
+ }
+
+ if (!elf.le_data)
+ {
+ log_msg("Big Endian data not supported yet!\n");
+ goto bail;
+ }
+
+ switch (mode)
+ {
+ case OUTPUT_FMT_RVDS:
+ printf("%-40s EQU %5d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ break;
+ case OUTPUT_FMT_GAS:
+ printf(".equ %-40s, %5d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ break;
+ default:
+ printf("%s = %d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ }
}
}
}
@@ -454,7 +736,7 @@ int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
return 0;
bail:
- log_msg("Parse error: File does not appear to be valid ELF32\n");
+ log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n");
return 1;
}
@@ -521,8 +803,7 @@ int main(int argc, char **argv)
goto bail;
}
- res = parse_elf32(file_buf, stat_buf.st_size, mode);
- //res = parse_coff(file_buf, stat_buf.st_size);
+ res = parse_elf(file_buf, stat_buf.st_size, mode);
free(file_buf);
if (!res)
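
For context on what the parser feeds: the tool compiles asm_*_offsets.c (which defines plain int constants), digs the initialized values back out of the resulting object file, and prints them as assembler equates using the format strings seen above. A hypothetical round trip (names and values invented for illustration):

    /* hypothetical asm_example_offsets.c -- not part of the patch */
    #include <stddef.h>

    struct example { char pad[12]; int field; };
    int example_field_offset = offsetof(struct example, field);   /* 12 */

Run over the compiled object, the RVDS output mode prints a line of the form "example_field_offset  EQU  12" and the GAS mode prints ".equ example_field_offset, 12", making the offsets visible to the hand-written assembly.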
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index d489413f6..804b80bd5 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -312,7 +312,9 @@ void vp8_output_stats(const VP8_COMP *cpi,
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
- fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n",
+ fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
+ " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f"
+ " %12.4f\n",
stats->frame,
stats->intra_error,
stats->coded_error,
@@ -320,6 +322,7 @@ void vp8_output_stats(const VP8_COMP *cpi,
stats->pcnt_inter,
stats->pcnt_motion,
stats->pcnt_second_ref,
+ stats->pcnt_neutral,
stats->MVr,
stats->mvr_abs,
stats->MVc,
@@ -327,7 +330,8 @@ void vp8_output_stats(const VP8_COMP *cpi,
stats->MVrv,
stats->MVcv,
stats->mv_in_out_count,
- stats->count);
+ stats->count,
+ stats->duration);
fclose(fpfile);
@@ -359,6 +363,7 @@ void vp8_zero_stats(FIRSTPASS_STATS *section)
section->pcnt_inter = 0.0;
section->pcnt_motion = 0.0;
section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
section->MVr = 0.0;
section->mvr_abs = 0.0;
section->MVc = 0.0;
@@ -378,6 +383,7 @@ void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
section->pcnt_inter += frame->pcnt_inter;
section->pcnt_motion += frame->pcnt_motion;
section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
section->MVr += frame->MVr;
section->mvr_abs += frame->mvr_abs;
section->MVc += frame->MVc;
@@ -398,6 +404,7 @@ void vp8_avg_stats(FIRSTPASS_STATS *section)
section->ssim_weighted_pred_err /= section->count;
section->pcnt_inter /= section->count;
section->pcnt_second_ref /= section->count;
+ section->pcnt_neutral /= section->count;
section->pcnt_motion /= section->count;
section->MVr /= section->count;
section->mvr_abs /= section->count;
@@ -570,6 +577,7 @@ void vp8_first_pass(VP8_COMP *cpi)
int intercount = 0;
int second_ref_count = 0;
int intrapenalty = 256;
+ int neutral_count = 0;
int sum_in_vectors = 0;
@@ -726,6 +734,17 @@ void vp8_first_pass(VP8_COMP *cpi)
if (motion_error <= this_error)
{
+ // Keep a count of cases where the inter and intra were
+ // very close and very low. This helps with scene cut
+ // detection for example in cropped clips with black bars
+ // at the sides or top and bottom.
+ if( (((this_error-intrapenalty) * 9) <=
+ (motion_error*10)) &&
+ (this_error < (2*intrapenalty)) )
+ {
+ neutral_count++;
+ }
+
d->bmi.mv.as_mv.row <<= 3;
d->bmi.mv.as_mv.col <<= 3;
this_error = motion_error;
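
A minimal sketch of the "neutral" test added above, pulled out as a standalone predicate (the function name is ours; the arithmetic is verbatim from the patch):

    /* A macroblock is "neutral" when the intra and inter errors are both
     * low and within roughly 10% of each other. */
    static int is_neutral_block(int this_error, int motion_error, int intrapenalty)
    {
        return ((this_error - intrapenalty) * 9 <= motion_error * 10) &&
               (this_error < 2 * intrapenalty);
    }

    /* With intrapenalty == 256: this_error = 300, motion_error = 280 gives
     * 44 * 9 = 396 <= 2800 and 300 < 512, so the block counts as neutral. */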
@@ -854,6 +873,7 @@ void vp8_first_pass(VP8_COMP *cpi)
fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs;
fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+ fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
if (mvcount > 0)
{
@@ -1341,7 +1361,7 @@ void vp8_end_second_pass(VP8_COMP *cpi)
// This function gives and estimate of how badly we believe
// the predicition quality is decaying from frame to frame.
-double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
{
double prediction_decay_rate;
double motion_decay;
@@ -1376,6 +1396,52 @@ double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
return prediction_decay_rate;
}
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+BOOL detect_transition_to_still(
+ VP8_COMP *cpi,
+ int frame_interval,
+ int still_interval,
+ double loop_decay_rate,
+ double decay_accumulator )
+{
+ BOOL trans_to_still = FALSE;
+
+ // Break clause to detect very still sections after motion
+ // For example a static image after a fade or other transition
+ // instead of a clean scene cut.
+ if ( (frame_interval > MIN_GF_INTERVAL) &&
+ (loop_decay_rate >= 0.999) &&
+ (decay_accumulator < 0.9) )
+ {
+ int j;
+ FIRSTPASS_STATS * position = cpi->stats_in;
+ FIRSTPASS_STATS tmp_next_frame;
+ double decay_rate;
+
+ // Look ahead a few frames to see if static condition
+ // persists...
+ for ( j = 0; j < still_interval; j++ )
+ {
+ if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
+ break;
+
+ decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
+ if ( decay_rate < 0.999 )
+ break;
+ }
+ // Reset file position
+ reset_fpf_position(cpi, position);
+
+ // Only if it does do we signal a transition to still
+ if ( j == still_interval )
+ trans_to_still = TRUE;
+ }
+
+ return trans_to_still;
+}
+
// Analyse and define a gf/arf group .
static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
@@ -1528,7 +1594,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (r > GF_RMAX)
r = GF_RMAX;
- loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
// Cumulative effect of decay
decay_accumulator = decay_accumulator * loop_decay_rate;
@@ -1537,48 +1603,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
boost_score += (decay_accumulator * r);
// Break clause to detect very still sections after motion
- // For example a staic image after a fade or other transition
- // instead of a clean key frame.
- if ( (i > MIN_GF_INTERVAL) &&
- (loop_decay_rate >= 0.999) &&
- (decay_accumulator < 0.9) )
+ // For example a static image after a fade or other transition.
+ if ( detect_transition_to_still( cpi, i, 5,
+ loop_decay_rate, decay_accumulator ) )
{
- int j;
- FIRSTPASS_STATS * position = cpi->stats_in;
- FIRSTPASS_STATS tmp_next_frame;
- double decay_rate;
-
- // Look ahead a few frames to see if static condition
- // persists...
- for ( j = 0; j < 4; j++ )
- {
- if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
- break;
-
- decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
- if ( decay_rate < 0.999 )
- break;
- }
- reset_fpf_position(cpi, position); // Reset file position
-
- // Force GF not alt ref
- if ( j == 4 )
- {
- if (0)
- {
- FILE *f = fopen("fadegf.stt", "a");
- fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
- cpi->common.current_video_frame+i, i,
- loop_decay_rate, decay_accumulator,
- boost_score );
- fclose(f);
- }
-
- allow_alt_ref = FALSE;
-
- boost_score = old_boost_score;
- break;
- }
+ allow_alt_ref = FALSE;
+ boost_score = old_boost_score;
+ break;
}
// Break out conditions.
@@ -2285,7 +2316,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
(next_frame->pcnt_second_ref < 0.10) &&
((this_frame->pcnt_inter < 0.05) ||
(
- (this_frame->pcnt_inter < .25) &&
+ ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) &&
((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
(fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
@@ -2332,7 +2363,9 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
// Test various breakout clauses
if ((local_next_frame.pcnt_inter < 0.05) ||
(next_iiratio < 1.5) ||
- ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) ||
+ (((local_next_frame.pcnt_inter -
+ local_next_frame.pcnt_neutral) < 0.20) &&
+ (next_iiratio < 3.0)) ||
((boost_score - old_boost_score) < 0.5) ||
(local_next_frame.intra_error < 200)
)
@@ -2363,13 +2396,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
}
void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
- int i;
+ int i,j;
FIRSTPASS_STATS last_frame;
FIRSTPASS_STATS first_frame;
FIRSTPASS_STATS next_frame;
FIRSTPASS_STATS *start_position;
- double decay_accumulator = 0;
+ double decay_accumulator = 1.0;
double boost_score = 0;
double old_boost_score = 0.0;
double loop_decay_rate;
@@ -2379,6 +2412,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double kf_group_intra_err = 0.0;
double kf_group_coded_err = 0.0;
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+ double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
@@ -2407,6 +2441,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
kf_mod_err = calculate_modified_err(cpi, this_frame);
// find the next keyframe
+ i = 0;
while (cpi->stats_in < cpi->stats_in_end)
{
// Accumulate kf group error
@@ -2425,9 +2460,34 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (cpi->oxcf.auto_key
&& lookup_next_frame_stats(cpi, &next_frame) != EOF)
{
+ // Normal scene cut check
if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
break;
+ // How fast is prediction quality decaying
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ // We want to know something about the recent past... rather than
+ // as used elsewhere where we are concerned with decay in prediction
+ // quality since the last GF or KF.
+ recent_loop_decay[i%8] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < 8; j++)
+ {
+ decay_accumulator = decay_accumulator * recent_loop_decay[j];
+ }
+
+ // Special check for a transition or high-motion section followed
+ // by a static scene.
+ if ( detect_transition_to_still( cpi, i,
+ (cpi->key_frame_frequency-i),
+ loop_decay_rate,
+ decay_accumulator ) )
+ {
+ break;
+ }
+
+
// Step on to the next frame
cpi->frames_to_key ++;
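
The eight-entry window above keeps a rolling product of recent per-frame decay rates, as opposed to the decay accumulated since the last key frame. An equivalent standalone sketch (names ours, not code from the tree):

    /* Overwrite the oldest of the last 8 decay rates and return their product. */
    static double recent_decay_product(double recent[8], int frame_idx,
                                       double loop_decay_rate)
    {
        double acc = 1.0;
        int j;

        recent[frame_idx % 8] = loop_decay_rate;
        for (j = 0; j < 8; j++)
            acc *= recent[j];
        return acc;   /* stays near 1.0 only if all 8 recent frames predicted well */
    }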
@@ -2437,6 +2497,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
break;
} else
cpi->frames_to_key ++;
+
+ i++;
}
// If there is a max kf interval set by the user we must obey it.
@@ -2588,32 +2650,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (r > RMAX)
r = RMAX;
- // Adjust loop decay rate
- //if ( next_frame.pcnt_inter < loop_decay_rate )
- loop_decay_rate = next_frame.pcnt_inter;
-
- // High % motion -> somewhat higher decay rate
- motion_pct = next_frame.pcnt_motion;
- motion_decay = (1.0 - (motion_pct / 20.0));
- if (motion_decay < loop_decay_rate)
- loop_decay_rate = motion_decay;
-
- // Adjustment to decay rate based on speed of motion
- {
- double this_mv_rabs;
- double this_mv_cabs;
- double distance_factor;
-
- this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
- this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
-
- distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
- (this_mv_cabs * this_mv_cabs)) / 250.0;
- distance_factor = ((distance_factor > 1.0)
- ? 0.0 : (1.0 - distance_factor));
- if (distance_factor < loop_decay_rate)
- loop_decay_rate = distance_factor;
- }
+ // How fast is prediction quality decaying
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
decay_accumulator = decay_accumulator * loop_decay_rate;
decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 5a4b3c185..0d353c31f 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -99,6 +99,7 @@ typedef struct
double pcnt_inter;
double pcnt_motion;
double pcnt_second_ref;
+ double pcnt_neutral;
double MVr;
double mvr_abs;
double MVc;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 3db05155c..867ff6a9c 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2033,7 +2033,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
else
cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
}
- else if (vp8_ref_frame_order[mode_index] == SPLITMV)
+ else if (vp8_mode_order[mode_index] == SPLITMV)
cpi->zbin_mode_boost = 0;
else
cpi->zbin_mode_boost = MV_ZBIN_BOOST;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 6cdc47bc9..5d1a17d44 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -627,6 +627,10 @@ filter_block2d_bil_var_sse2_loop:
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
+ je filter_block2d_bil_var_sse2_full_pixel
+
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
@@ -671,6 +675,35 @@ filter_block2d_bil_sp_only_loop:
jmp filter_block2d_bil_variance
+filter_block2d_bil_var_sse2_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0 ;
+
+filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movq xmm2, QWORD PTR [rdi] ;
+ punpcklbw xmm2, xmm0 ;
+
+ psubw xmm1, xmm2 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_full_pixel_loop ;
+
+ jmp filter_block2d_bil_variance
+
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
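
In scalar terms, the new full-pixel path accumulates, per 8-pixel row, the sum and the sum of squares of (ref - src); xmm6 holds the running sum and xmm7 the running sum of squares. An illustrative C equivalent (not code from the tree):

    static void full_pixel_var_8wide(const unsigned char *ref, int ref_stride,
                                     const unsigned char *src, int src_stride,
                                     unsigned int height, int *sum,
                                     unsigned int *sse)
    {
        unsigned int i, j;
        int s = 0;
        unsigned int ss = 0;

        for (i = 0; i < height; i++)
        {
            for (j = 0; j < 8; j++)
            {
                int diff = ref[j] - src[j];
                s  += diff;            /* paddw xmm6 */
                ss += diff * diff;     /* pmaddwd / paddd xmm7 */
            }
            ref += ref_stride;
            src += src_stride;
        }
        *sum = s;
        *sse = ss;
    }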
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
new file mode 100644
index 000000000..b1976328d
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -0,0 +1,348 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+
+;void vp8_filter_block2d_bil_var_ssse3
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+; unsigned int *sumsquared;;
+;
+;)
+;Note: The filter coefficient at offset=0 is 128. Since the second register
+;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
+global sym(vp8_filter_block2d_bil_var_ssse3)
+sym(vp8_filter_block2d_bil_var_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je filter_block2d_bil_var_ssse3_sp_only
+
+ shl rax, 4 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je filter_block2d_bil_var_ssse3_fp_only
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi+1]
+ movdqa xmm2, xmm0
+
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, [rax]
+ pmaddubsw xmm2, [rax]
+
+ paddw xmm0, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm0, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ packuswb xmm0, xmm2
+
+ movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
+ lea rsi, [rsi + rbx]
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_ssse3_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+ packuswb xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ movdqa xmm0, xmm1
+ movdqa xmm3, xmm2
+
+ punpcklbw xmm2, xmm1
+ punpckhbw xmm3, xmm1
+ pmaddubsw xmm2, [rdx]
+ pmaddubsw xmm3, [rdx]
+
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm2, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm1, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm1, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm2, xmm1
+ psubw xmm3, xmm5
+ paddw xmm6, xmm2
+ paddw xmm6, xmm3
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm2
+ paddd xmm7, xmm3
+
+ lea rsi, [rsi + rbx] ;ref_pixels_per_line
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz filter_block2d_bil_var_ssse3_loop
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; Both xoffset =0 and yoffset=0
+ je filter_block2d_bil_var_ssse3_full_pixel
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqa xmm0, xmm1
+
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+ movdqu xmm3, XMMWORD PTR [rsi]
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm3
+
+ punpcklbw xmm1, xmm3
+ punpckhbw xmm2, xmm3
+ pmaddubsw xmm1, [rdx]
+ pmaddubsw xmm2, [rdx]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ movq xmm3, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm3, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ movdqa xmm1, xmm0
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1
+ jnz filter_block2d_bil_sp_only_loop
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi]
+ punpcklbw xmm1, xmm0
+ movq xmm2, QWORD PTR [rsi+8]
+ punpcklbw xmm2, xmm0
+
+ movq xmm3, QWORD PTR [rdi]
+ punpcklbw xmm3, xmm0
+ movq xmm4, QWORD PTR [rdi+8]
+ punpcklbw xmm4, xmm0
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm4
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+ sub rcx, 1
+ jnz filter_block2d_bil_full_pixel_loop
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm2, XMMWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm2, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm3
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm1
+ paddd xmm7, xmm3
+
+ lea rsi, [rsi + rdx]
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1
+ jnz filter_block2d_bil_fp_only_loop
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(7) ;[Sum]
+ mov rdi, arg(8) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
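
Each tap computed by the routine above is a two-tap bilinear filter whose coefficients come from vp8_bilinear_filters_ssse3, with xmm_bi_rd (64) as the rounding term and a shift of xmm_filter_shift (7). An illustrative scalar form (not code from the tree) that also shows why offset 0 needs its own path:

    /* Two-tap bilinear filter matching the table above: (128 - 16*offset,
     * 16*offset) for offset 0..7.  At offset 0 the coefficient would be 128,
     * which does not fit in pmaddubsw's signed-byte operand, hence the
     * separate sp_only / fp_only / full-pixel branches in the assembly. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b, int offset)
    {
        int f1 = 16 * offset;
        int f0 = 128 - f1;
        return (unsigned char)((a * f0 + b * f1 + 64) >> 7);
    }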
diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c
new file mode 100644
index 000000000..750ae8b86
--- /dev/null
+++ b/vp8/encoder/x86/variance_ssse3.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern unsigned int vp8_get16x16var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_horiz_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_ssse3
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int xoffset,
+ int yoffset,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+unsigned int vp8_sub_pixel_variance16x16_ssse3
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ // note we could avoid these if statements if the calling function
+ // just called the appropriate functions inside.
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
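
The return statement above is the usual single-pass variance identity for a 16x16 block: Var = SSE - Sum*Sum / N with N = 256, which is where the >> 8 comes from. As a standalone sketch:

    /* Combine the accumulated Sum and SSE into the 16x16 block variance. */
    static unsigned int variance_16x16(unsigned int sse, int sum)
    {
        return sse - (unsigned int)((sum * sum) >> 8);
    }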
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 6bea15ebc..1e2fb3490 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -286,6 +286,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
#if HAVE_SSSE3
extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_sad16x16x3
@@ -294,6 +295,9 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
#undef vp8_variance_sad16x8x3
#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3
+
#endif
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 61c603229..c7639a7e4 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -334,6 +334,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
+
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
}
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index ed2feddae..c0ae250f5 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -110,6 +110,8 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm