-rw-r--r--  build/make/obj_int_extract.c             | 645
-rw-r--r--  vp8/encoder/firstpass.c                  | 188
-rw-r--r--  vp8/encoder/onyx_int.h                   |   1
-rw-r--r--  vp8/encoder/rdopt.c                      |   2
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm   |  33
-rw-r--r--  vp8/encoder/x86/variance_impl_ssse3.asm  | 348
-rw-r--r--  vp8/encoder/x86/variance_ssse3.c         | 140
-rw-r--r--  vp8/encoder/x86/variance_x86.h           |   4
-rw-r--r--  vp8/encoder/x86/x86_csystemdependent.c   |   2
-rw-r--r--  vp8/vp8cx.mk                             |   2
10 files changed, 1107 insertions(+), 258 deletions(-)
diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c
index 3c54b248f..26cf45782 100644
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -59,20 +59,47 @@ int parse_macho(uint8_t *base_buf, size_t sz)
struct mach_header header;
uint8_t *buf = base_buf;
int base_data_section = 0;
-
+ int bits = 0;
+
+ /* We can read in mach_header for 32 and 64 bit architectures
+ * because it's identical to mach_header_64 except for the last
+ * element (uint32_t reserved), which we don't use. Then, when
+ * we know which architecture we're looking at, increment buf
+ * appropriately.
+ */
memcpy(&header, buf, sizeof(struct mach_header));
- buf += sizeof(struct mach_header);
- if (header.magic != MH_MAGIC)
+ if (header.magic == MH_MAGIC)
{
- log_msg("Bad magic number for object file. 0x%x expected, 0x%x found.\n",
- header.magic, MH_MAGIC);
- goto bail;
+ if (header.cputype == CPU_TYPE_ARM
+ || header.cputype == CPU_TYPE_X86)
+ {
+ bits = 32;
+ buf += sizeof(struct mach_header);
+ }
+ else
+ {
+ log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n");
+ goto bail;
+ }
}
-
- if (header.cputype != CPU_TYPE_ARM)
+ else if (header.magic == MH_MAGIC_64)
{
- log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_ARM.\n");
+ if (header.cputype == CPU_TYPE_X86_64)
+ {
+ bits = 64;
+ buf += sizeof(struct mach_header_64);
+ }
+ else
+ {
+ log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n");
+ goto bail;
+ }
+ }
+ else
+ {
+ log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n",
+ MH_MAGIC, MH_MAGIC_64, header.magic);
goto bail;
}
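
The comment above leans on the fact that struct mach_header and struct mach_header_64 share a common prefix. For reference, a sketch of the two layouts as declared in <mach-o/loader.h> (shown for illustration only; the tool itself just includes the system header):

    /* Layouts mirrored from <mach-o/loader.h>, for illustration only. */
    #include <stdint.h>

    typedef int32_t cpu_type_t;
    typedef int32_t cpu_subtype_t;

    struct mach_header {            /* 32-bit objects, magic == MH_MAGIC */
        uint32_t      magic;
        cpu_type_t    cputype;
        cpu_subtype_t cpusubtype;
        uint32_t      filetype;
        uint32_t      ncmds;
        uint32_t      sizeofcmds;
        uint32_t      flags;
    };

    struct mach_header_64 {         /* 64-bit objects, magic == MH_MAGIC_64 */
        uint32_t      magic;
        cpu_type_t    cputype;
        cpu_subtype_t cpusubtype;
        uint32_t      filetype;
        uint32_t      ncmds;
        uint32_t      sizeofcmds;
        uint32_t      flags;
        uint32_t      reserved;     /* the only extra field; unused by this tool */
    };

This is why a single memcpy of sizeof(struct mach_header) is safe for both cases, and only the amount by which buf is advanced differs.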
@@ -85,8 +112,6 @@ int parse_macho(uint8_t *base_buf, size_t sz)
for (i = 0; i < header.ncmds; i++)
{
struct load_command lc;
- struct symtab_command sc;
- struct segment_command seg_c;
memcpy(&lc, buf, sizeof(struct load_command));
@@ -94,50 +119,99 @@ int parse_macho(uint8_t *base_buf, size_t sz)
{
uint8_t *seg_buf = buf;
struct section s;
+ struct segment_command seg_c;
- memcpy(&seg_c, buf, sizeof(struct segment_command));
-
+ memcpy(&seg_c, seg_buf, sizeof(struct segment_command));
seg_buf += sizeof(struct segment_command);
- for (j = 0; j < seg_c.nsects; j++)
+ /* Although each section is given its own offset, nlist.n_value
+ * references the offset of the first section. This isn't
+ * apparent without debug information because the offset of the
+ * data section is the same as the first section. However, with
+ * debug sections mixed in, the offset of the debug section
+ * increases but n_value still references the first section.
+ */
+ if (seg_c.nsects < 1)
{
- memcpy(&s, seg_buf + (j * sizeof(struct section)), sizeof(struct section));
+ log_msg("Not enough sections\n");
+ goto bail;
+ }
+
+ memcpy(&s, seg_buf, sizeof(struct section));
+ base_data_section = s.offset;
+ }
+ else if (lc.cmd == LC_SEGMENT_64)
+ {
+ uint8_t *seg_buf = buf;
+ struct section_64 s;
+ struct segment_command_64 seg_c;
+
+ memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64));
+ seg_buf += sizeof(struct segment_command_64);
- // Need to get this offset which is the start of the symbol table
- // before matching the strings up with symbols.
- base_data_section = s.offset;
+ /* Explanation in LC_SEGMENT */
+ if (seg_c.nsects < 1)
+ {
+ log_msg("Not enough sections\n");
+ goto bail;
}
+
+ memcpy(&s, seg_buf, sizeof(struct section_64));
+ base_data_section = s.offset;
}
else if (lc.cmd == LC_SYMTAB)
{
- uint8_t *sym_buf = base_buf;
- uint8_t *str_buf = base_buf;
-
if (base_data_section != 0)
{
+ struct symtab_command sc;
+ uint8_t *sym_buf = base_buf;
+ uint8_t *str_buf = base_buf;
+
memcpy(&sc, buf, sizeof(struct symtab_command));
if (sc.cmdsize != sizeof(struct symtab_command))
+ {
log_msg("Can't find symbol table!\n");
+ goto bail;
+ }
sym_buf += sc.symoff;
str_buf += sc.stroff;
for (j = 0; j < sc.nsyms; j++)
{
- struct nlist nl;
- int val;
+ /* Location of string is calculated each time from the
+ * start of the string buffer. On darwin the symbols
+ * are prefixed by "_", so we bump the pointer by 1.
+ * The target value is defined as an int in asm_*_offsets.c,
+ * which is 4 bytes on all targets we currently use.
+ */
+ if (bits == 32)
+ {
+ struct nlist nl;
+ int val;
- memcpy(&nl, sym_buf + (j * sizeof(struct nlist)), sizeof(struct nlist));
+ memcpy(&nl, sym_buf, sizeof(struct nlist));
+ sym_buf += sizeof(struct nlist);
- val = *((int *)(base_buf + base_data_section + nl.n_value));
+ memcpy(&val, base_buf + base_data_section + nl.n_value,
+ sizeof(val));
+ printf("%-40s EQU %5d\n",
+ str_buf + nl.n_un.n_strx + 1, val);
+ }
+ else /* if (bits == 64) */
+ {
+ struct nlist_64 nl;
+ int val;
- // Location of string is cacluated each time from the
- // start of the string buffer. On darwin the symbols
- // are prefixed by "_". On other platforms it is not
- // so it needs to be removed. That is the reason for
- // the +1.
- printf("%-40s EQU %5d\n", str_buf + nl.n_un.n_strx + 1, val);
+ memcpy(&nl, sym_buf, sizeof(struct nlist_64));
+ sym_buf += sizeof(struct nlist_64);
+
+ memcpy(&val, base_buf + base_data_section + nl.n_value,
+ sizeof(val));
+ printf("%-40s EQU %5d\n",
+ str_buf + nl.n_un.n_strx + 1, val);
+ }
}
}
}
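
Per symbol, the work in the loop above reduces to two lookups: the name comes from the string table (skipping the leading '_' that Darwin prepends), and the value is a 4-byte int read from the first section's data at n_value. A condensed 32-bit sketch (the helper function is ours, not part of the patch; Darwin-only because of the nlist header):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <mach-o/nlist.h>

    /* Hypothetical helper condensing the 32-bit branch above. */
    static void print_symbol_32(const uint8_t *base_buf, int base_data_section,
                                const struct nlist *nl, const char *str_buf)
    {
        int val;

        /* n_value is an offset into the first section's data. */
        memcpy(&val, base_buf + base_data_section + nl->n_value, sizeof(val));

        /* n_strx indexes the string table; +1 skips the leading '_'. */
        printf("%-40s EQU %5d\n", str_buf + nl->n_un.n_strx + 1, val);
    }

The 64-bit branch is identical apart from using struct nlist_64.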
@@ -218,7 +292,7 @@ bail:
return EXIT_FAILURE;
}
-#else
+#elif defined(__ELF__)
#include "elf.h"
#define COPY_STRUCT(dst, buf, ofst, sz) do {\
@@ -237,212 +311,420 @@ bail:
typedef struct
{
- uint8_t *buf; /* Buffer containing ELF data */
- size_t sz; /* Buffer size */
- int le_data; /* Data is little-endian */
- Elf32_Ehdr hdr;
+ uint8_t *buf; /* Buffer containing ELF data */
+ size_t sz; /* Buffer size */
+ int le_data; /* Data is little-endian */
+ unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
+ int bits; /* 32 or 64 */
+ Elf32_Ehdr hdr32;
+ Elf64_Ehdr hdr64;
} elf_obj_t;
-int parse_elf32_header(elf_obj_t *elf)
+int parse_elf_header(elf_obj_t *elf)
{
int res;
- /* Verify ELF32 header */
- COPY_STRUCT(&elf->hdr, elf->buf, 0, elf->sz);
- res = elf->hdr.e_ident[EI_MAG0] == ELFMAG0;
- res &= elf->hdr.e_ident[EI_MAG1] == ELFMAG1;
- res &= elf->hdr.e_ident[EI_MAG2] == ELFMAG2;
- res &= elf->hdr.e_ident[EI_MAG3] == ELFMAG3;
- res &= elf->hdr.e_ident[EI_CLASS] == ELFCLASS32;
- res &= elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB
- || elf->hdr.e_ident[EI_DATA] == ELFDATA2MSB;
+ /* Verify ELF Magic numbers */
+ COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz);
+ res = elf->e_ident[EI_MAG0] == ELFMAG0;
+ res &= elf->e_ident[EI_MAG1] == ELFMAG1;
+ res &= elf->e_ident[EI_MAG2] == ELFMAG2;
+ res &= elf->e_ident[EI_MAG3] == ELFMAG3;
+ res &= elf->e_ident[EI_CLASS] == ELFCLASS32
+ || elf->e_ident[EI_CLASS] == ELFCLASS64;
+ res &= elf->e_ident[EI_DATA] == ELFDATA2LSB;
if (!res) goto bail;
- elf->le_data = elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB;
-
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_type);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_machine);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_version);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_entry);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_flags);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_ehsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shstrndx);
+ elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB;
+
+ /* Read in relevant values */
+ if (elf->e_ident[EI_CLASS] == ELFCLASS32)
+ {
+ elf->bits = 32;
+ COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz);
+
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx);
+ }
+ else /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */
+ {
+ elf->bits = 64;
+ COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz);
+
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum);
+ ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx);
+ }
+
return 0;
bail:
+ log_msg("Failed to parse ELF file header");
return 1;
}
-int parse_elf32_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr)
+int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64)
{
- if (idx >= elf->hdr.e_shnum)
- goto bail;
+ if (hdr32)
+ {
+ if (idx >= elf->hdr32.e_shnum)
+ goto bail;
+
+ COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize,
+ elf->sz);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign);
+ ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize);
+ }
+ else /* if (hdr64) */
+ {
+ if (idx >= elf->hdr64.e_shnum)
+ goto bail;
+
+ COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize,
+ elf->sz);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign);
+ ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize);
+ }
- COPY_STRUCT(hdr, elf->buf, elf->hdr.e_shoff + idx * elf->hdr.e_shentsize,
- elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_name);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_type);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_flags);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addr);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_offset);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_size);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_link);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_info);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addralign);
- ENDIAN_ASSIGN_IN_PLACE(hdr->sh_entsize);
return 0;
bail:
return 1;
}
-char *parse_elf32_string_table(elf_obj_t *elf, int s_idx, int idx)
+char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx)
{
- Elf32_Shdr shdr;
-
- if (parse_elf32_section(elf, s_idx, &shdr))
+ if (elf->bits == 32)
{
- log_msg("Failed to parse ELF string table: section %d, index %d\n",
- s_idx, idx);
- return "";
+ Elf32_Shdr shdr;
+
+ if (parse_elf_section(elf, s_idx, &shdr, NULL))
+ {
+ log_msg("Failed to parse ELF string table: section %d, index %d\n",
+ s_idx, idx);
+ return "";
+ }
+
+ return (char *)(elf->buf + shdr.sh_offset + idx);
}
+ else /* if (elf->bits == 64) */
+ {
+ Elf64_Shdr shdr;
+
+ if (parse_elf_section(elf, s_idx, NULL, &shdr))
+ {
+ log_msg("Failed to parse ELF string table: section %d, index %d\n",
+ s_idx, idx);
+ return "";
+ }
- return (char *)(elf->buf + shdr.sh_offset + idx);
+ return (char *)(elf->buf + shdr.sh_offset + idx);
+ }
}
-int parse_elf32_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym)
+int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64)
{
- COPY_STRUCT(sym, elf->buf, ofst, elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_name);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_value);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_size);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_info);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_other);
- ENDIAN_ASSIGN_IN_PLACE(sym->st_shndx);
+ if (sym32)
+ {
+ COPY_STRUCT(sym32, elf->buf, ofst, elf->sz);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_name);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_value);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_size);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_info);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_other);
+ ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx);
+ }
+ else /* if (sym64) */
+ {
+ COPY_STRUCT(sym64, elf->buf, ofst, elf->sz);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_name);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_value);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_size);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_info);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_other);
+ ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx);
+ }
return 0;
bail:
return 1;
}
-int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
+int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode)
{
- elf_obj_t elf;
- Elf32_Shdr shdr;
+ elf_obj_t elf;
unsigned int ofst;
- int i;
- Elf32_Off strtab_off; /* save String Table offset for later use */
+ int i;
+ Elf32_Off strtab_off32;
+ Elf64_Off strtab_off64; /* save String Table offset for later use */
memset(&elf, 0, sizeof(elf));
elf.buf = buf;
elf.sz = sz;
/* Parse Header */
- if (parse_elf32_header(&elf))
- {
- log_msg("Parse error: File does not appear to be valid ELF32\n");
- return 1;
- }
+ if (parse_elf_header(&elf))
+ goto bail;
- for (i = 0; i < elf.hdr.e_shnum; i++)
+ if (elf.bits == 32)
{
- parse_elf32_section(&elf, i, &shdr);
-
- if (shdr.sh_type == SHT_STRTAB)
+ Elf32_Shdr shdr;
+ for (i = 0; i < elf.hdr32.e_shnum; i++)
{
- char strtsb_name[128];
-
- strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+ parse_elf_section(&elf, i, &shdr, NULL);
- if (!(strcmp(strtsb_name, ".shstrtab")))
+ if (shdr.sh_type == SHT_STRTAB)
{
- log_msg("found section: %s\n", strtsb_name);
- strtab_off = shdr.sh_offset;
- break;
+ char strtsb_name[128];
+
+ strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+
+ if (!(strcmp(strtsb_name, ".shstrtab")))
+ {
+ /* log_msg("found section: %s\n", strtsb_name); */
+ strtab_off32 = shdr.sh_offset;
+ break;
+ }
}
}
}
-
- /* Parse all Symbol Tables */
- for (i = 0; i < elf.hdr.e_shnum; i++)
+ else /* if (elf.bits == 64) */
{
-
- parse_elf32_section(&elf, i, &shdr);
-
- if (shdr.sh_type == SHT_SYMTAB)
+ Elf64_Shdr shdr;
+ for (i = 0; i < elf.hdr64.e_shnum; i++)
{
- for (ofst = shdr.sh_offset;
- ofst < shdr.sh_offset + shdr.sh_size;
- ofst += shdr.sh_entsize)
- {
- Elf32_Sym sym;
+ parse_elf_section(&elf, i, NULL, &shdr);
- parse_elf32_symbol(&elf, ofst, &sym);
+ if (shdr.sh_type == SHT_STRTAB)
+ {
+ char strtsb_name[128];
- /* For all OBJECTS (data objects), extract the value from the
- * proper data segment.
- */
- if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
- log_msg("found data object %s\n",
- parse_elf32_string_table(&elf,
- shdr.sh_link,
- sym.st_name));
+ strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
- if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
- && sym.st_size == 4)
+ if (!(strcmp(strtsb_name, ".shstrtab")))
{
- Elf32_Shdr dhdr;
- int32_t val;
- char section_name[128];
-
- parse_elf32_section(&elf, sym.st_shndx, &dhdr);
+ /* log_msg("found section: %s\n", strtsb_name); */
+ strtab_off64 = shdr.sh_offset;
+ break;
+ }
+ }
+ }
+ }
- /* For explanition - refer to _MSC_VER version of code */
- strcpy(section_name, (char *)(elf.buf + strtab_off + dhdr.sh_name));
- log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type);
+ /* Parse all Symbol Tables */
+ if (elf.bits == 32)
+ {
+ Elf32_Shdr shdr;
+ for (i = 0; i < elf.hdr32.e_shnum; i++)
+ {
+ parse_elf_section(&elf, i, &shdr, NULL);
- if (!(strcmp(section_name, ".bss")))
+ if (shdr.sh_type == SHT_SYMTAB)
+ {
+ for (ofst = shdr.sh_offset;
+ ofst < shdr.sh_offset + shdr.sh_size;
+ ofst += shdr.sh_entsize)
+ {
+ Elf32_Sym sym;
+
+ parse_elf_symbol(&elf, ofst, &sym, NULL);
+
+ /* For all OBJECTS (data objects), extract the value from the
+ * proper data segment.
+ */
+ /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+ log_msg("found data object %s\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name));
+ */
+
+ if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
+ && sym.st_size == 4)
{
- val = 0;
+ Elf32_Shdr dhdr;
+ int val = 0;
+ char section_name[128];
+
+ parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL);
+
+ /* For explanation - refer to _MSC_VER version of code */
+ strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name));
+ /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
+
+ if (strcmp(section_name, ".bss"))
+ {
+ if (sizeof(val) != sym.st_size)
+ {
+ /* The target value is declared as an int in
+ * asm_*_offsets.c, which is 4 bytes on all
+ * targets we currently use. Complain loudly if
+ * this is not true.
+ */
+ log_msg("Symbol size is wrong\n");
+ goto bail;
+ }
+
+ memcpy(&val,
+ elf.buf + dhdr.sh_offset + sym.st_value,
+ sym.st_size);
+ }
+
+ if (!elf.le_data)
+ {
+ log_msg("Big Endian data not supported yet!\n");
+ goto bail;
+ }
+
+ switch (mode)
+ {
+ case OUTPUT_FMT_RVDS:
+ printf("%-40s EQU %5d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ break;
+ case OUTPUT_FMT_GAS:
+ printf(".equ %-40s, %5d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ break;
+ default:
+ printf("%s = %d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ }
}
- else
- {
- memcpy(&val,
- elf.buf + dhdr.sh_offset + sym.st_value,
- sizeof(val));
- }
-
- if (!elf.le_data)
- {
- log_msg("Big Endian data not supported yet!\n");
- goto bail;
- }\
+ }
+ }
+ }
+ }
+ else /* if (elf.bits == 64) */
+ {
+ Elf64_Shdr shdr;
+ for (i = 0; i < elf.hdr64.e_shnum; i++)
+ {
+ parse_elf_section(&elf, i, NULL, &shdr);
- switch (mode)
+ if (shdr.sh_type == SHT_SYMTAB)
+ {
+ for (ofst = shdr.sh_offset;
+ ofst < shdr.sh_offset + shdr.sh_size;
+ ofst += shdr.sh_entsize)
+ {
+ Elf64_Sym sym;
+
+ parse_elf_symbol(&elf, ofst, NULL, &sym);
+
+ /* For all OBJECTS (data objects), extract the value from the
+ * proper data segment.
+ */
+ /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+ log_msg("found data object %s\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name));
+ */
+
+ if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT
+ && sym.st_size == 4)
{
- case OUTPUT_FMT_RVDS:
- printf("%-40s EQU %5d\n",
- parse_elf32_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- case OUTPUT_FMT_GAS:
- printf(".equ %-40s, %5d\n",
- parse_elf32_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- default:
- printf("%s = %d\n",
- parse_elf32_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
+ Elf64_Shdr dhdr;
+ int val = 0;
+ char section_name[128];
+
+ parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr);
+
+ /* For explanation - refer to _MSC_VER version of code */
+ strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name));
+ /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
+
+ if ((strcmp(section_name, ".bss")))
+ {
+ if (sizeof(val) != sym.st_size)
+ {
+ /* The target value is declared as an int in
+ * asm_*_offsets.c, which is 4 bytes on all
+ * targets we currently use. Complain loudly if
+ * this is not true.
+ */
+ log_msg("Symbol size is wrong\n");
+ goto bail;
+ }
+
+ memcpy(&val,
+ elf.buf + dhdr.sh_offset + sym.st_value,
+ sym.st_size);
+ }
+
+ if (!elf.le_data)
+ {
+ log_msg("Big Endian data not supported yet!\n");
+ goto bail;
+ }
+
+ switch (mode)
+ {
+ case OUTPUT_FMT_RVDS:
+ printf("%-40s EQU %5d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ break;
+ case OUTPUT_FMT_GAS:
+ printf(".equ %-40s, %5d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ break;
+ default:
+ printf("%s = %d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ }
}
}
}
@@ -454,7 +736,7 @@ int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
return 0;
bail:
- log_msg("Parse error: File does not appear to be valid ELF32\n");
+ log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n");
return 1;
}
@@ -521,8 +803,7 @@ int main(int argc, char **argv)
goto bail;
}
- res = parse_elf32(file_buf, stat_buf.st_size, mode);
- //res = parse_coff(file_buf, stat_buf.st_size);
+ res = parse_elf(file_buf, stat_buf.st_size, mode);
free(file_buf);
if (!res)
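
For context on what the parser feeds: the tool compiles asm_*_offsets.c (which defines plain int constants), digs the initialized values back out of the resulting object file, and prints them as assembler equates using the format strings seen above. A hypothetical round trip (names and values invented for illustration):

    /* hypothetical asm_example_offsets.c -- not part of the patch */
    #include <stddef.h>

    struct example { char pad[12]; int field; };
    int example_field_offset = offsetof(struct example, field);   /* 12 */

Run over the compiled object, the RVDS output mode prints a line of the form "example_field_offset  EQU  12" and the GAS mode prints ".equ example_field_offset, 12", making the offsets visible to the hand-written assembly.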
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index d489413f6..804b80bd5 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -312,7 +312,9 @@ void vp8_output_stats(const VP8_COMP *cpi,
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
- fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n",
+ fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
+ " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f"
+ " %12.4f\n",
stats->frame,
stats->intra_error,
stats->coded_error,
@@ -320,6 +322,7 @@ void vp8_output_stats(const VP8_COMP *cpi,
stats->pcnt_inter,
stats->pcnt_motion,
stats->pcnt_second_ref,
+ stats->pcnt_neutral,
stats->MVr,
stats->mvr_abs,
stats->MVc,
@@ -327,7 +330,8 @@ void vp8_output_stats(const VP8_COMP *cpi,
stats->MVrv,
stats->MVcv,
stats->mv_in_out_count,
- stats->count);
+ stats->count,
+ stats->duration);
fclose(fpfile);
@@ -359,6 +363,7 @@ void vp8_zero_stats(FIRSTPASS_STATS *section)
section->pcnt_inter = 0.0;
section->pcnt_motion = 0.0;
section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
section->MVr = 0.0;
section->mvr_abs = 0.0;
section->MVc = 0.0;
@@ -378,6 +383,7 @@ void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
section->pcnt_inter += frame->pcnt_inter;
section->pcnt_motion += frame->pcnt_motion;
section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
section->MVr += frame->MVr;
section->mvr_abs += frame->mvr_abs;
section->MVc += frame->MVc;
@@ -398,6 +404,7 @@ void vp8_avg_stats(FIRSTPASS_STATS *section)
section->ssim_weighted_pred_err /= section->count;
section->pcnt_inter /= section->count;
section->pcnt_second_ref /= section->count;
+ section->pcnt_neutral /= section->count;
section->pcnt_motion /= section->count;
section->MVr /= section->count;
section->mvr_abs /= section->count;
@@ -570,6 +577,7 @@ void vp8_first_pass(VP8_COMP *cpi)
int intercount = 0;
int second_ref_count = 0;
int intrapenalty = 256;
+ int neutral_count = 0;
int sum_in_vectors = 0;
@@ -726,6 +734,17 @@ void vp8_first_pass(VP8_COMP *cpi)
if (motion_error <= this_error)
{
+ // Keep a count of cases where the inter and intra were
+ // very close and very low. This helps with scene cut
+ // detection for example in cropped clips with black bars
+ // at the sides or top and bottom.
+ if( (((this_error-intrapenalty) * 9) <=
+ (motion_error*10)) &&
+ (this_error < (2*intrapenalty)) )
+ {
+ neutral_count++;
+ }
+
d->bmi.mv.as_mv.row <<= 3;
d->bmi.mv.as_mv.col <<= 3;
this_error = motion_error;
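
A minimal sketch of the "neutral" test added above, pulled out as a standalone predicate (the function name is ours; the arithmetic is verbatim from the patch):

    /* A macroblock is "neutral" when the intra and inter errors are both
     * low and within roughly 10% of each other. */
    static int is_neutral_block(int this_error, int motion_error, int intrapenalty)
    {
        return ((this_error - intrapenalty) * 9 <= motion_error * 10) &&
               (this_error < 2 * intrapenalty);
    }

    /* With intrapenalty == 256: this_error = 300, motion_error = 280 gives
     * 44 * 9 = 396 <= 2800 and 300 < 512, so the block counts as neutral. */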
@@ -854,6 +873,7 @@ void vp8_first_pass(VP8_COMP *cpi)
fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs;
fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+ fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
if (mvcount > 0)
{
@@ -1341,7 +1361,7 @@ void vp8_end_second_pass(VP8_COMP *cpi)
// This function gives and estimate of how badly we believe
// the predicition quality is decaying from frame to frame.
-double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
{
double prediction_decay_rate;
double motion_decay;
@@ -1376,6 +1396,52 @@ double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
return prediction_decay_rate;
}
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+BOOL detect_transition_to_still(
+ VP8_COMP *cpi,
+ int frame_interval,
+ int still_interval,
+ double loop_decay_rate,
+ double decay_accumulator )
+{
+ BOOL trans_to_still = FALSE;
+
+ // Break clause to detect very still sections after motion
+ // For example a static image after a fade or other transition
+ // instead of a clean scene cut.
+ if ( (frame_interval > MIN_GF_INTERVAL) &&
+ (loop_decay_rate >= 0.999) &&
+ (decay_accumulator < 0.9) )
+ {
+ int j;
+ FIRSTPASS_STATS * position = cpi->stats_in;
+ FIRSTPASS_STATS tmp_next_frame;
+ double decay_rate;
+
+ // Look ahead a few frames to see if static condition
+ // persists...
+ for ( j = 0; j < still_interval; j++ )
+ {
+ if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
+ break;
+
+ decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
+ if ( decay_rate < 0.999 )
+ break;
+ }
+ // Reset file position
+ reset_fpf_position(cpi, position);
+
+ // Only if it does do we signal a transition to still
+ if ( j == still_interval )
+ trans_to_still = TRUE;
+ }
+
+ return trans_to_still;
+}
+
// Analyse and define a gf/arf group .
static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
@@ -1528,7 +1594,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (r > GF_RMAX)
r = GF_RMAX;
- loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
// Cumulative effect of decay
decay_accumulator = decay_accumulator * loop_decay_rate;
@@ -1537,48 +1603,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
boost_score += (decay_accumulator * r);
// Break clause to detect very still sections after motion
- // For example a staic image after a fade or other transition
- // instead of a clean key frame.
- if ( (i > MIN_GF_INTERVAL) &&
- (loop_decay_rate >= 0.999) &&
- (decay_accumulator < 0.9) )
+ // For example a static image after a fade or other transition.
+ if ( detect_transition_to_still( cpi, i, 5,
+ loop_decay_rate, decay_accumulator ) )
{
- int j;
- FIRSTPASS_STATS * position = cpi->stats_in;
- FIRSTPASS_STATS tmp_next_frame;
- double decay_rate;
-
- // Look ahead a few frames to see if static condition
- // persists...
- for ( j = 0; j < 4; j++ )
- {
- if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
- break;
-
- decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
- if ( decay_rate < 0.999 )
- break;
- }
- reset_fpf_position(cpi, position); // Reset file position
-
- // Force GF not alt ref
- if ( j == 4 )
- {
- if (0)
- {
- FILE *f = fopen("fadegf.stt", "a");
- fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
- cpi->common.current_video_frame+i, i,
- loop_decay_rate, decay_accumulator,
- boost_score );
- fclose(f);
- }
-
- allow_alt_ref = FALSE;
-
- boost_score = old_boost_score;
- break;
- }
+ allow_alt_ref = FALSE;
+ boost_score = old_boost_score;
+ break;
}
// Break out conditions.
@@ -2285,7 +2316,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
(next_frame->pcnt_second_ref < 0.10) &&
((this_frame->pcnt_inter < 0.05) ||
(
- (this_frame->pcnt_inter < .25) &&
+ ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) &&
((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
(fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
@@ -2332,7 +2363,9 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
// Test various breakout clauses
if ((local_next_frame.pcnt_inter < 0.05) ||
(next_iiratio < 1.5) ||
- ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) ||
+ (((local_next_frame.pcnt_inter -
+ local_next_frame.pcnt_neutral) < 0.20) &&
+ (next_iiratio < 3.0)) ||
((boost_score - old_boost_score) < 0.5) ||
(local_next_frame.intra_error < 200)
)
@@ -2363,13 +2396,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
}
void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
- int i;
+ int i,j;
FIRSTPASS_STATS last_frame;
FIRSTPASS_STATS first_frame;
FIRSTPASS_STATS next_frame;
FIRSTPASS_STATS *start_position;
- double decay_accumulator = 0;
+ double decay_accumulator = 1.0;
double boost_score = 0;
double old_boost_score = 0.0;
double loop_decay_rate;
@@ -2379,6 +2412,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double kf_group_intra_err = 0.0;
double kf_group_coded_err = 0.0;
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+ double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
@@ -2407,6 +2441,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
kf_mod_err = calculate_modified_err(cpi, this_frame);
// find the next keyframe
+ i = 0;
while (cpi->stats_in < cpi->stats_in_end)
{
// Accumulate kf group error
@@ -2425,9 +2460,34 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (cpi->oxcf.auto_key
&& lookup_next_frame_stats(cpi, &next_frame) != EOF)
{
+ // Normal scene cut check
if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
break;
+ // How fast is prediction quality decaying
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ // We want to know something about the recent past... rather than
+ // as used elsewhere where we are concerned with decay in prediction
+ // quality since the last GF or KF.
+ recent_loop_decay[i%8] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < 8; j++)
+ {
+ decay_accumulator = decay_accumulator * recent_loop_decay[j];
+ }
+
+ // Special check for a transition or high-motion section followed
+ // by a static scene.
+ if ( detect_transition_to_still( cpi, i,
+ (cpi->key_frame_frequency-i),
+ loop_decay_rate,
+ decay_accumulator ) )
+ {
+ break;
+ }
+
+
// Step on to the next frame
cpi->frames_to_key ++;
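
The eight-entry window above keeps a rolling product of recent per-frame decay rates, as opposed to the decay accumulated since the last key frame. An equivalent standalone sketch (names ours, not code from the tree):

    /* Overwrite the oldest of the last 8 decay rates and return their product. */
    static double recent_decay_product(double recent[8], int frame_idx,
                                       double loop_decay_rate)
    {
        double acc = 1.0;
        int j;

        recent[frame_idx % 8] = loop_decay_rate;
        for (j = 0; j < 8; j++)
            acc *= recent[j];
        return acc;   /* stays near 1.0 only if all 8 recent frames predicted well */
    }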
@@ -2437,6 +2497,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
break;
} else
cpi->frames_to_key ++;
+
+ i++;
}
// If there is a max kf interval set by the user we must obey it.
@@ -2588,32 +2650,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (r > RMAX)
r = RMAX;
- // Adjust loop decay rate
- //if ( next_frame.pcnt_inter < loop_decay_rate )
- loop_decay_rate = next_frame.pcnt_inter;
-
- // High % motion -> somewhat higher decay rate
- motion_pct = next_frame.pcnt_motion;
- motion_decay = (1.0 - (motion_pct / 20.0));
- if (motion_decay < loop_decay_rate)
- loop_decay_rate = motion_decay;
-
- // Adjustment to decay rate based on speed of motion
- {
- double this_mv_rabs;
- double this_mv_cabs;
- double distance_factor;
-
- this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
- this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
-
- distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
- (this_mv_cabs * this_mv_cabs)) / 250.0;
- distance_factor = ((distance_factor > 1.0)
- ? 0.0 : (1.0 - distance_factor));
- if (distance_factor < loop_decay_rate)
- loop_decay_rate = distance_factor;
- }
+ // How fast is prediction quality decaying
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
decay_accumulator = decay_accumulator * loop_decay_rate;
decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 5a4b3c185..0d353c31f 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -99,6 +99,7 @@ typedef struct
double pcnt_inter;
double pcnt_motion;
double pcnt_second_ref;
+ double pcnt_neutral;
double MVr;
double mvr_abs;
double MVc;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 3db05155c..867ff6a9c 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2033,7 +2033,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
else
cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
}
- else if (vp8_ref_frame_order[mode_index] == SPLITMV)
+ else if (vp8_mode_order[mode_index] == SPLITMV)
cpi->zbin_mode_boost = 0;
else
cpi->zbin_mode_boost = MV_ZBIN_BOOST;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 6cdc47bc9..5d1a17d44 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -627,6 +627,10 @@ filter_block2d_bil_var_sse2_loop:
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
+ je filter_block2d_bil_var_sse2_full_pixel
+
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
@@ -671,6 +675,35 @@ filter_block2d_bil_sp_only_loop:
jmp filter_block2d_bil_variance
+filter_block2d_bil_var_sse2_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0 ;
+
+filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movq xmm2, QWORD PTR [rdi] ;
+ punpcklbw xmm2, xmm0 ;
+
+ psubw xmm1, xmm2 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_full_pixel_loop ;
+
+ jmp filter_block2d_bil_variance
+
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
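
In scalar terms, the new full-pixel path accumulates, per 8-pixel row, the sum and the sum of squares of (ref - src); xmm6 holds the running sum and xmm7 the running sum of squares. An illustrative C equivalent (not code from the tree):

    static void full_pixel_var_8wide(const unsigned char *ref, int ref_stride,
                                     const unsigned char *src, int src_stride,
                                     unsigned int height, int *sum,
                                     unsigned int *sse)
    {
        unsigned int i, j;
        int s = 0;
        unsigned int ss = 0;

        for (i = 0; i < height; i++)
        {
            for (j = 0; j < 8; j++)
            {
                int diff = ref[j] - src[j];
                s  += diff;            /* paddw xmm6 */
                ss += diff * diff;     /* pmaddwd / paddd xmm7 */
            }
            ref += ref_stride;
            src += src_stride;
        }
        *sum = s;
        *sse = ss;
    }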
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
new file mode 100644
index 000000000..b1976328d
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -0,0 +1,348 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+
+;void vp8_filter_block2d_bil_var_ssse3
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+; unsigned int *sumsquared;;
+;
+;)
+;Note: The filter coefficient at offset=0 is 128. Since the second register
+;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
+global sym(vp8_filter_block2d_bil_var_ssse3)
+sym(vp8_filter_block2d_bil_var_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je filter_block2d_bil_var_ssse3_sp_only
+
+ shl rax, 4 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je filter_block2d_bil_var_ssse3_fp_only
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi+1]
+ movdqa xmm2, xmm0
+
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, [rax]
+ pmaddubsw xmm2, [rax]
+
+ paddw xmm0, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm0, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ packuswb xmm0, xmm2
+
+ movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
+ lea rsi, [rsi + rbx]
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_ssse3_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+ packuswb xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ movdqa xmm0, xmm1
+ movdqa xmm3, xmm2
+
+ punpcklbw xmm2, xmm1
+ punpckhbw xmm3, xmm1
+ pmaddubsw xmm2, [rdx]
+ pmaddubsw xmm3, [rdx]
+
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm2, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm1, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm1, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm2, xmm1
+ psubw xmm3, xmm5
+ paddw xmm6, xmm2
+ paddw xmm6, xmm3
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm2
+ paddd xmm7, xmm3
+
+ lea rsi, [rsi + rbx] ;ref_pixels_per_line
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz filter_block2d_bil_var_ssse3_loop
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; Both xoffset =0 and yoffset=0
+ je filter_block2d_bil_var_ssse3_full_pixel
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqa xmm0, xmm1
+
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+ movdqu xmm3, XMMWORD PTR [rsi]
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm3
+
+ punpcklbw xmm1, xmm3
+ punpckhbw xmm2, xmm3
+ pmaddubsw xmm1, [rdx]
+ pmaddubsw xmm2, [rdx]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ movq xmm3, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm3, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ movdqa xmm1, xmm0
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1
+ jnz filter_block2d_bil_sp_only_loop
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi]
+ punpcklbw xmm1, xmm0
+ movq xmm2, QWORD PTR [rsi+8]
+ punpcklbw xmm2, xmm0
+
+ movq xmm3, QWORD PTR [rdi]
+ punpcklbw xmm3, xmm0
+ movq xmm4, QWORD PTR [rdi+8]
+ punpcklbw xmm4, xmm0
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm4
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+ sub rcx, 1
+ jnz filter_block2d_bil_full_pixel_loop
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm2, XMMWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm2, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm3
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm1
+ paddd xmm7, xmm3
+
+ lea rsi, [rsi + rdx]
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1
+ jnz filter_block2d_bil_fp_only_loop
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(7) ;[Sum]
+ mov rdi, arg(8) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
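
Each tap computed by the routine above is a two-tap bilinear filter whose coefficients come from vp8_bilinear_filters_ssse3, with xmm_bi_rd (64) as the rounding term and a shift of xmm_filter_shift (7). An illustrative scalar form (not code from the tree) that also shows why offset 0 needs its own path:

    /* Two-tap bilinear filter matching the table above: (128 - 16*offset,
     * 16*offset) for offset 0..7.  At offset 0 the coefficient would be 128,
     * which does not fit in pmaddubsw's signed-byte operand, hence the
     * separate sp_only / fp_only / full-pixel branches in the assembly. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b, int offset)
    {
        int f1 = 16 * offset;
        int f0 = 128 - f1;
        return (unsigned char)((a * f0 + b * f1 + 64) >> 7);
    }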
diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c
new file mode 100644
index 000000000..750ae8b86
--- /dev/null
+++ b/vp8/encoder/x86/variance_ssse3.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern unsigned int vp8_get16x16var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_horiz_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_ssse3
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int xoffset,
+ int yoffset,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+unsigned int vp8_sub_pixel_variance16x16_ssse3
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ // note we could avoid these if statements if the calling function
+ // just called the appropriate functions inside.
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
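
The return statement above is the usual single-pass variance identity for a 16x16 block: Var = SSE - Sum*Sum / N with N = 256, which is where the >> 8 comes from. As a standalone sketch:

    /* Combine the accumulated Sum and SSE into the 16x16 block variance. */
    static unsigned int variance_16x16(unsigned int sse, int sum)
    {
        return sse - (unsigned int)((sum * sum) >> 8);
    }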
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 6bea15ebc..1e2fb3490 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -286,6 +286,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
#if HAVE_SSSE3
extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_sad16x16x3
@@ -294,6 +295,9 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
#undef vp8_variance_sad16x8x3
#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3
+
#endif
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 61c603229..c7639a7e4 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -334,6 +334,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
+
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
}
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index ed2feddae..c0ae250f5 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -110,6 +110,8 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm