diff options
author | John Koleszar <jkoleszar@google.com> | 2011-03-09 00:05:06 -0500 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2011-03-09 00:05:06 -0500 |
commit | 96208f2e45f0fb0e9e2a0b47c9b45bd4b0247482 (patch) | |
tree | be2c9e1d03eb309d6d7f8c3b000c8b15f1aec5bf | |
parent | b7ec812e2db3bca83d15faadd14eba444fb55f4d (diff) | |
parent | 95adf3df77439740526b73bca7242f7096687d82 (diff) | |
download | libvpx-96208f2e45f0fb0e9e2a0b47c9b45bd4b0247482.tar libvpx-96208f2e45f0fb0e9e2a0b47c9b45bd4b0247482.tar.gz libvpx-96208f2e45f0fb0e9e2a0b47c9b45bd4b0247482.tar.bz2 libvpx-96208f2e45f0fb0e9e2a0b47c9b45bd4b0247482.zip |
Merge remote branch 'internal/upstream' into HEAD
-rw-r--r-- | build/make/obj_int_extract.c | 645 | ||||
-rw-r--r-- | vp8/encoder/firstpass.c | 188 | ||||
-rw-r--r-- | vp8/encoder/onyx_int.h | 1 | ||||
-rw-r--r-- | vp8/encoder/rdopt.c | 2 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_impl_sse2.asm | 33 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_impl_ssse3.asm | 348 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_ssse3.c | 140 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_x86.h | 4 | ||||
-rw-r--r-- | vp8/encoder/x86/x86_csystemdependent.c | 2 | ||||
-rw-r--r-- | vp8/vp8cx.mk | 2 |
10 files changed, 1107 insertions, 258 deletions
diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c index 3c54b248f..26cf45782 100644 --- a/build/make/obj_int_extract.c +++ b/build/make/obj_int_extract.c @@ -59,20 +59,47 @@ int parse_macho(uint8_t *base_buf, size_t sz) struct mach_header header; uint8_t *buf = base_buf; int base_data_section = 0; - + int bits = 0; + + /* We can read in mach_header for 32 and 64 bit architectures + * because it's identical to mach_header_64 except for the last + * element (uint32_t reserved), which we don't use. Then, when + * we know which architecture we're looking at, increment buf + * appropriately. + */ memcpy(&header, buf, sizeof(struct mach_header)); - buf += sizeof(struct mach_header); - if (header.magic != MH_MAGIC) + if (header.magic == MH_MAGIC) { - log_msg("Bad magic number for object file. 0x%x expected, 0x%x found.\n", - header.magic, MH_MAGIC); - goto bail; + if (header.cputype == CPU_TYPE_ARM + || header.cputype == CPU_TYPE_X86) + { + bits = 32; + buf += sizeof(struct mach_header); + } + else + { + log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n"); + goto bail; + } } - - if (header.cputype != CPU_TYPE_ARM) + else if (header.magic == MH_MAGIC_64) { - log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_ARM.\n"); + if (header.cputype == CPU_TYPE_X86_64) + { + bits = 64; + buf += sizeof(struct mach_header_64); + } + else + { + log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n"); + goto bail; + } + } + else + { + log_msg("Bad magic number for object file. 
0x%x or 0x%x expected, 0x%x found.\n", + MH_MAGIC, MH_MAGIC_64, header.magic); goto bail; } @@ -85,8 +112,6 @@ int parse_macho(uint8_t *base_buf, size_t sz) for (i = 0; i < header.ncmds; i++) { struct load_command lc; - struct symtab_command sc; - struct segment_command seg_c; memcpy(&lc, buf, sizeof(struct load_command)); @@ -94,50 +119,99 @@ int parse_macho(uint8_t *base_buf, size_t sz) { uint8_t *seg_buf = buf; struct section s; + struct segment_command seg_c; - memcpy(&seg_c, buf, sizeof(struct segment_command)); - + memcpy(&seg_c, seg_buf, sizeof(struct segment_command)); seg_buf += sizeof(struct segment_command); - for (j = 0; j < seg_c.nsects; j++) + /* Although each section is given it's own offset, nlist.n_value + * references the offset of the first section. This isn't + * apparent without debug information because the offset of the + * data section is the same as the first section. However, with + * debug sections mixed in, the offset of the debug section + * increases but n_value still references the first section. + */ + if (seg_c.nsects < 1) { - memcpy(&s, seg_buf + (j * sizeof(struct section)), sizeof(struct section)); + log_msg("Not enough sections\n"); + goto bail; + } + + memcpy(&s, seg_buf, sizeof(struct section)); + base_data_section = s.offset; + } + else if (lc.cmd == LC_SEGMENT_64) + { + uint8_t *seg_buf = buf; + struct section_64 s; + struct segment_command_64 seg_c; + + memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64)); + seg_buf += sizeof(struct segment_command_64); - // Need to get this offset which is the start of the symbol table - // before matching the strings up with symbols. 
- base_data_section = s.offset; + /* Explanation in LC_SEGMENT */ + if (seg_c.nsects < 1) + { + log_msg("Not enough sections\n"); + goto bail; } + + memcpy(&s, seg_buf, sizeof(struct section_64)); + base_data_section = s.offset; } else if (lc.cmd == LC_SYMTAB) { - uint8_t *sym_buf = base_buf; - uint8_t *str_buf = base_buf; - if (base_data_section != 0) { + struct symtab_command sc; + uint8_t *sym_buf = base_buf; + uint8_t *str_buf = base_buf; + memcpy(&sc, buf, sizeof(struct symtab_command)); if (sc.cmdsize != sizeof(struct symtab_command)) + { log_msg("Can't find symbol table!\n"); + goto bail; + } sym_buf += sc.symoff; str_buf += sc.stroff; for (j = 0; j < sc.nsyms; j++) { - struct nlist nl; - int val; + /* Location of string is calculated each time from the + * start of the string buffer. On darwin the symbols + * are prefixed by "_", so we bump the pointer by 1. + * The target value is defined as an int in asm_*_offsets.c, + * which is 4 bytes on all targets we currently use. + */ + if (bits == 32) + { + struct nlist nl; + int val; - memcpy(&nl, sym_buf + (j * sizeof(struct nlist)), sizeof(struct nlist)); + memcpy(&nl, sym_buf, sizeof(struct nlist)); + sym_buf += sizeof(struct nlist); - val = *((int *)(base_buf + base_data_section + nl.n_value)); + memcpy(&val, base_buf + base_data_section + nl.n_value, + sizeof(val)); + printf("%-40s EQU %5d\n", + str_buf + nl.n_un.n_strx + 1, val); + } + else /* if (bits == 64) */ + { + struct nlist_64 nl; + int val; - // Location of string is cacluated each time from the - // start of the string buffer. On darwin the symbols - // are prefixed by "_". On other platforms it is not - // so it needs to be removed. That is the reason for - // the +1. 
- printf("%-40s EQU %5d\n", str_buf + nl.n_un.n_strx + 1, val); + memcpy(&nl, sym_buf, sizeof(struct nlist_64)); + sym_buf += sizeof(struct nlist_64); + + memcpy(&val, base_buf + base_data_section + nl.n_value, + sizeof(val)); + printf("%-40s EQU %5d\n", + str_buf + nl.n_un.n_strx + 1, val); + } } } } @@ -218,7 +292,7 @@ bail: return EXIT_FAILURE; } -#else +#elif defined(__ELF__) #include "elf.h" #define COPY_STRUCT(dst, buf, ofst, sz) do {\ @@ -237,212 +311,420 @@ bail: typedef struct { - uint8_t *buf; /* Buffer containing ELF data */ - size_t sz; /* Buffer size */ - int le_data; /* Data is little-endian */ - Elf32_Ehdr hdr; + uint8_t *buf; /* Buffer containing ELF data */ + size_t sz; /* Buffer size */ + int le_data; /* Data is little-endian */ + unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */ + int bits; /* 32 or 64 */ + Elf32_Ehdr hdr32; + Elf64_Ehdr hdr64; } elf_obj_t; -int parse_elf32_header(elf_obj_t *elf) +int parse_elf_header(elf_obj_t *elf) { int res; - /* Verify ELF32 header */ - COPY_STRUCT(&elf->hdr, elf->buf, 0, elf->sz); - res = elf->hdr.e_ident[EI_MAG0] == ELFMAG0; - res &= elf->hdr.e_ident[EI_MAG1] == ELFMAG1; - res &= elf->hdr.e_ident[EI_MAG2] == ELFMAG2; - res &= elf->hdr.e_ident[EI_MAG3] == ELFMAG3; - res &= elf->hdr.e_ident[EI_CLASS] == ELFCLASS32; - res &= elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB - || elf->hdr.e_ident[EI_DATA] == ELFDATA2MSB; + /* Verify ELF Magic numbers */ + COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz); + res = elf->e_ident[EI_MAG0] == ELFMAG0; + res &= elf->e_ident[EI_MAG1] == ELFMAG1; + res &= elf->e_ident[EI_MAG2] == ELFMAG2; + res &= elf->e_ident[EI_MAG3] == ELFMAG3; + res &= elf->e_ident[EI_CLASS] == ELFCLASS32 + || elf->e_ident[EI_CLASS] == ELFCLASS64; + res &= elf->e_ident[EI_DATA] == ELFDATA2LSB; if (!res) goto bail; - elf->le_data = elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB; - - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_type); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_machine); - 
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_version); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_entry); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phoff); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shoff); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_flags); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_ehsize); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phentsize); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phnum); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shentsize); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shnum); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shstrndx); + elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB; + + /* Read in relevant values */ + if (elf->e_ident[EI_CLASS] == ELFCLASS32) + { + elf->bits = 32; + COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz); + + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx); + } + else /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */ + { + elf->bits = 64; + COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz); + + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize); + 
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx); + } + return 0; bail: + log_msg("Failed to parse ELF file header"); return 1; } -int parse_elf32_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr) +int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64) { - if (idx >= elf->hdr.e_shnum) - goto bail; + if (hdr32) + { + if (idx >= elf->hdr32.e_shnum) + goto bail; + + COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize, + elf->sz); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize); + } + else /* if (hdr64) */ + { + if (idx >= elf->hdr64.e_shnum) + goto bail; + + COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize, + elf->sz); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize); + } - COPY_STRUCT(hdr, elf->buf, elf->hdr.e_shoff + idx * elf->hdr.e_shentsize, - elf->sz); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_name); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_type); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_flags); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addr); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_offset); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_size); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_link); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_info); - 
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addralign); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_entsize); return 0; bail: return 1; } -char *parse_elf32_string_table(elf_obj_t *elf, int s_idx, int idx) +char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) { - Elf32_Shdr shdr; - - if (parse_elf32_section(elf, s_idx, &shdr)) + if (elf->bits == 32) { - log_msg("Failed to parse ELF string table: section %d, index %d\n", - s_idx, idx); - return ""; + Elf32_Shdr shdr; + + if (parse_elf_section(elf, s_idx, &shdr, NULL)) + { + log_msg("Failed to parse ELF string table: section %d, index %d\n", + s_idx, idx); + return ""; + } + + return (char *)(elf->buf + shdr.sh_offset + idx); } + else /* if (elf->bits == 64) */ + { + Elf64_Shdr shdr; + + if (parse_elf_section(elf, s_idx, NULL, &shdr)) + { + log_msg("Failed to parse ELF string table: section %d, index %d\n", + s_idx, idx); + return ""; + } - return (char *)(elf->buf + shdr.sh_offset + idx); + return (char *)(elf->buf + shdr.sh_offset + idx); + } } -int parse_elf32_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym) +int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64) { - COPY_STRUCT(sym, elf->buf, ofst, elf->sz); - ENDIAN_ASSIGN_IN_PLACE(sym->st_name); - ENDIAN_ASSIGN_IN_PLACE(sym->st_value); - ENDIAN_ASSIGN_IN_PLACE(sym->st_size); - ENDIAN_ASSIGN_IN_PLACE(sym->st_info); - ENDIAN_ASSIGN_IN_PLACE(sym->st_other); - ENDIAN_ASSIGN_IN_PLACE(sym->st_shndx); + if (sym32) + { + COPY_STRUCT(sym32, elf->buf, ofst, elf->sz); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_name); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_value); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_size); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_info); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_other); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx); + } + else /* if (sym64) */ + { + COPY_STRUCT(sym64, elf->buf, ofst, elf->sz); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_name); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_value); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_size); + 
ENDIAN_ASSIGN_IN_PLACE(sym64->st_info); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_other); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx); + } return 0; bail: return 1; } -int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode) +int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) { - elf_obj_t elf; - Elf32_Shdr shdr; + elf_obj_t elf; unsigned int ofst; - int i; - Elf32_Off strtab_off; /* save String Table offset for later use */ + int i; + Elf32_Off strtab_off32; + Elf64_Off strtab_off64; /* save String Table offset for later use */ memset(&elf, 0, sizeof(elf)); elf.buf = buf; elf.sz = sz; /* Parse Header */ - if (parse_elf32_header(&elf)) - { - log_msg("Parse error: File does not appear to be valid ELF32\n"); - return 1; - } + if (parse_elf_header(&elf)) + goto bail; - for (i = 0; i < elf.hdr.e_shnum; i++) + if (elf.bits == 32) { - parse_elf32_section(&elf, i, &shdr); - - if (shdr.sh_type == SHT_STRTAB) + Elf32_Shdr shdr; + for (i = 0; i < elf.hdr32.e_shnum; i++) { - char strtsb_name[128]; - - strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name)); + parse_elf_section(&elf, i, &shdr, NULL); - if (!(strcmp(strtsb_name, ".shstrtab"))) + if (shdr.sh_type == SHT_STRTAB) { - log_msg("found section: %s\n", strtsb_name); - strtab_off = shdr.sh_offset; - break; + char strtsb_name[128]; + + strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name)); + + if (!(strcmp(strtsb_name, ".shstrtab"))) + { + /* log_msg("found section: %s\n", strtsb_name); */ + strtab_off32 = shdr.sh_offset; + break; + } } } } - - /* Parse all Symbol Tables */ - for (i = 0; i < elf.hdr.e_shnum; i++) + else /* if (elf.bits == 64) */ { - - parse_elf32_section(&elf, i, &shdr); - - if (shdr.sh_type == SHT_SYMTAB) + Elf64_Shdr shdr; + for (i = 0; i < elf.hdr64.e_shnum; i++) { - for (ofst = shdr.sh_offset; - ofst < shdr.sh_offset + shdr.sh_size; - ofst += shdr.sh_entsize) - { - Elf32_Sym sym; + parse_elf_section(&elf, i, NULL, &shdr); - parse_elf32_symbol(&elf, ofst, &sym); + if 
(shdr.sh_type == SHT_STRTAB) + { + char strtsb_name[128]; - /* For all OBJECTS (data objects), extract the value from the - * proper data segment. - */ - if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name) - log_msg("found data object %s\n", - parse_elf32_string_table(&elf, - shdr.sh_link, - sym.st_name)); + strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name)); - if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT - && sym.st_size == 4) + if (!(strcmp(strtsb_name, ".shstrtab"))) { - Elf32_Shdr dhdr; - int32_t val; - char section_name[128]; - - parse_elf32_section(&elf, sym.st_shndx, &dhdr); + /* log_msg("found section: %s\n", strtsb_name); */ + strtab_off64 = shdr.sh_offset; + break; + } + } + } + } - /* For explanition - refer to _MSC_VER version of code */ - strcpy(section_name, (char *)(elf.buf + strtab_off + dhdr.sh_name)); - log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); + /* Parse all Symbol Tables */ + if (elf.bits == 32) + { + Elf32_Shdr shdr; + for (i = 0; i < elf.hdr32.e_shnum; i++) + { + parse_elf_section(&elf, i, &shdr, NULL); - if (!(strcmp(section_name, ".bss"))) + if (shdr.sh_type == SHT_SYMTAB) + { + for (ofst = shdr.sh_offset; + ofst < shdr.sh_offset + shdr.sh_size; + ofst += shdr.sh_entsize) + { + Elf32_Sym sym; + + parse_elf_symbol(&elf, ofst, &sym, NULL); + + /* For all OBJECTS (data objects), extract the value from the + * proper data segment. 
+ */ + /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name) + log_msg("found data object %s\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name)); + */ + + if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT + && sym.st_size == 4) { - val = 0; + Elf32_Shdr dhdr; + int val = 0; + char section_name[128]; + + parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL); + + /* For explanition - refer to _MSC_VER version of code */ + strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name)); + /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */ + + if (strcmp(section_name, ".bss")) + { + if (sizeof(val) != sym.st_size) + { + /* The target value is declared as an int in + * asm_*_offsets.c, which is 4 bytes on all + * targets we currently use. Complain loudly if + * this is not true. + */ + log_msg("Symbol size is wrong\n"); + goto bail; + } + + memcpy(&val, + elf.buf + dhdr.sh_offset + sym.st_value, + sym.st_size); + } + + if (!elf.le_data) + { + log_msg("Big Endian data not supported yet!\n"); + goto bail; + } + + switch (mode) + { + case OUTPUT_FMT_RVDS: + printf("%-40s EQU %5d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + break; + case OUTPUT_FMT_GAS: + printf(".equ %-40s, %5d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + break; + default: + printf("%s = %d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + } } - else - { - memcpy(&val, - elf.buf + dhdr.sh_offset + sym.st_value, - sizeof(val)); - } - - if (!elf.le_data) - { - log_msg("Big Endian data not supported yet!\n"); - goto bail; - }\ + } + } + } + } + else /* if (elf.bits == 64) */ + { + Elf64_Shdr shdr; + for (i = 0; i < elf.hdr64.e_shnum; i++) + { + parse_elf_section(&elf, i, NULL, &shdr); - switch (mode) + if (shdr.sh_type == SHT_SYMTAB) + { + for (ofst = shdr.sh_offset; + ofst < shdr.sh_offset + shdr.sh_size; + ofst += shdr.sh_entsize) + { + Elf64_Sym sym; + + 
parse_elf_symbol(&elf, ofst, NULL, &sym); + + /* For all OBJECTS (data objects), extract the value from the + * proper data segment. + */ + /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name) + log_msg("found data object %s\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name)); + */ + + if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT + && sym.st_size == 4) { - case OUTPUT_FMT_RVDS: - printf("%-40s EQU %5d\n", - parse_elf32_string_table(&elf, - shdr.sh_link, - sym.st_name), - val); - break; - case OUTPUT_FMT_GAS: - printf(".equ %-40s, %5d\n", - parse_elf32_string_table(&elf, - shdr.sh_link, - sym.st_name), - val); - break; - default: - printf("%s = %d\n", - parse_elf32_string_table(&elf, - shdr.sh_link, - sym.st_name), - val); + Elf64_Shdr dhdr; + int val = 0; + char section_name[128]; + + parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr); + + /* For explanition - refer to _MSC_VER version of code */ + strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name)); + /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */ + + if ((strcmp(section_name, ".bss"))) + { + if (sizeof(val) != sym.st_size) + { + /* The target value is declared as an int in + * asm_*_offsets.c, which is 4 bytes on all + * targets we currently use. Complain loudly if + * this is not true. 
+ */ + log_msg("Symbol size is wrong\n"); + goto bail; + } + + memcpy(&val, + elf.buf + dhdr.sh_offset + sym.st_value, + sym.st_size); + } + + if (!elf.le_data) + { + log_msg("Big Endian data not supported yet!\n"); + goto bail; + } + + switch (mode) + { + case OUTPUT_FMT_RVDS: + printf("%-40s EQU %5d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + break; + case OUTPUT_FMT_GAS: + printf(".equ %-40s, %5d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + break; + default: + printf("%s = %d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + } } } } @@ -454,7 +736,7 @@ int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode) return 0; bail: - log_msg("Parse error: File does not appear to be valid ELF32\n"); + log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n"); return 1; } @@ -521,8 +803,7 @@ int main(int argc, char **argv) goto bail; } - res = parse_elf32(file_buf, stat_buf.st_size, mode); - //res = parse_coff(file_buf, stat_buf.st_size); + res = parse_elf(file_buf, stat_buf.st_size, mode); free(file_buf); if (!res) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index d489413f6..804b80bd5 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -312,7 +312,9 @@ void vp8_output_stats(const VP8_COMP *cpi, FILE *fpfile; fpfile = fopen("firstpass.stt", "a"); - fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n", + fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f" + " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f" + " %12.4f\n", stats->frame, stats->intra_error, stats->coded_error, @@ -320,6 +322,7 @@ void vp8_output_stats(const VP8_COMP *cpi, stats->pcnt_inter, stats->pcnt_motion, stats->pcnt_second_ref, + stats->pcnt_neutral, stats->MVr, stats->mvr_abs, stats->MVc, @@ -327,7 +330,8 @@ void vp8_output_stats(const VP8_COMP *cpi, stats->MVrv, 
stats->MVcv, stats->mv_in_out_count, - stats->count); + stats->count, + stats->duration); fclose(fpfile); @@ -359,6 +363,7 @@ void vp8_zero_stats(FIRSTPASS_STATS *section) section->pcnt_inter = 0.0; section->pcnt_motion = 0.0; section->pcnt_second_ref = 0.0; + section->pcnt_neutral = 0.0; section->MVr = 0.0; section->mvr_abs = 0.0; section->MVc = 0.0; @@ -378,6 +383,7 @@ void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) section->pcnt_inter += frame->pcnt_inter; section->pcnt_motion += frame->pcnt_motion; section->pcnt_second_ref += frame->pcnt_second_ref; + section->pcnt_neutral += frame->pcnt_neutral; section->MVr += frame->MVr; section->mvr_abs += frame->mvr_abs; section->MVc += frame->MVc; @@ -398,6 +404,7 @@ void vp8_avg_stats(FIRSTPASS_STATS *section) section->ssim_weighted_pred_err /= section->count; section->pcnt_inter /= section->count; section->pcnt_second_ref /= section->count; + section->pcnt_neutral /= section->count; section->pcnt_motion /= section->count; section->MVr /= section->count; section->mvr_abs /= section->count; @@ -570,6 +577,7 @@ void vp8_first_pass(VP8_COMP *cpi) int intercount = 0; int second_ref_count = 0; int intrapenalty = 256; + int neutral_count = 0; int sum_in_vectors = 0; @@ -726,6 +734,17 @@ void vp8_first_pass(VP8_COMP *cpi) if (motion_error <= this_error) { + // Keep a count of cases where the inter and intra were + // very close and very low. This helps with scene cut + // detection for example in cropped clips with black bars + // at the sides or top and bottom. 
+ if( (((this_error-intrapenalty) * 9) <= + (motion_error*10)) && + (this_error < (2*intrapenalty)) ) + { + neutral_count++; + } + d->bmi.mv.as_mv.row <<= 3; d->bmi.mv.as_mv.col <<= 3; this_error = motion_error; @@ -854,6 +873,7 @@ void vp8_first_pass(VP8_COMP *cpi) fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs; fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs; + fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs; if (mvcount > 0) { @@ -1341,7 +1361,7 @@ void vp8_end_second_pass(VP8_COMP *cpi) // This function gives and estimate of how badly we believe // the predicition quality is decaying from frame to frame. -double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) +double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) { double prediction_decay_rate; double motion_decay; @@ -1376,6 +1396,52 @@ double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) return prediction_decay_rate; } +// Function to test for a condition where a complex transition is followed +// by a static section. For example in slide shows where there is a fade +// between slides. This is to help with more optimal kf and gf positioning. +BOOL detect_transition_to_still( + VP8_COMP *cpi, + int frame_interval, + int still_interval, + double loop_decay_rate, + double decay_accumulator ) +{ + BOOL trans_to_still = FALSE; + + // Break clause to detect very still sections after motion + // For example a static image after a fade or other transition + // instead of a clean scene cut. + if ( (frame_interval > MIN_GF_INTERVAL) && + (loop_decay_rate >= 0.999) && + (decay_accumulator < 0.9) ) + { + int j; + FIRSTPASS_STATS * position = cpi->stats_in; + FIRSTPASS_STATS tmp_next_frame; + double decay_rate; + + // Look ahead a few frames to see if static condition + // persists... 
+ for ( j = 0; j < still_interval; j++ ) + { + if (EOF == vp8_input_stats(cpi, &tmp_next_frame)) + break; + + decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame); + if ( decay_rate < 0.999 ) + break; + } + // Reset file position + reset_fpf_position(cpi, position); + + // Only if it does do we signal a transition to still + if ( j == still_interval ) + trans_to_still = TRUE; + } + + return trans_to_still; +} + // Analyse and define a gf/arf group . static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { @@ -1528,7 +1594,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (r > GF_RMAX) r = GF_RMAX; - loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame); + loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); // Cumulative effect of decay decay_accumulator = decay_accumulator * loop_decay_rate; @@ -1537,48 +1603,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) boost_score += (decay_accumulator * r); // Break clause to detect very still sections after motion - // For example a staic image after a fade or other transition - // instead of a clean key frame. - if ( (i > MIN_GF_INTERVAL) && - (loop_decay_rate >= 0.999) && - (decay_accumulator < 0.9) ) + // For example a static image after a fade or other transition. + if ( detect_transition_to_still( cpi, i, 5, + loop_decay_rate, decay_accumulator ) ) { - int j; - FIRSTPASS_STATS * position = cpi->stats_in; - FIRSTPASS_STATS tmp_next_frame; - double decay_rate; - - // Look ahead a few frames to see if static condition - // persists... 
- for ( j = 0; j < 4; j++ ) - { - if (EOF == vp8_input_stats(cpi, &tmp_next_frame)) - break; - - decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame); - if ( decay_rate < 0.999 ) - break; - } - reset_fpf_position(cpi, position); // Reset file position - - // Force GF not alt ref - if ( j == 4 ) - { - if (0) - { - FILE *f = fopen("fadegf.stt", "a"); - fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n", - cpi->common.current_video_frame+i, i, - loop_decay_rate, decay_accumulator, - boost_score ); - fclose(f); - } - - allow_alt_ref = FALSE; - - boost_score = old_boost_score; - break; - } + allow_alt_ref = FALSE; + boost_score = old_boost_score; + break; } // Break out conditions. @@ -2285,7 +2316,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST (next_frame->pcnt_second_ref < 0.10) && ((this_frame->pcnt_inter < 0.05) || ( - (this_frame->pcnt_inter < .25) && + ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) && ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) && ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) || (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) || @@ -2332,7 +2363,9 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST // Test various breakout clauses if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || - ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) || + (((local_next_frame.pcnt_inter - + local_next_frame.pcnt_neutral) < 0.20) && + (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 0.5) || (local_next_frame.intra_error < 200) ) @@ -2363,13 +2396,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST } void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { - int i; + int i,j; FIRSTPASS_STATS last_frame; FIRSTPASS_STATS first_frame; 
FIRSTPASS_STATS next_frame; FIRSTPASS_STATS *start_position; - double decay_accumulator = 0; + double decay_accumulator = 1.0; double boost_score = 0; double old_boost_score = 0.0; double loop_decay_rate; @@ -2379,6 +2412,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) double kf_group_intra_err = 0.0; double kf_group_coded_err = 0.0; double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); + double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean @@ -2407,6 +2441,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) kf_mod_err = calculate_modified_err(cpi, this_frame); // find the next keyframe + i = 0; while (cpi->stats_in < cpi->stats_in_end) { // Accumulate kf group error @@ -2425,9 +2460,34 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (cpi->oxcf.auto_key && lookup_next_frame_stats(cpi, &next_frame) != EOF) { + // Normal scene cut check if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) break; + // How fast is prediction quality decaying + loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); + + // We want to know something about the recent past... rather than + // as used elsewhere where we are concerned with decay in prediction + // quality since the last GF or KF. + recent_loop_decay[i%8] = loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < 8; j++) + { + decay_accumulator = decay_accumulator * recent_loop_decay[j]; + } + + // Special check for transition or high motion followed by a + // static scene. 
+ if ( detect_transition_to_still( cpi, i, + (cpi->key_frame_frequency-i), + loop_decay_rate, + decay_accumulator ) ) + { + break; + } + + // Step on to the next frame cpi->frames_to_key ++; @@ -2437,6 +2497,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) break; } else cpi->frames_to_key ++; + + i++; } // If there is a max kf interval set by the user we must obey it. @@ -2588,32 +2650,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (r > RMAX) r = RMAX; - // Adjust loop decay rate - //if ( next_frame.pcnt_inter < loop_decay_rate ) - loop_decay_rate = next_frame.pcnt_inter; - - // High % motion -> somewhat higher decay rate - motion_pct = next_frame.pcnt_motion; - motion_decay = (1.0 - (motion_pct / 20.0)); - if (motion_decay < loop_decay_rate) - loop_decay_rate = motion_decay; - - // Adjustment to decay rate based on speed of motion - { - double this_mv_rabs; - double this_mv_cabs; - double distance_factor; - - this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct); - this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct); - - distance_factor = sqrt((this_mv_rabs * this_mv_rabs) + - (this_mv_cabs * this_mv_cabs)) / 250.0; - distance_factor = ((distance_factor > 1.0) - ? 0.0 : (1.0 - distance_factor)); - if (distance_factor < loop_decay_rate) - loop_decay_rate = distance_factor; - } + // How fast is prediction quality decaying + loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); decay_accumulator = decay_accumulator * loop_decay_rate; decay_accumulator = decay_accumulator < 0.1 ? 
0.1 : decay_accumulator; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 5a4b3c185..0d353c31f 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -99,6 +99,7 @@ typedef struct double pcnt_inter; double pcnt_motion; double pcnt_second_ref; + double pcnt_neutral; double MVr; double mvr_abs; double MVc; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 3db05155c..867ff6a9c 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -2033,7 +2033,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int else cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; } - else if (vp8_ref_frame_order[mode_index] == SPLITMV) + else if (vp8_mode_order[mode_index] == SPLITMV) cpi->zbin_mode_boost = 0; else cpi->zbin_mode_boost = MV_ZBIN_BOOST; diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index 6cdc47bc9..5d1a17d44 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -627,6 +627,10 @@ filter_block2d_bil_var_sse2_loop: filter_block2d_bil_var_sse2_sp_only: movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 + je filter_block2d_bil_var_sse2_full_pixel + shl rdx, 5 lea rdx, [rdx + rcx] ; VFilter @@ -671,6 +675,35 @@ filter_block2d_bil_sp_only_loop: jmp filter_block2d_bil_variance +filter_block2d_bil_var_sse2_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 ; + +filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movq xmm2, QWORD PTR [rdi] ; + punpcklbw xmm2, xmm0 ; + + psubw xmm1, xmm2 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz 
filter_block2d_bil_full_pixel_loop ; + + jmp filter_block2d_bil_variance + filter_block2d_bil_var_sse2_fp_only: mov rsi, arg(0) ;ref_ptr mov rdi, arg(2) ;src_ptr diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm new file mode 100644 index 000000000..b1976328d --- /dev/null +++ b/vp8/encoder/x86/variance_impl_ssse3.asm @@ -0,0 +1,348 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + + +;void vp8_filter_block2d_bil_var_ssse3 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int xoffset, +; int yoffset, +; int *sum, +; unsigned int *sumsquared;; +; +;) +;Note: The filter coefficient at offset=0 is 128. Since the second register +;for Pmaddubsw is signed bytes, we must calculate zero offset seperately. 
+global sym(vp8_filter_block2d_bil_var_ssse3) +sym(vp8_filter_block2d_bil_var_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + push rbx + ; end prolog + + pxor xmm6, xmm6 + pxor xmm7, xmm7 + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je filter_block2d_bil_var_ssse3_sp_only + + shl rax, 4 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je filter_block2d_bil_var_ssse3_fp_only + + shl rdx, 4 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi+1] + movdqa xmm2, xmm0 + + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, [rax] + pmaddubsw xmm2, [rax] + + paddw xmm0, [GLOBAL(xmm_bi_rd)] + paddw xmm2, [GLOBAL(xmm_bi_rd)] + psraw xmm0, xmm_filter_shift + psraw xmm2, xmm_filter_shift + + packuswb xmm0, xmm2 + + movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line + lea rsi, [rsi + rbx] +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + +filter_block2d_bil_var_ssse3_loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rsi+1] + movdqa xmm3, xmm1 + + punpcklbw xmm1, xmm2 + punpckhbw xmm3, xmm2 + pmaddubsw xmm1, [rax] + pmaddubsw xmm3, [rax] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm3, xmm_filter_shift + packuswb xmm1, xmm3 + + movdqa xmm2, xmm0 + movdqa xmm0, xmm1 + movdqa xmm3, xmm2 + + punpcklbw xmm2, xmm1 + punpckhbw xmm3, xmm1 + pmaddubsw xmm2, [rdx] + pmaddubsw xmm3, [rdx] + + paddw xmm2, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm2, xmm_filter_shift + psraw xmm3, xmm_filter_shift + + movq xmm1, QWORD PTR [rdi] + 
pxor xmm4, xmm4 + punpcklbw xmm1, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm2, xmm1 + psubw xmm3, xmm5 + paddw xmm6, xmm2 + paddw xmm6, xmm3 + pmaddwd xmm2, xmm2 + pmaddwd xmm3, xmm3 + paddd xmm7, xmm2 + paddd xmm7, xmm3 + + lea rsi, [rsi + rbx] ;ref_pixels_per_line +%if ABI_IS_32BIT + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 + jnz filter_block2d_bil_var_ssse3_loop + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_ssse3_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; Both xoffset =0 and yoffset=0 + je filter_block2d_bil_var_ssse3_full_pixel + + shl rdx, 4 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + movdqu xmm1, XMMWORD PTR [rsi] + movdqa xmm0, xmm1 + + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + lea rsi, [rsi + rax] + +filter_block2d_bil_sp_only_loop: + movdqu xmm3, XMMWORD PTR [rsi] + movdqa xmm2, xmm1 + movdqa xmm0, xmm3 + + punpcklbw xmm1, xmm3 + punpckhbw xmm2, xmm3 + pmaddubsw xmm1, [rdx] + pmaddubsw xmm2, [rdx] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm2, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm2, xmm_filter_shift + + movq xmm3, QWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm3, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm1, xmm3 + psubw xmm2, xmm5 + paddw xmm6, xmm1 + paddw xmm6, xmm2 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm7, xmm1 + paddd xmm7, xmm2 + + movdqa xmm1, xmm0 + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 + jnz filter_block2d_bil_sp_only_loop + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_ssse3_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) 
;ref_pixels_per_line + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 + +filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD PTR [rsi] + punpcklbw xmm1, xmm0 + movq xmm2, QWORD PTR [rsi+8] + punpcklbw xmm2, xmm0 + + movq xmm3, QWORD PTR [rdi] + punpcklbw xmm3, xmm0 + movq xmm4, QWORD PTR [rdi+8] + punpcklbw xmm4, xmm0 + + psubw xmm1, xmm3 + psubw xmm2, xmm4 + paddw xmm6, xmm1 + paddw xmm6, xmm2 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm7, xmm1 + paddd xmm7, xmm2 + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + sub rcx, 1 + jnz filter_block2d_bil_full_pixel_loop + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_ssse3_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + +filter_block2d_bil_fp_only_loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rsi+1] + movdqa xmm3, xmm1 + + punpcklbw xmm1, xmm2 + punpckhbw xmm3, xmm2 + pmaddubsw xmm1, [rax] + pmaddubsw xmm3, [rax] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm3, xmm_filter_shift + + movq xmm2, XMMWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm2, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm1, xmm2 + psubw xmm3, xmm5 + paddw xmm6, xmm1 + paddw xmm6, xmm3 + pmaddwd xmm1, xmm1 + pmaddwd xmm3, xmm3 + paddd xmm7, xmm1 + paddd xmm7, xmm3 + + lea rsi, [rsi + rdx] + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 + jnz filter_block2d_bil_fp_only_loop + + jmp filter_block2d_bil_variance + +filter_block2d_bil_variance: + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq 
xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(7) ;[Sum] + mov rdi, arg(8) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rbx + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +xmm_bi_rd: + times 8 dw 64 +align 16 +vp8_bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 112, 16 + times 8 db 96, 32 + times 8 db 80, 48 + times 8 db 64, 64 + times 8 db 48, 80 + times 8 db 32, 96 + times 8 db 16, 112 diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c new file mode 100644 index 000000000..750ae8b86 --- /dev/null +++ b/vp8/encoder/x86/variance_ssse3.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vp8/encoder/variance.h" +#include "vp8/common/pragmas.h" +#include "vpx_ports/mem.h" + +extern unsigned int vp8_get16x16var_sse2 +( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +extern void vp8_half_horiz_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_half_horiz_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_half_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_filter_block2d_bil_var_ssse3 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int xoffset, + int yoffset, + int *sum, + unsigned int *sumsquared +); + +unsigned int vp8_sub_pixel_variance16x16_ssse3 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + // note we could avoid these if statements if the calling function + // just called the appropriate functions inside. 
+ if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_horiz_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + + xsum0 += xsum1; + xxsum0 += xxsum1; + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + + xsum0 += xsum1; + xxsum0 += xxsum1; + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + + xsum0 += xsum1; + xxsum0 += xxsum1; + } + else + { + vp8_filter_block2d_bil_var_ssse3( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum0, &xxsum0); + } + + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 8)); +} diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h index 6bea15ebc..1e2fb3490 100644 --- a/vp8/encoder/x86/variance_x86.h +++ b/vp8/encoder/x86/variance_x86.h @@ -286,6 +286,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); #if HAVE_SSSE3 extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3); extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_variance_sad16x16x3 @@ -294,6 +295,9 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); #undef vp8_variance_sad16x8x3 #define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3 +#undef 
vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3 + #endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 61c603229..c7639a7e4 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -334,6 +334,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3; + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; } diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index ed2feddae..c0ae250f5 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -110,6 +110,8 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm |