diff options
31 files changed, 2621 insertions, 1572 deletions
@@ -1,3 +1,80 @@ +2011-03-07 v0.9.6 "Bali" + Our second named release, focused on a faster, higher quality, encoder. + + - Upgrading: + This release is backwards compatible with Aylesbury (v0.9.5). Users + of older releases should refer to the Upgrading notes in this + document for that release. + + - Enhancements: + vpxenc --psnr shows a summary when encode completes + --tune=ssim option to enable activity masking + improved postproc visualizations for development + updated support for Apple iOS to SDK 4.2 + query decoder to determine which reference frames were updated + implemented error tracking in the decoder + fix pipe support on windows + + - Speed: + Primary focus was on good quality mode, speed 0. Average improvement + on x86 about 40%, up to 100% on user-generated content at that speed. + Best quality mode speed improved 35%, and realtime speed 10-20%. This + release also saw significant improvement in realtime encoding speed + on ARM platforms. + + Improved encoder threading + Don't pick encoder filter level when loopfilter is disabled. + Avoid double copying of key frames into alt and golden buffer + FDCT optimizations. + x86 sse2 temporal filter + SSSE3 version of fast quantizer + vp8_rd_pick_best_mbsegmentation code restructure + Adjusted breakout RD for SPLITMV + Changed segmentation check order + Improved rd_pick_intra4x4block + Adds armv6 optimized variance calculation + ARMv6 optimized sad16x16 + ARMv6 optimized half pixel variance calculations + Full search SAD function optimization in SSE4.1 + Improve MV prediction accuracy to achieve performance gain + Improve MV prediction in vp8_pick_inter_mode() for speed>3 + + - Quality: + Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release + also includes support for "activity masking," which greatly improves + SSIM at the expense of PSNR. For now, this feature is available with + the --tune=ssim option. Further experimentation in this area + is ongoing. 
This release also introduces a new rate control mode + called "CQ," which changes the allocation of bits within a clip to + the sections where they will have the most visual impact. + + Tuning for the more exact quantizer. + Relax rate control for last few frames + CQ Mode + Limit key frame quantizer for forced key frames. + KF/GF Pulsing + Add simple version of activity masking. + make rdmult adaptive for intra in quantizer RDO + cap the best quantizer for 2nd order DC + change the threshold of DC check for encode breakout + + - Bug Fixes: + Fix crash on Sparc Solaris. + Fix counter of fixed keyframe distance + ARNR filter pointer update bug fix + Fixed use of motion percentage in KF/GF group calc + Changed condition for using RD in Intra Mode + Fix encoder real-time only configuration. + Fix ARM encoder crash with multiple token partitions + Fixed bug first cluster timecode of webm file is wrong. + Fixed various encoder bugs with odd-sized images + vp8e_get_preview fixed when spatial resampling enabled + quantizer: fix assertion in fast quantizer path + Allocate source buffers to be multiples of 16 + Fix for manual Golden frame frequency + Fix drastic undershoot in long form content + + 2010-10-28 v0.9.5 "Aylesbury" Our first named release, focused on a faster decoder, and a better encoder. diff --git a/build/make/Makefile b/build/make/Makefile index 40fa6d50c..62d139ea4 100755 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -153,7 +153,7 @@ endif # obj_int_extract: build/make/obj_int_extract.c $(if $(quiet),echo " [HOSTCC] $@") - $(qexec)$(HOSTCC) -I. -o $@ $< + $(qexec)$(HOSTCC) -I. 
-I$(SRC_PATH_BARE) -o $@ $< CLEAN-OBJS += obj_int_extract # diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh index becd95827..c2ef44a9b 100755 --- a/build/make/gen_msvs_proj.sh +++ b/build/make/gen_msvs_proj.sh @@ -33,6 +33,7 @@ Options: --proj-guid=GUID GUID to use for the project --module-def=filename File containing export definitions (for DLLs) --ver=version Version (7,8,9) of visual studio to generate for + --src-path-bare=dir Path to root of source tree -Ipath/to/include Additional include directories -DFLAG[=value] Preprocessor macros to define -Lpath/to/lib Additional library search paths @@ -191,6 +192,8 @@ for opt in "$@"; do ;; --lib) proj_kind="lib" ;; + --src-path-bare=*) src_path_bare="$optval" + ;; --static-crt) use_static_runtime=true ;; --ver=*) @@ -335,6 +338,35 @@ generate_vcproj() { case "$target" in x86*) case "$name" in + obj_int_extract) + tag Tool \ + Name="VCCLCompilerTool" \ + Optimization="0" \ + AdditionalIncludeDirectories="$incs" \ + PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \ + RuntimeLibrary="$debug_runtime" \ + WarningLevel="3" \ + Detect64BitPortabilityProblems="true" \ + DebugInformationFormat="1" \ + ;; + vpx) + tag Tool \ + Name="VCPreBuildEventTool" \ + CommandLine="call obj_int_extract.bat $src_path_bare" \ + + tag Tool \ + Name="VCCLCompilerTool" \ + Optimization="0" \ + AdditionalIncludeDirectories="$incs" \ + PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \ + RuntimeLibrary="$debug_runtime" \ + UsePrecompiledHeader="0" \ + WarningLevel="3" \ + DebugInformationFormat="1" \ + Detect64BitPortabilityProblems="true" \ + + $uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="1" + ;; *) tag Tool \ Name="VCCLCompilerTool" \ @@ -358,6 +390,12 @@ generate_vcproj() { case "$target" in x86*) case "$name" in + obj_int_extract) + tag Tool \ + Name="VCLinkerTool" \ + OutputFile="${name}.exe" \ + 
GenerateDebugInformation="true" \ + ;; *) tag Tool \ Name="VCLinkerTool" \ @@ -406,6 +444,34 @@ generate_vcproj() { case "$target" in x86*) case "$name" in + obj_int_extract) + tag Tool \ + Name="VCCLCompilerTool" \ + AdditionalIncludeDirectories="$incs" \ + PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \ + RuntimeLibrary="$release_runtime" \ + UsePrecompiledHeader="0" \ + WarningLevel="3" \ + Detect64BitPortabilityProblems="true" \ + DebugInformationFormat="0" \ + ;; + vpx) + tag Tool \ + Name="VCPreBuildEventTool" \ + CommandLine="call obj_int_extract.bat $src_path_bare" \ + + tag Tool \ + Name="VCCLCompilerTool" \ + AdditionalIncludeDirectories="$incs" \ + PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \ + RuntimeLibrary="$release_runtime" \ + UsePrecompiledHeader="0" \ + WarningLevel="3" \ + DebugInformationFormat="0" \ + Detect64BitPortabilityProblems="true" \ + + $uses_asm && tag Tool Name="YASM" IncludePaths="$incs" + ;; *) tag Tool \ Name="VCCLCompilerTool" \ @@ -428,6 +494,12 @@ generate_vcproj() { case "$target" in x86*) case "$name" in + obj_int_extract) + tag Tool \ + Name="VCLinkerTool" \ + OutputFile="${name}.exe" \ + GenerateDebugInformation="true" \ + ;; *) tag Tool \ Name="VCLinkerTool" \ diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c index 3c54b248f..01b3129d7 100644 --- a/build/make/obj_int_extract.c +++ b/build/make/obj_int_extract.c @@ -14,7 +14,7 @@ #include "vpx_config.h" -#if defined(_MSC_VER) +#if defined(_MSC_VER) || defined(__MINGW32__) #include <io.h> #include <share.h> #include "vpx/vpx_integer.h" @@ -59,20 +59,47 @@ int parse_macho(uint8_t *base_buf, size_t sz) struct mach_header header; uint8_t *buf = base_buf; int base_data_section = 0; - + int bits = 0; + + /* We can read in mach_header for 32 and 64 bit architectures + * because it's identical to mach_header_64 except for the last + * element (uint32_t 
reserved), which we don't use. Then, when + * we know which architecture we're looking at, increment buf + * appropriately. + */ memcpy(&header, buf, sizeof(struct mach_header)); - buf += sizeof(struct mach_header); - if (header.magic != MH_MAGIC) + if (header.magic == MH_MAGIC) { - log_msg("Bad magic number for object file. 0x%x expected, 0x%x found.\n", - header.magic, MH_MAGIC); - goto bail; + if (header.cputype == CPU_TYPE_ARM + || header.cputype == CPU_TYPE_X86) + { + bits = 32; + buf += sizeof(struct mach_header); + } + else + { + log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n"); + goto bail; + } } - - if (header.cputype != CPU_TYPE_ARM) + else if (header.magic == MH_MAGIC_64) + { + if (header.cputype == CPU_TYPE_X86_64) + { + bits = 64; + buf += sizeof(struct mach_header_64); + } + else + { + log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n"); + goto bail; + } + } + else { - log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_ARM.\n"); + log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n", + MH_MAGIC, MH_MAGIC_64, header.magic); goto bail; } @@ -85,8 +112,6 @@ int parse_macho(uint8_t *base_buf, size_t sz) for (i = 0; i < header.ncmds; i++) { struct load_command lc; - struct symtab_command sc; - struct segment_command seg_c; memcpy(&lc, buf, sizeof(struct load_command)); @@ -94,50 +119,99 @@ int parse_macho(uint8_t *base_buf, size_t sz) { uint8_t *seg_buf = buf; struct section s; + struct segment_command seg_c; - memcpy(&seg_c, buf, sizeof(struct segment_command)); - + memcpy(&seg_c, seg_buf, sizeof(struct segment_command)); seg_buf += sizeof(struct segment_command); - for (j = 0; j < seg_c.nsects; j++) + /* Although each section is given it's own offset, nlist.n_value + * references the offset of the first section. 
This isn't + * apparent without debug information because the offset of the + * data section is the same as the first section. However, with + * debug sections mixed in, the offset of the debug section + * increases but n_value still references the first section. + */ + if (seg_c.nsects < 1) { - memcpy(&s, seg_buf + (j * sizeof(struct section)), sizeof(struct section)); + log_msg("Not enough sections\n"); + goto bail; + } - // Need to get this offset which is the start of the symbol table - // before matching the strings up with symbols. - base_data_section = s.offset; + memcpy(&s, seg_buf, sizeof(struct section)); + base_data_section = s.offset; + } + else if (lc.cmd == LC_SEGMENT_64) + { + uint8_t *seg_buf = buf; + struct section_64 s; + struct segment_command_64 seg_c; + + memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64)); + seg_buf += sizeof(struct segment_command_64); + + /* Explanation in LG_SEGMENT */ + if (seg_c.nsects < 1) + { + log_msg("Not enough sections\n"); + goto bail; } + + memcpy(&s, seg_buf, sizeof(struct section_64)); + base_data_section = s.offset; } else if (lc.cmd == LC_SYMTAB) { - uint8_t *sym_buf = base_buf; - uint8_t *str_buf = base_buf; - if (base_data_section != 0) { + struct symtab_command sc; + uint8_t *sym_buf = base_buf; + uint8_t *str_buf = base_buf; + memcpy(&sc, buf, sizeof(struct symtab_command)); if (sc.cmdsize != sizeof(struct symtab_command)) + { log_msg("Can't find symbol table!\n"); + goto bail; + } sym_buf += sc.symoff; str_buf += sc.stroff; for (j = 0; j < sc.nsyms; j++) { - struct nlist nl; - int val; + /* Location of string is cacluated each time from the + * start of the string buffer. On darwin the symbols + * are prefixed by "_", so we bump the pointer by 1. + * The target value is defined as an int in asm_*_offsets.c, + * which is 4 bytes on all targets we currently use. 
+ */ + if (bits == 32) + { + struct nlist nl; + int val; - memcpy(&nl, sym_buf + (j * sizeof(struct nlist)), sizeof(struct nlist)); + memcpy(&nl, sym_buf, sizeof(struct nlist)); + sym_buf += sizeof(struct nlist); - val = *((int *)(base_buf + base_data_section + nl.n_value)); + memcpy(&val, base_buf + base_data_section + nl.n_value, + sizeof(val)); + printf("%-40s EQU %5d\n", + str_buf + nl.n_un.n_strx + 1, val); + } + else /* if (bits == 64) */ + { + struct nlist_64 nl; + int val; + + memcpy(&nl, sym_buf, sizeof(struct nlist_64)); + sym_buf += sizeof(struct nlist_64); - // Location of string is cacluated each time from the - // start of the string buffer. On darwin the symbols - // are prefixed by "_". On other platforms it is not - // so it needs to be removed. That is the reason for - // the +1. - printf("%-40s EQU %5d\n", str_buf + nl.n_un.n_strx + 1, val); + memcpy(&val, base_buf + base_data_section + nl.n_value, + sizeof(val)); + printf("%-40s EQU %5d\n", + str_buf + nl.n_un.n_strx + 1, val); + } } } } @@ -218,7 +292,7 @@ bail: return EXIT_FAILURE; } -#else +#elif defined(__ELF__) #include "elf.h" #define COPY_STRUCT(dst, buf, ofst, sz) do {\ @@ -237,212 +311,420 @@ bail: typedef struct { - uint8_t *buf; /* Buffer containing ELF data */ - size_t sz; /* Buffer size */ - int le_data; /* Data is little-endian */ - Elf32_Ehdr hdr; + uint8_t *buf; /* Buffer containing ELF data */ + size_t sz; /* Buffer size */ + int le_data; /* Data is little-endian */ + unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */ + int bits; /* 32 or 64 */ + Elf32_Ehdr hdr32; + Elf64_Ehdr hdr64; } elf_obj_t; -int parse_elf32_header(elf_obj_t *elf) +int parse_elf_header(elf_obj_t *elf) { int res; - /* Verify ELF32 header */ - COPY_STRUCT(&elf->hdr, elf->buf, 0, elf->sz); - res = elf->hdr.e_ident[EI_MAG0] == ELFMAG0; - res &= elf->hdr.e_ident[EI_MAG1] == ELFMAG1; - res &= elf->hdr.e_ident[EI_MAG2] == ELFMAG2; - res &= elf->hdr.e_ident[EI_MAG3] == ELFMAG3; - res &= 
elf->hdr.e_ident[EI_CLASS] == ELFCLASS32; - res &= elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB - || elf->hdr.e_ident[EI_DATA] == ELFDATA2MSB; + /* Verify ELF Magic numbers */ + COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz); + res = elf->e_ident[EI_MAG0] == ELFMAG0; + res &= elf->e_ident[EI_MAG1] == ELFMAG1; + res &= elf->e_ident[EI_MAG2] == ELFMAG2; + res &= elf->e_ident[EI_MAG3] == ELFMAG3; + res &= elf->e_ident[EI_CLASS] == ELFCLASS32 + || elf->e_ident[EI_CLASS] == ELFCLASS64; + res &= elf->e_ident[EI_DATA] == ELFDATA2LSB; if (!res) goto bail; - elf->le_data = elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB; - - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_type); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_machine); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_version); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_entry); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phoff); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shoff); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_flags); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_ehsize); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phentsize); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phnum); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shentsize); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shnum); - ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shstrndx); + elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB; + + /* Read in relevant values */ + if (elf->e_ident[EI_CLASS] == ELFCLASS32) + { + elf->bits = 32; + COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz); + + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum); + 
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx); + } + else /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */ + { + elf->bits = 64; + COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz); + + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum); + ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx); + } + return 0; bail: + log_msg("Failed to parse ELF file header"); return 1; } -int parse_elf32_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr) +int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64) { - if (idx >= elf->hdr.e_shnum) - goto bail; + if (hdr32) + { + if (idx >= elf->hdr32.e_shnum) + goto bail; + + COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize, + elf->sz); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign); + ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize); + } + else /* if (hdr64) */ + { + if (idx >= elf->hdr64.e_shnum) + goto bail; + + COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize, + elf->sz); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags); + 
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign); + ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize); + } - COPY_STRUCT(hdr, elf->buf, elf->hdr.e_shoff + idx * elf->hdr.e_shentsize, - elf->sz); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_name); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_type); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_flags); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addr); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_offset); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_size); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_link); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_info); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addralign); - ENDIAN_ASSIGN_IN_PLACE(hdr->sh_entsize); return 0; bail: return 1; } -char *parse_elf32_string_table(elf_obj_t *elf, int s_idx, int idx) +char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) { - Elf32_Shdr shdr; - - if (parse_elf32_section(elf, s_idx, &shdr)) + if (elf->bits == 32) { - log_msg("Failed to parse ELF string table: section %d, index %d\n", - s_idx, idx); - return ""; + Elf32_Shdr shdr; + + if (parse_elf_section(elf, s_idx, &shdr, NULL)) + { + log_msg("Failed to parse ELF string table: section %d, index %d\n", + s_idx, idx); + return ""; + } + + return (char *)(elf->buf + shdr.sh_offset + idx); } + else /* if (elf->bits == 64) */ + { + Elf64_Shdr shdr; - return (char *)(elf->buf + shdr.sh_offset + idx); + if (parse_elf_section(elf, s_idx, NULL, &shdr)) + { + log_msg("Failed to parse ELF string table: section %d, index %d\n", + s_idx, idx); + return ""; + } + + return (char *)(elf->buf + shdr.sh_offset + idx); + } } -int parse_elf32_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym) +int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64) { - COPY_STRUCT(sym, elf->buf, ofst, elf->sz); - ENDIAN_ASSIGN_IN_PLACE(sym->st_name); - 
ENDIAN_ASSIGN_IN_PLACE(sym->st_value); - ENDIAN_ASSIGN_IN_PLACE(sym->st_size); - ENDIAN_ASSIGN_IN_PLACE(sym->st_info); - ENDIAN_ASSIGN_IN_PLACE(sym->st_other); - ENDIAN_ASSIGN_IN_PLACE(sym->st_shndx); + if (sym32) + { + COPY_STRUCT(sym32, elf->buf, ofst, elf->sz); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_name); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_value); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_size); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_info); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_other); + ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx); + } + else /* if (sym64) */ + { + COPY_STRUCT(sym64, elf->buf, ofst, elf->sz); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_name); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_value); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_size); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_info); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_other); + ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx); + } return 0; bail: return 1; } -int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode) +int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) { - elf_obj_t elf; - Elf32_Shdr shdr; + elf_obj_t elf; unsigned int ofst; - int i; - Elf32_Off strtab_off; /* save String Table offset for later use */ + int i; + Elf32_Off strtab_off32; + Elf64_Off strtab_off64; /* save String Table offset for later use */ memset(&elf, 0, sizeof(elf)); elf.buf = buf; elf.sz = sz; /* Parse Header */ - if (parse_elf32_header(&elf)) - { - log_msg("Parse error: File does not appear to be valid ELF32\n"); - return 1; - } + if (parse_elf_header(&elf)) + goto bail; - for (i = 0; i < elf.hdr.e_shnum; i++) + if (elf.bits == 32) { - parse_elf32_section(&elf, i, &shdr); - - if (shdr.sh_type == SHT_STRTAB) + Elf32_Shdr shdr; + for (i = 0; i < elf.hdr32.e_shnum; i++) { - char strtsb_name[128]; - - strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name)); + parse_elf_section(&elf, i, &shdr, NULL); - if (!(strcmp(strtsb_name, ".shstrtab"))) + if (shdr.sh_type == SHT_STRTAB) { - log_msg("found section: %s\n", strtsb_name); - strtab_off = 
shdr.sh_offset; - break; + char strtsb_name[128]; + + strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name)); + + if (!(strcmp(strtsb_name, ".shstrtab"))) + { + /* log_msg("found section: %s\n", strtsb_name); */ + strtab_off32 = shdr.sh_offset; + break; + } } } } - - /* Parse all Symbol Tables */ - for (i = 0; i < elf.hdr.e_shnum; i++) + else /* if (elf.bits == 64) */ { - - parse_elf32_section(&elf, i, &shdr); - - if (shdr.sh_type == SHT_SYMTAB) + Elf64_Shdr shdr; + for (i = 0; i < elf.hdr64.e_shnum; i++) { - for (ofst = shdr.sh_offset; - ofst < shdr.sh_offset + shdr.sh_size; - ofst += shdr.sh_entsize) - { - Elf32_Sym sym; + parse_elf_section(&elf, i, NULL, &shdr); - parse_elf32_symbol(&elf, ofst, &sym); + if (shdr.sh_type == SHT_STRTAB) + { + char strtsb_name[128]; - /* For all OBJECTS (data objects), extract the value from the - * proper data segment. - */ - if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name) - log_msg("found data object %s\n", - parse_elf32_string_table(&elf, - shdr.sh_link, - sym.st_name)); + strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name)); - if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT - && sym.st_size == 4) + if (!(strcmp(strtsb_name, ".shstrtab"))) { - Elf32_Shdr dhdr; - int32_t val; - char section_name[128]; - - parse_elf32_section(&elf, sym.st_shndx, &dhdr); + /* log_msg("found section: %s\n", strtsb_name); */ + strtab_off64 = shdr.sh_offset; + break; + } + } + } + } - /* For explanition - refer to _MSC_VER version of code */ - strcpy(section_name, (char *)(elf.buf + strtab_off + dhdr.sh_name)); - log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); + /* Parse all Symbol Tables */ + if (elf.bits == 32) + { + Elf32_Shdr shdr; + for (i = 0; i < elf.hdr32.e_shnum; i++) + { + parse_elf_section(&elf, i, &shdr, NULL); - if (!(strcmp(section_name, ".bss"))) - { - val = 0; - } - else + if (shdr.sh_type == SHT_SYMTAB) + { + for (ofst = shdr.sh_offset; + ofst < shdr.sh_offset + 
shdr.sh_size; + ofst += shdr.sh_entsize) + { + Elf32_Sym sym; + + parse_elf_symbol(&elf, ofst, &sym, NULL); + + /* For all OBJECTS (data objects), extract the value from the + * proper data segment. + */ + /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name) + log_msg("found data object %s\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name)); + */ + + if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT + && sym.st_size == 4) { - memcpy(&val, - elf.buf + dhdr.sh_offset + sym.st_value, - sizeof(val)); + Elf32_Shdr dhdr; + int val = 0; + char section_name[128]; + + parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL); + + /* For explanition - refer to _MSC_VER version of code */ + strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name)); + /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */ + + if (strcmp(section_name, ".bss")) + { + if (sizeof(val) != sym.st_size) + { + /* The target value is declared as an int in + * asm_*_offsets.c, which is 4 bytes on all + * targets we currently use. Complain loudly if + * this is not true. 
+ */ + log_msg("Symbol size is wrong\n"); + goto bail; + } + + memcpy(&val, + elf.buf + dhdr.sh_offset + sym.st_value, + sym.st_size); + } + + if (!elf.le_data) + { + log_msg("Big Endian data not supported yet!\n"); + goto bail; + } + + switch (mode) + { + case OUTPUT_FMT_RVDS: + printf("%-40s EQU %5d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + break; + case OUTPUT_FMT_GAS: + printf(".equ %-40s, %5d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + break; + default: + printf("%s = %d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + } } + } + } + } + } + else /* if (elf.bits == 64) */ + { + Elf64_Shdr shdr; + for (i = 0; i < elf.hdr64.e_shnum; i++) + { + parse_elf_section(&elf, i, NULL, &shdr); - if (!elf.le_data) - { - log_msg("Big Endian data not supported yet!\n"); - goto bail; - }\ - - switch (mode) + if (shdr.sh_type == SHT_SYMTAB) + { + for (ofst = shdr.sh_offset; + ofst < shdr.sh_offset + shdr.sh_size; + ofst += shdr.sh_entsize) + { + Elf64_Sym sym; + + parse_elf_symbol(&elf, ofst, NULL, &sym); + + /* For all OBJECTS (data objects), extract the value from the + * proper data segment. 
+ */ + /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name) + log_msg("found data object %s\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name)); + */ + + if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT + && sym.st_size == 4) { - case OUTPUT_FMT_RVDS: - printf("%-40s EQU %5d\n", - parse_elf32_string_table(&elf, - shdr.sh_link, - sym.st_name), - val); - break; - case OUTPUT_FMT_GAS: - printf(".equ %-40s, %5d\n", - parse_elf32_string_table(&elf, - shdr.sh_link, - sym.st_name), - val); - break; - default: - printf("%s = %d\n", - parse_elf32_string_table(&elf, - shdr.sh_link, - sym.st_name), - val); + Elf64_Shdr dhdr; + int val = 0; + char section_name[128]; + + parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr); + + /* For explanition - refer to _MSC_VER version of code */ + strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name)); + /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */ + + if ((strcmp(section_name, ".bss"))) + { + if (sizeof(val) != sym.st_size) + { + /* The target value is declared as an int in + * asm_*_offsets.c, which is 4 bytes on all + * targets we currently use. Complain loudly if + * this is not true. 
+ */ + log_msg("Symbol size is wrong\n"); + goto bail; + } + + memcpy(&val, + elf.buf + dhdr.sh_offset + sym.st_value, + sym.st_size); + } + + if (!elf.le_data) + { + log_msg("Big Endian data not supported yet!\n"); + goto bail; + } + + switch (mode) + { + case OUTPUT_FMT_RVDS: + printf("%-40s EQU %5d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + break; + case OUTPUT_FMT_GAS: + printf(".equ %-40s, %5d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + break; + default: + printf("%s = %d\n", + parse_elf_string_table(&elf, + shdr.sh_link, + sym.st_name), + val); + } } } } @@ -454,7 +736,7 @@ int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode) return 0; bail: - log_msg("Parse error: File does not appear to be valid ELF32\n"); + log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n"); return 1; } @@ -521,8 +803,7 @@ int main(int argc, char **argv) goto bail; } - res = parse_elf32(file_buf, stat_buf.st_size, mode); - //res = parse_coff(file_buf, stat_buf.st_size); + res = parse_elf(file_buf, stat_buf.st_size, mode); free(file_buf); if (!res) @@ -535,7 +816,7 @@ bail: #endif -#if defined(_MSC_VER) +#if defined(_MSC_VER) || defined(__MINGW32__) /* See "Microsoft Portable Executable and Common Object File Format Specification" for reference. */ @@ -549,7 +830,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz) unsigned int i; unsigned __int8 *ptr; unsigned __int32 symoffset; - FILE *fp; char **sectionlist; //this array holds all section names in their correct order. //it is used to check if the symbol is in .bss or .data section. 
@@ -560,9 +840,18 @@ int parse_coff(unsigned __int8 *buf, size_t sz) strtab_ptr = symtab_ptr + symtab_sz * 18; if (nsections > 96) - goto bail; + { + log_msg("Too many sections\n"); + return 1; + } + + sectionlist = malloc(nsections * sizeof(sectionlist)); - sectionlist = malloc(nsections * sizeof * sectionlist); + if (sectionlist == NULL) + { + log_msg("Allocating first level of section list failed\n"); + return 1; + } //log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections); @@ -580,6 +869,12 @@ int parse_coff(unsigned __int8 *buf, size_t sz) //log_msg("COFF: Parsing section %s\n",sectionname); sectionlist[i] = malloc(strlen(sectionname) + 1); + + if (sectionlist[i] == NULL) + { + log_msg("Allocating storage for %s failed\n", sectionname); + goto bail; + } strcpy(sectionlist[i], sectionname); if (!strcmp(sectionname, ".data")) sectionrawdata_ptr = get_le32(ptr + 20); @@ -590,14 +885,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz) //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr); //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr); - fp = fopen("assembly_offsets.asm", "w"); - - if (fp == NULL) - { - perror("open file"); - goto bail; - } - /* The compiler puts the data with non-zero offset in .data section, but puts the data with zero offset in .bss section. So, if the data in in .bss section, set offset=0. 
Note from Wiki: In an object module compiled from C, the bss section contains @@ -631,13 +918,15 @@ int parse_coff(unsigned __int8 *buf, size_t sz) char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; strncpy(name, ptr, 8); //log_msg("COFF: Parsing symbol %s\n",name); - fprintf(fp, "%-40s EQU ", name); + /* +1 to avoid printing leading underscore */ + printf("%-40s EQU ", name + 1); } else { //log_msg("COFF: Parsing symbol %s\n", // buf + strtab_ptr + get_le32(ptr+4)); - fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4)); + /* +1 to avoid printing leading underscore */ + printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4) + 1); } if (!(strcmp(sectionlist[section-1], ".bss"))) @@ -654,14 +943,13 @@ int parse_coff(unsigned __int8 *buf, size_t sz) //log_msg(" Address: %u\n",get_le32(ptr+8)); //log_msg(" Offset: %u\n", symoffset); - fprintf(fp, "%5d\n", symoffset); + printf("%5d\n", symoffset); } ptr += 18; } - fprintf(fp, " END\n"); - fclose(fp); + printf(" END\n"); for (i = 0; i < nsections; i++) { @@ -711,11 +999,7 @@ int main(int argc, char **argv) else f = argv[1]; - if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE)) - { - perror("Unable to open file"); - goto bail; - } + fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE); if (_fstat(fd, &stat_buf)) { diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat new file mode 100644 index 000000000..1bb865331 --- /dev/null +++ b/build/x86-msvs/obj_int_extract.bat @@ -0,0 +1,15 @@ +REM Copyright (c) 2011 The WebM project authors. All Rights Reserved. +REM +REM Use of this source code is governed by a BSD-style license +REM that can be found in the LICENSE file in the root of the source +REM tree. An additional intellectual property rights grant can be found +REM in the file PATENTS. All contributing project authors may +REM be found in the AUTHORS file in the root of the source tree. 
+echo on + +cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c" +cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c" +cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c" +obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm" +obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm" +obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm" @@ -9,7 +9,13 @@ ## -ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm) +# ARM assembly files are written in RVCT-style. We use some make magic to +# filter those files to allow GCC compilation +ifeq ($(ARCH_ARM),yes) + ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm) +else + ASM:=.asm +endif CODEC_SRCS-yes += libs.mk @@ -126,6 +132,23 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS) ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) +obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c + @cp $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat . + @echo " [CREATE] $@" + $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ + --exe \ + --target=$(TOOLCHAIN) \ + --name=obj_int_extract \ + --ver=$(CONFIG_VS_VERSION) \ + --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$@ $^ \ + -I. 
\ + -I"$(SRC_PATH_BARE)" \ + +PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.vcproj +PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat + vpx.def: $(call enabled,CODEC_EXPORTS) @echo " [CREATE] $@" $(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\ @@ -135,15 +158,16 @@ CLEAN-OBJS += vpx.def vpx.vcproj: $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" - $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\ - --lib\ - --target=$(TOOLCHAIN)\ + $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ + --lib \ + --target=$(TOOLCHAIN) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ - --name=vpx\ - --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74\ - --module-def=vpx.def\ - --ver=$(CONFIG_VS_VERSION)\ - --out=$@ $(CFLAGS) $^\ + --name=vpx \ + --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \ + --module-def=vpx.def \ + --ver=$(CONFIG_VS_VERSION) \ + --out=$@ $(CFLAGS) $^ \ + --src-path-bare="$(SRC_PATH_BARE)" \ PROJECTS-$(BUILD_LIBVPX) += vpx.vcproj @@ -207,36 +231,38 @@ endif # # Add assembler dependencies for configuration and offsets # -$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm -$(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm +$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm +$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm # # Calculate platform- and compiler-specific offsets for hand coded assembly # -ifeq ($(ARCH_ARM), yes) - asm_com_offsets.asm: obj_int_extract - asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o +ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat + ifeq ($(ARCH_ARM), yes) + asm_com_offsets.asm: obj_int_extract + asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o ./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o - CLEAN-OBJS += asm_com_offsets.asm - $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm + OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o + CLEAN-OBJS += asm_com_offsets.asm + $(filter %$(ASM).o,$(OBJS-yes)): 
$(BUILD_PFX)asm_com_offsets.asm - ifeq ($(CONFIG_VP8_ENCODER), yes) - asm_enc_offsets.asm: obj_int_extract - asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o + ifeq ($(CONFIG_VP8_ENCODER), yes) + asm_enc_offsets.asm: obj_int_extract + asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o ./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o - CLEAN-OBJS += asm_enc_offsets.asm - $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm - endif - - ifeq ($(CONFIG_VP8_DECODER), yes) - asm_dec_offsets.asm: obj_int_extract - asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o + OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o + CLEAN-OBJS += asm_enc_offsets.asm + $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm + endif + + ifeq ($(CONFIG_VP8_DECODER), yes) + asm_dec_offsets.asm: obj_int_extract + asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o ./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o - CLEAN-OBJS += asm_dec_offsets.asm - $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm + OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o + CLEAN-OBJS += asm_dec_offsets.asm + $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm + endif endif endif diff --git a/solution.mk b/solution.mk index bef00883f..782150fd9 100644 --- a/solution.mk +++ b/solution.mk @@ -13,8 +13,9 @@ vpx.sln: $(wildcard *.vcproj) @echo " [CREATE] $@" $(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \ $(if $(filter %vpx.vcproj,$^),\ - $(foreach vcp,$(filter-out %vpx.vcproj,$^),\ + $(foreach vcp,$(filter-out %vpx.vcproj %obj_int_extract.vcproj,$^),\ --dep=$(vcp:.vcproj=):vpx)) \ + --dep=vpx:obj_int_extract \ --ver=$(CONFIG_VS_VERSION)\ --out=$@ $^ vpx.sln.mk: vpx.sln diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 12d5f66d3..5c607a0cb 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c 
@@ -24,6 +24,35 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x); extern void vp8_build_block_offsets(MACROBLOCK *x); extern void vp8_setup_block_ptrs(MACROBLOCK *x); +#if CONFIG_MULTITHREAD + +extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm); + +static THREAD_FUNCTION loopfilter_thread(void *p_data) +{ + VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1); + VP8_COMMON *cm = &cpi->common; + + while (1) + { + if (cpi->b_multi_threaded == 0) + break; + + if (sem_wait(&cpi->h_event_start_lpf) == 0) + { + if (cpi->b_multi_threaded == FALSE) // we're shutting down + break; + + loopfilter_frame(cpi, cm); + + sem_post(&cpi->h_event_end_lpf); + } + } + + return 0; +} +#endif + static THREAD_FUNCTION thread_encoding_proc(void *p_data) { @@ -479,6 +508,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi) pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd); } + { + LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data; + + sem_init(&cpi->h_event_start_lpf, 0, 0); + sem_init(&cpi->h_event_end_lpf, 0, 0); + + lpfthd->ptr1 = (void *)cpi; + pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd); + } } } @@ -500,9 +538,14 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) sem_destroy(&cpi->h_event_start_encoding[i]); } + + sem_post(&cpi->h_event_start_lpf); + pthread_join(cpi->h_filter_thread, 0); } sem_destroy(&cpi->h_event_end_encoding); + sem_destroy(&cpi->h_event_end_lpf); + sem_destroy(&cpi->h_event_start_lpf); //free thread related resources vpx_free(cpi->h_event_start_encoding); diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index e0b2a8a45..774d9b6b5 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -281,21 +281,6 @@ int frame_max_bits(VP8_COMP *cpi) } -extern size_t vp8_firstpass_stats_sz(unsigned int mb_count) -{ - /* Calculate the size of a stats packet, which is dependent on the frame - * resolution. 
The FIRSTPASS_STATS struct has a single element array, - * motion_map, which is virtually expanded to have one element per - * macroblock. - */ - size_t stats_sz; - - stats_sz = sizeof(FIRSTPASS_STATS) + mb_count; - stats_sz = (stats_sz + 7) & ~7; - return stats_sz; -} - - void vp8_output_stats(const VP8_COMP *cpi, struct vpx_codec_pkt_list *pktlist, FIRSTPASS_STATS *stats) @@ -303,16 +288,19 @@ void vp8_output_stats(const VP8_COMP *cpi, struct vpx_codec_cx_pkt pkt; pkt.kind = VPX_CODEC_STATS_PKT; pkt.data.twopass_stats.buf = stats; - pkt.data.twopass_stats.sz = vp8_firstpass_stats_sz(cpi->common.MBs); + pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); vpx_codec_pkt_list_add(pktlist, &pkt); // TEMP debug code #if OUTPUT_FPF + { FILE *fpfile; fpfile = fopen("firstpass.stt", "a"); - fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n", + fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f" + " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f" + " %12.0f %12.4f\n", stats->frame, stats->intra_error, stats->coded_error, @@ -320,6 +308,7 @@ void vp8_output_stats(const VP8_COMP *cpi, stats->pcnt_inter, stats->pcnt_motion, stats->pcnt_second_ref, + stats->pcnt_neutral, stats->MVr, stats->mvr_abs, stats->MVc, @@ -327,12 +316,8 @@ void vp8_output_stats(const VP8_COMP *cpi, stats->MVrv, stats->MVcv, stats->mv_in_out_count, - stats->count); - fclose(fpfile); - - - fpfile = fopen("fpmotionmap.stt", "a"); - if(fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, fpfile)); + stats->count, + stats->duration); fclose(fpfile); } #endif @@ -340,13 +325,11 @@ void vp8_output_stats(const VP8_COMP *cpi, int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps) { - size_t stats_sz = vp8_firstpass_stats_sz(cpi->common.MBs); - if (cpi->stats_in >= cpi->stats_in_end) return EOF; *fps = *cpi->stats_in; - cpi->stats_in = (void*)((char *)cpi->stats_in + stats_sz); + cpi->stats_in = (void*)((char 
*)cpi->stats_in + sizeof(FIRSTPASS_STATS)); return 1; } @@ -359,6 +342,7 @@ void vp8_zero_stats(FIRSTPASS_STATS *section) section->pcnt_inter = 0.0; section->pcnt_motion = 0.0; section->pcnt_second_ref = 0.0; + section->pcnt_neutral = 0.0; section->MVr = 0.0; section->mvr_abs = 0.0; section->MVc = 0.0; @@ -378,6 +362,7 @@ void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) section->pcnt_inter += frame->pcnt_inter; section->pcnt_motion += frame->pcnt_motion; section->pcnt_second_ref += frame->pcnt_second_ref; + section->pcnt_neutral += frame->pcnt_neutral; section->MVr += frame->MVr; section->mvr_abs += frame->mvr_abs; section->MVc += frame->MVc; @@ -398,6 +383,7 @@ void vp8_avg_stats(FIRSTPASS_STATS *section) section->ssim_weighted_pred_err /= section->count; section->pcnt_inter /= section->count; section->pcnt_second_ref /= section->count; + section->pcnt_neutral /= section->count; section->pcnt_motion /= section->count; section->MVr /= section->count; section->mvr_abs /= section->count; @@ -409,57 +395,9 @@ void vp8_avg_stats(FIRSTPASS_STATS *section) section->duration /= section->count; } -unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi) -{ - return cpi->fp_motion_map_stats; -} -void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos) -{ - cpi->fp_motion_map_stats = target_pos; -} - -void vp8_advance_fpmm(VP8_COMP *cpi, int count) -{ - cpi->fp_motion_map_stats = (void*)((char*)cpi->fp_motion_map_stats + - count * vp8_firstpass_stats_sz(cpi->common.MBs)); -} - -void vp8_input_fpmm(VP8_COMP *cpi) -{ - unsigned char *fpmm = cpi->fp_motion_map; - int MBs = cpi->common.MBs; - int max_frames = cpi->active_arnr_frames; - int i; - - for (i=0; i<max_frames; i++) - { - char *motion_map = (char*)cpi->fp_motion_map_stats - + sizeof(FIRSTPASS_STATS); - - memcpy(fpmm, motion_map, MBs); - fpmm += MBs; - vp8_advance_fpmm(cpi, 1); - } - - // Flag the use of weights in the temporal filter - cpi->use_weighted_temporal_filter = 1; -} - void 
vp8_init_first_pass(VP8_COMP *cpi) { vp8_zero_stats(cpi->total_stats); - -// TEMP debug code -#ifdef OUTPUT_FPF - { - FILE *fpfile; - fpfile = fopen("firstpass.stt", "w"); - fclose(fpfile); - fpfile = fopen("fpmotionmap.stt", "wb"); - fclose(fpfile); - } -#endif - } void vp8_end_first_pass(VP8_COMP *cpi) @@ -570,13 +508,12 @@ void vp8_first_pass(VP8_COMP *cpi) int intercount = 0; int second_ref_count = 0; int intrapenalty = 256; + int neutral_count = 0; int sum_in_vectors = 0; MV zero_ref_mv = {0, 0}; - unsigned char *fp_motion_map_ptr = cpi->fp_motion_map; - vp8_clear_system_state(); //__asm emms; x->src = * cpi->Source; @@ -628,7 +565,6 @@ void vp8_first_pass(VP8_COMP *cpi) for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { int this_error; - int zero_error; int zz_to_best_ratio; int gf_motion_error = INT_MAX; int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); @@ -650,9 +586,6 @@ void vp8_first_pass(VP8_COMP *cpi) // Cumulative intra error total intra_error += (long long)this_error; - // Indicate default assumption of intra in the motion map - *fp_motion_map_ptr = 0; - // Set up limit values for motion vectors to prevent them extending outside the UMV borders x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); @@ -671,9 +604,6 @@ void vp8_first_pass(VP8_COMP *cpi) d->bmi.mv.as_mv.row = 0; d->bmi.mv.as_mv.col = 0; - // Save (0,0) error for later use - zero_error = motion_error; - // Test last reference frame using the previous best mv as the // starting point (best reference) for the search vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, @@ -726,6 +656,17 @@ void vp8_first_pass(VP8_COMP *cpi) if (motion_error <= this_error) { + // Keep a count of cases where the inter and intra were + // very close and very low. This helps with scene cut + // detection for example in cropped clips with black bars + // at the sides or top and bottom. 
+ if( (((this_error-intrapenalty) * 9) <= + (motion_error*10)) && + (this_error < (2*intrapenalty)) ) + { + neutral_count++; + } + d->bmi.mv.as_mv.row <<= 3; d->bmi.mv.as_mv.col <<= 3; this_error = motion_error; @@ -777,25 +718,6 @@ void vp8_first_pass(VP8_COMP *cpi) else if (d->bmi.mv.as_mv.col < 0) sum_in_vectors--; } - - // Compute how close (0,0) predictor is to best - // predictor in terms of their prediction error - zz_to_best_ratio = (10*zero_error + this_error/2) - / (this_error+!this_error); - - if ((zero_error < 50000) && - (zz_to_best_ratio <= 11) ) - *fp_motion_map_ptr = 1; - else - *fp_motion_map_ptr = 0; - } - else - { - // 0,0 mv was best - if( zero_error<50000 ) - *fp_motion_map_ptr = 2; - else - *fp_motion_map_ptr = 1; } } } @@ -809,9 +731,6 @@ void vp8_first_pass(VP8_COMP *cpi) recon_yoffset += 16; recon_uvoffset += 8; - - // Update the motion map - fp_motion_map_ptr++; } // adjust to the next row of mbs @@ -854,6 +773,7 @@ void vp8_first_pass(VP8_COMP *cpi) fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs; fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs; + fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs; if (mvcount > 0) { @@ -872,13 +792,10 @@ void vp8_first_pass(VP8_COMP *cpi) // than the full time between subsequent cpi->source_time_stamp s . fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp; - // don't want to do outputstats with a stack variable! + // don't want to do output stats with a stack variable! 
memcpy(cpi->this_frame_stats, &fps, sizeof(FIRSTPASS_STATS)); - memcpy((char*)cpi->this_frame_stats + sizeof(FIRSTPASS_STATS), - cpi->fp_motion_map, - sizeof(cpi->fp_motion_map[0]) * cpi->common.MBs); vp8_output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats); vp8_accumulate_stats(cpi->total_stats, &fps); } @@ -924,10 +841,10 @@ void vp8_first_pass(VP8_COMP *cpi) extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; #define BASE_ERRPERMB 150 -static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width) +static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; double err_per_mb = section_err / num_mbs; @@ -1024,10 +941,10 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_ return Q; } -static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width) +static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; double err_per_mb = section_err / num_mbs; @@ -1075,10 +992,10 @@ static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_band } // Estimate a worst case Q for a KF group -static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio) +static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs; int bits_per_mb_at_this_q; @@ -1173,11 +1090,10 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int 
section_ta // For cq mode estimate a cq level that matches the observed // complexity and data rate. -static int estimate_cq(VP8_COMP *cpi, double section_err, - int section_target_bandwitdh, int Height, int Width) +static int estimate_cq(VP8_COMP *cpi, double section_err, int section_target_bandwitdh) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; double err_per_mb = section_err / num_mbs; @@ -1331,8 +1247,6 @@ void vp8_init_second_pass(VP8_COMP *cpi) cpi->clip_bpe = cpi->bits_left / DOUBLE_DIVIDE_CHECK(cpi->modified_error_total); cpi->observed_bpe = cpi->clip_bpe; - - cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in; } void vp8_end_second_pass(VP8_COMP *cpi) @@ -1340,8 +1254,8 @@ void vp8_end_second_pass(VP8_COMP *cpi) } // This function gives and estimate of how badly we believe -// the predicition quality is decaying from frame to frame. -double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) +// the prediction quality is decaying from frame to frame. +double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) { double prediction_decay_rate; double motion_decay; @@ -1376,6 +1290,52 @@ double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) return prediction_decay_rate; } +// Function to test for a condition where a complex transition is followed +// by a static section. For example in slide shows where there is a fade +// between slides. This is to help with more optimal kf and gf positioning. +BOOL detect_transition_to_still( + VP8_COMP *cpi, + int frame_interval, + int still_interval, + double loop_decay_rate, + double decay_accumulator ) +{ + BOOL trans_to_still = FALSE; + + // Break clause to detect very still sections after motion + // For example a static image after a fade or other transition + // instead of a clean scene cut. 
+ if ( (frame_interval > MIN_GF_INTERVAL) && + (loop_decay_rate >= 0.999) && + (decay_accumulator < 0.9) ) + { + int j; + FIRSTPASS_STATS * position = cpi->stats_in; + FIRSTPASS_STATS tmp_next_frame; + double decay_rate; + + // Look ahead a few frames to see if static condition + // persists... + for ( j = 0; j < still_interval; j++ ) + { + if (EOF == vp8_input_stats(cpi, &tmp_next_frame)) + break; + + decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame); + if ( decay_rate < 0.999 ) + break; + } + // Reset file position + reset_fpf_position(cpi, position); + + // Only if it does do we signal a transition to still + if ( j == still_interval ) + trans_to_still = TRUE; + } + + return trans_to_still; +} + // Analyse and define a gf/arf group . static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { @@ -1406,8 +1366,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) int max_bits = frame_max_bits(cpi); // Max for a single frame - unsigned char *fpmm_pos; - unsigned int allow_alt_ref = cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames; @@ -1416,8 +1374,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) vp8_clear_system_state(); //__asm emms; - fpmm_pos = vp8_fpmm_get_pos(cpi); - start_pos = cpi->stats_in; vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean @@ -1528,7 +1484,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (r > GF_RMAX) r = GF_RMAX; - loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame); + loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); // Cumulative effect of decay decay_accumulator = decay_accumulator * loop_decay_rate; @@ -1537,48 +1493,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) boost_score += (decay_accumulator * r); // Break clause to detect very still sections after motion - // For example a staic image after a fade or other transition - // instead of a clean key frame. 
- if ( (i > MIN_GF_INTERVAL) && - (loop_decay_rate >= 0.999) && - (decay_accumulator < 0.9) ) + // For example a staic image after a fade or other transition. + if ( detect_transition_to_still( cpi, i, 5, + loop_decay_rate, decay_accumulator ) ) { - int j; - FIRSTPASS_STATS * position = cpi->stats_in; - FIRSTPASS_STATS tmp_next_frame; - double decay_rate; - - // Look ahead a few frames to see if static condition - // persists... - for ( j = 0; j < 4; j++ ) - { - if (EOF == vp8_input_stats(cpi, &tmp_next_frame)) - break; - - decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame); - if ( decay_rate < 0.999 ) - break; - } - reset_fpf_position(cpi, position); // Reset file position - - // Force GF not alt ref - if ( j == 4 ) - { - if (0) - { - FILE *f = fopen("fadegf.stt", "a"); - fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n", - cpi->common.current_video_frame+i, i, - loop_decay_rate, decay_accumulator, - boost_score ); - fclose(f); - } - - allow_alt_ref = FALSE; - - boost_score = old_boost_score; - break; - } + allow_alt_ref = FALSE; + boost_score = old_boost_score; + break; } // Break out conditions. @@ -1686,7 +1607,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks)); // Estimate if there are enough bits available to make worthwhile use of an arf. - tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width); + tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits); // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames. 
if (tmp_q < cpi->worst_quality) @@ -1749,20 +1670,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) } cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd; - - { - // Advance to & read in the motion map for those frames - // to be considered for filtering based on the position - // of the ARF - vp8_fpmm_reset_pos(cpi, cpi->fp_motion_map_stats_save); - - // Position at the 'earliest' frame to be filtered - vp8_advance_fpmm(cpi, - cpi->baseline_gf_interval - frames_bwd); - - // Read / create a motion map for the region of interest - vp8_input_fpmm(cpi); - } } else { @@ -1992,9 +1899,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) reset_fpf_position(cpi, start_pos); } - - // Reset the First pass motion map file position - vp8_fpmm_reset_pos(cpi, fpmm_pos); } // Allocate bits to a normal frame that is neither a gf an arf or a key frame. @@ -2076,13 +1980,6 @@ void vp8_second_pass(VP8_COMP *cpi) if (EOF == vp8_input_stats(cpi, &this_frame)) return; - vpx_memset(cpi->fp_motion_map, 0, - cpi->oxcf.arnr_max_frames*cpi->common.MBs); - cpi->fp_motion_map_stats_save = vp8_fpmm_get_pos(cpi); - - // Step over this frame's first pass motion map - vp8_advance_fpmm(cpi, 1); - this_frame_error = this_frame.ssim_weighted_pred_err; this_frame_intra_error = this_frame.intra_error; this_frame_coded_error = this_frame.coded_error; @@ -2214,8 +2111,7 @@ void vp8_second_pass(VP8_COMP *cpi) est_cq = estimate_cq( cpi, (cpi->total_coded_error_left / frames_left), - (int)(cpi->bits_left / frames_left), - cpi->common.Height, cpi->common.Width); + (int)(cpi->bits_left / frames_left)); cpi->cq_target_quality = cpi->oxcf.cq_level; if ( est_cq > cpi->cq_target_quality ) @@ -2227,9 +2123,7 @@ void vp8_second_pass(VP8_COMP *cpi) cpi->maxq_min_limit = cpi->best_quality; tmp_q = estimate_max_q( cpi, (cpi->total_coded_error_left / frames_left), - (int)(cpi->bits_left / frames_left), - cpi->common.Height, - cpi->common.Width); + 
(int)(cpi->bits_left / frames_left)); // Limit the maxq value returned subsequently. // This increases the risk of overspend or underspend if the initial @@ -2257,7 +2151,7 @@ void vp8_second_pass(VP8_COMP *cpi) if (frames_left < 1) frames_left = 1; - tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width); + tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left)); // Move active_worst_quality but in a damped way if (tmp_q > cpi->active_worst_quality) @@ -2285,7 +2179,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST (next_frame->pcnt_second_ref < 0.10) && ((this_frame->pcnt_inter < 0.05) || ( - (this_frame->pcnt_inter < .25) && + ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) && ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) && ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) || (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) || @@ -2332,7 +2226,9 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST // Test various breakout clauses if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || - ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) || + (((local_next_frame.pcnt_inter - + local_next_frame.pcnt_neutral) < 0.20) && + (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 0.5) || (local_next_frame.intra_error < 200) ) @@ -2363,13 +2259,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST } void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { - int i; + int i,j; FIRSTPASS_STATS last_frame; FIRSTPASS_STATS first_frame; FIRSTPASS_STATS next_frame; FIRSTPASS_STATS *start_position; - double 
decay_accumulator = 0; + double decay_accumulator = 1.0; double boost_score = 0; double old_boost_score = 0.0; double loop_decay_rate; @@ -2379,6 +2275,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) double kf_group_intra_err = 0.0; double kf_group_coded_err = 0.0; double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); + double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean @@ -2407,6 +2304,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) kf_mod_err = calculate_modified_err(cpi, this_frame); // find the next keyframe + i = 0; while (cpi->stats_in < cpi->stats_in_end) { // Accumulate kf group error @@ -2425,9 +2323,34 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (cpi->oxcf.auto_key && lookup_next_frame_stats(cpi, &next_frame) != EOF) { + // Normal scene cut check if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) break; + // How fast is prediction quality decaying + loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); + + // We want to know something about the recent past... rather than + // as used elsewhere where we are concened with decay in prediction + // quality since the last GF or KF. + recent_loop_decay[i%8] = loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < 8; j++) + { + decay_accumulator = decay_accumulator * recent_loop_decay[j]; + } + + // Special check for transition or high motion followed by a + // to a static scene. 
+ if ( detect_transition_to_still( cpi, i, + (cpi->key_frame_frequency-i), + loop_decay_rate, + decay_accumulator ) ) + { + break; + } + + // Step on to the next frame cpi->frames_to_key ++; @@ -2437,6 +2360,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) break; } else cpi->frames_to_key ++; + + i++; } // If there is a max kf interval set by the user we must obey it. @@ -2588,32 +2513,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (r > RMAX) r = RMAX; - // Adjust loop decay rate - //if ( next_frame.pcnt_inter < loop_decay_rate ) - loop_decay_rate = next_frame.pcnt_inter; - - // High % motion -> somewhat higher decay rate - motion_pct = next_frame.pcnt_motion; - motion_decay = (1.0 - (motion_pct / 20.0)); - if (motion_decay < loop_decay_rate) - loop_decay_rate = motion_decay; - - // Adjustment to decay rate based on speed of motion - { - double this_mv_rabs; - double this_mv_cabs; - double distance_factor; - - this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct); - this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct); - - distance_factor = sqrt((this_mv_rabs * this_mv_rabs) + - (this_mv_cabs * this_mv_cabs)) / 250.0; - distance_factor = ((distance_factor > 1.0) - ? 0.0 : (1.0 - distance_factor)); - if (distance_factor < loop_decay_rate) - loop_decay_rate = distance_factor; - } + // How fast is prediction quality decaying + loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); decay_accumulator = decay_accumulator * loop_decay_rate; decay_accumulator = decay_accumulator < 0.1 ? 
0.1 : decay_accumulator; @@ -2859,7 +2760,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); // Work out if spatial resampling is necessary - kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio); + kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, group_iiratio); // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section projected_bits_perframe = bits_per_frame; @@ -2930,7 +2831,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0; // Now try again and see what Q we get with the smaller image size - kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio); + kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, group_iiratio); if (0) { diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index fc0580d55..81108fe96 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -103,6 +103,10 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) // Pure C: vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame; +#if CONFIG_PSNR + cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c; + cpi->rtcd.variance.ssimpf = ssim_parms_c; +#endif #if ARCH_X86 || ARCH_X86_64 vp8_arch_x86_encoder_init(cpi); diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 33aaa2ca9..c210c1de2 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1415,7 +1415,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er int col_min = ref_col - distance; int col_max = ref_col + distance; - unsigned short sad_array8[8]; + DECLARE_ALIGNED_ARRAY(16, 
unsigned short, sad_array8, 8); unsigned int sad_array[3]; // Work out the mid point for the search diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 39610a73f..8965634fe 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -86,9 +86,11 @@ extern double vp8_calc_ssim YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int lumamask, - double *weight + double *weight, + const vp8_variance_rtcd_vtable_t *rtcd ); + extern double vp8_calc_ssimg ( YV12_BUFFER_CONFIG *source, @@ -281,12 +283,6 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi) vpx_free(cpi->active_map); cpi->active_map = 0; -#if !(CONFIG_REALTIME_ONLY) - // Delete first pass motion map - vpx_free(cpi->fp_motion_map); - cpi->fp_motion_map = 0; -#endif - vp8_de_alloc_frame_buffers(&cpi->common); vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf); @@ -1360,11 +1356,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) #if !(CONFIG_REALTIME_ONLY) vpx_free(cpi->total_stats); - cpi->total_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs)); + cpi->total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); vpx_free(cpi->this_frame_stats); - cpi->this_frame_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs)); + cpi->this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); if(!cpi->total_stats || !cpi->this_frame_stats) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, @@ -1462,8 +1458,7 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; - if (!cpi) - return; + cpi->oxcf = *oxcf; cpi->auto_gold = 1; cpi->auto_adjust_gold_quantizer = 1; @@ -1475,299 +1470,31 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->version = oxcf->Version; vp8_setup_version(cm); - if (oxcf == 0) - { - cpi->pass = 0; - - cpi->auto_worst_q = 0; - cpi->oxcf.best_allowed_q = MINQ; - cpi->oxcf.worst_allowed_q = MAXQ; - cpi->oxcf.cq_level = MINQ; - - cpi->oxcf.end_usage = USAGE_STREAM_FROM_SERVER; - 
cpi->oxcf.starting_buffer_level = 4000; - cpi->oxcf.optimal_buffer_level = 5000; - cpi->oxcf.maximum_buffer_size = 6000; - cpi->oxcf.under_shoot_pct = 90; - cpi->oxcf.allow_df = 0; - cpi->oxcf.drop_frames_water_mark = 20; - - cpi->oxcf.allow_spatial_resampling = 0; - cpi->oxcf.resample_down_water_mark = 40; - cpi->oxcf.resample_up_water_mark = 60; - - cpi->oxcf.fixed_q = cpi->interquantizer; - - cpi->filter_type = NORMAL_LOOPFILTER; - - if (cm->simpler_lpf) - cpi->filter_type = SIMPLE_LOOPFILTER; - - cpi->compressor_speed = 1; - cpi->horiz_scale = 0; - cpi->vert_scale = 0; - cpi->oxcf.two_pass_vbrbias = 50; - cpi->oxcf.two_pass_vbrmax_section = 400; - cpi->oxcf.two_pass_vbrmin_section = 0; - - cpi->oxcf.Sharpness = 0; - cpi->oxcf.noise_sensitivity = 0; - } - else - cpi->oxcf = *oxcf; - - - switch (cpi->oxcf.Mode) - { - - case MODE_REALTIME: - cpi->pass = 0; - cpi->compressor_speed = 2; - - if (cpi->oxcf.cpu_used < -16) - { - cpi->oxcf.cpu_used = -16; - } - - if (cpi->oxcf.cpu_used > 16) - cpi->oxcf.cpu_used = 16; - - break; - -#if !(CONFIG_REALTIME_ONLY) - case MODE_GOODQUALITY: - cpi->pass = 0; - cpi->compressor_speed = 1; - - if (cpi->oxcf.cpu_used < -5) - { - cpi->oxcf.cpu_used = -5; - } - - if (cpi->oxcf.cpu_used > 5) - cpi->oxcf.cpu_used = 5; - - break; - - case MODE_BESTQUALITY: - cpi->pass = 0; - cpi->compressor_speed = 0; - break; - - case MODE_FIRSTPASS: - cpi->pass = 1; - cpi->compressor_speed = 1; - break; - case MODE_SECONDPASS: - cpi->pass = 2; - cpi->compressor_speed = 1; - - if (cpi->oxcf.cpu_used < -5) - { - cpi->oxcf.cpu_used = -5; - } - - if (cpi->oxcf.cpu_used > 5) - cpi->oxcf.cpu_used = 5; - - break; - case MODE_SECONDPASS_BEST: - cpi->pass = 2; - cpi->compressor_speed = 0; - break; -#endif - } - - if (cpi->pass == 0) - cpi->auto_worst_q = 1; - - cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q]; - cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q]; - cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level]; - - if (oxcf->fixed_q >= 0) - 
{ - if (oxcf->worst_allowed_q < 0) - cpi->oxcf.fixed_q = q_trans[0]; - else - cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q]; - - if (oxcf->alt_q < 0) - cpi->oxcf.alt_q = q_trans[0]; - else - cpi->oxcf.alt_q = q_trans[oxcf->alt_q]; - - if (oxcf->key_q < 0) - cpi->oxcf.key_q = q_trans[0]; - else - cpi->oxcf.key_q = q_trans[oxcf->key_q]; - - if (oxcf->gold_q < 0) - cpi->oxcf.gold_q = q_trans[0]; - else - cpi->oxcf.gold_q = q_trans[oxcf->gold_q]; - - } - - cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; - cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG; - - //cpi->use_golden_frame_only = 0; - //cpi->use_last_frame_only = 0; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 1; - cm->refresh_entropy_probs = 1; - - if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3) - cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions; - - setup_features(cpi); - - { - int i; - - for (i = 0; i < MAX_MB_SEGMENTS; i++) - cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; - } - - // At the moment the first order values may not be > MAXQ - if (cpi->oxcf.fixed_q > MAXQ) - cpi->oxcf.fixed_q = MAXQ; - - // local file playback mode == really big buffer - if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) - { - cpi->oxcf.starting_buffer_level = 60000; - cpi->oxcf.optimal_buffer_level = 60000; - cpi->oxcf.maximum_buffer_size = 240000; - - } + // change includes all joint functionality + vp8_change_config(ptr, oxcf); + // Initialize active best and worst q and average q values. 
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; - // Convert target bandwidth from Kbit/s to Bit/s - cpi->oxcf.target_bandwidth *= 1000; + // Initialise the starting buffer levels cpi->oxcf.starting_buffer_level = rescale(cpi->oxcf.starting_buffer_level, cpi->oxcf.target_bandwidth, 1000); - if (cpi->oxcf.optimal_buffer_level == 0) - cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; - else - cpi->oxcf.optimal_buffer_level = - rescale(cpi->oxcf.optimal_buffer_level, - cpi->oxcf.target_bandwidth, 1000); - - if (cpi->oxcf.maximum_buffer_size == 0) - cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; - else - cpi->oxcf.maximum_buffer_size = - rescale(cpi->oxcf.maximum_buffer_size, - cpi->oxcf.target_bandwidth, 1000); - - cpi->buffer_level = cpi->oxcf.starting_buffer_level; + cpi->buffer_level = cpi->oxcf.starting_buffer_level; cpi->bits_off_target = cpi->oxcf.starting_buffer_level; - vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); - cpi->worst_quality = cpi->oxcf.worst_allowed_q; - cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; - cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; - cpi->best_quality = cpi->oxcf.best_allowed_q; - cpi->active_best_quality = cpi->oxcf.best_allowed_q; - cpi->cq_target_quality = cpi->oxcf.cq_level; - - cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? 
TRUE : FALSE; - cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; + cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; + cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; cpi->total_actual_bits = 0; - cpi->total_target_vs_actual = 0; - - // Only allow dropped frames in buffered mode - cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; - - cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; - - if (!cm->use_bilinear_mc_filter) - cm->mcomp_filter_type = SIXTAP; - else - cm->mcomp_filter_type = BILINEAR; - - cpi->target_bandwidth = cpi->oxcf.target_bandwidth; - - cm->Width = cpi->oxcf.Width ; - cm->Height = cpi->oxcf.Height ; - - cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8 - - cm->horiz_scale = cpi->horiz_scale; - cm->vert_scale = cpi->vert_scale ; - - // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) - if (cpi->oxcf.Sharpness > 7) - cpi->oxcf.Sharpness = 7; - - cm->sharpness_level = cpi->oxcf.Sharpness; - - if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) - { - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); - - Scale2Ratio(cm->horiz_scale, &hr, &hs); - Scale2Ratio(cm->vert_scale, &vr, &vs); - - // always go to the next whole number - cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs; - cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; - } - - if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width || - ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height || - cm->yv12_fb[cm->lst_fb_idx].y_width == 0) - { - alloc_raw_frame_buffers(cpi); - vp8_alloc_compressor_data(cpi); - } - - // Clamp KF frame size to quarter of data rate - if (cpi->intra_frame_target > 
cpi->target_bandwidth >> 2) - cpi->intra_frame_target = cpi->target_bandwidth >> 2; - - if (cpi->oxcf.fixed_q >= 0) - { - cpi->last_q[0] = cpi->oxcf.fixed_q; - cpi->last_q[1] = cpi->oxcf.fixed_q; - } - - cpi->Speed = cpi->oxcf.cpu_used; - - // force to allowlag to 0 if lag_in_frames is 0; - if (cpi->oxcf.lag_in_frames == 0) - { - cpi->oxcf.allow_lag = 0; - } - // Limit on lag buffers as these are not currently dynamically allocated - else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) - cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; - - // YX Temp - cpi->last_alt_ref_sei = -1; - cpi->is_src_frame_alt_ref = 0; - cpi->is_next_src_alt_ref = 0; - -#if 0 - // Experimental RD Code - cpi->frame_distortion = 0; - cpi->last_frame_distortion = 0; -#endif + cpi->total_target_vs_actual = 0; #if VP8_TEMPORAL_ALT_REF - - cpi->use_weighted_temporal_filter = 0; - { int i; @@ -1779,12 +1506,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) #endif } -/* - * This function needs more clean up, i.e. be more tuned torwards - * change_config rather than init_config !!!!!!!!!!!!!!!! - * YX - 5/28/2009 - * - */ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) { @@ -1897,7 +1618,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) } - cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; + cpi->baseline_gf_interval = + cpi->oxcf.alt_freq ? 
cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG; @@ -1908,7 +1630,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->refresh_entropy_probs = 1; if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3) - cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions; + cm->multi_token_partition = + (TOKEN_PARTITION) cpi->oxcf.token_partitions; setup_features(cpi); @@ -1929,16 +1652,12 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->oxcf.starting_buffer_level = 60000; cpi->oxcf.optimal_buffer_level = 60000; cpi->oxcf.maximum_buffer_size = 240000; - } // Convert target bandwidth from Kbit/s to Bit/s cpi->oxcf.target_bandwidth *= 1000; - cpi->oxcf.starting_buffer_level = - rescale(cpi->oxcf.starting_buffer_level, - cpi->oxcf.target_bandwidth, 1000); - + // Set or reset optimal and maximum buffer levels. if (cpi->oxcf.optimal_buffer_level == 0) cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; else @@ -1953,31 +1672,41 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) rescale(cpi->oxcf.maximum_buffer_size, cpi->oxcf.target_bandwidth, 1000); - cpi->buffer_level = cpi->oxcf.starting_buffer_level; - cpi->bits_off_target = cpi->oxcf.starting_buffer_level; - + // Set up frame rate and related parameters rate control values. vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); + + // Set absolute upper and lower quality limits cpi->worst_quality = cpi->oxcf.worst_allowed_q; - cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; - cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; cpi->best_quality = cpi->oxcf.best_allowed_q; - cpi->active_best_quality = cpi->oxcf.best_allowed_q; - cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? 
TRUE : FALSE; - cpi->cq_target_quality = cpi->oxcf.cq_level; + // active values should only be modified if out of new range + if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) + { + cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; + } + // less likely + else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) + { + cpi->active_worst_quality = cpi->oxcf.best_allowed_q; + } + if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) + { + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + } + // less likely + else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) + { + cpi->active_best_quality = cpi->oxcf.worst_allowed_q; + } - cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; - cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; + cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE; - cpi->total_actual_bits = 0; - cpi->total_target_vs_actual = 0; + cpi->cq_target_quality = cpi->oxcf.cq_level; // Only allow dropped frames in buffered mode - cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; + cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; - cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; + cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; if (!cm->use_bilinear_mc_filter) cm->mcomp_filter_type = SIXTAP; @@ -1992,7 +1721,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->horiz_scale = cpi->horiz_scale; cm->vert_scale = cpi->vert_scale ; - cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8 + // As per VP8 + cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) if (cpi->oxcf.Sharpness > 7) @@ -2013,8 +1743,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / 
vs; } - if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width || - ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height || + if (((cm->Width + 15) & 0xfffffff0) != + cm->yv12_fb[cm->lst_fb_idx].y_width || + ((cm->Height + 15) & 0xfffffff0) != + cm->yv12_fb[cm->lst_fb_idx].y_height || cm->yv12_fb[cm->lst_fb_idx].y_width == 0) { alloc_raw_frame_buffers(cpi); @@ -2153,12 +1885,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols)); cpi->active_map_enabled = 0; -#if !(CONFIG_REALTIME_ONLY) - // Create the first pass motion map structure and set to 0 - // Allocate space for maximum of 15 buffers - CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(15*cpi->common.MBs, 1)); -#endif - #if 0 // Experimental code for lagged and one pass // Initialise one_pass GF frames stats @@ -2308,7 +2034,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) } else if (cpi->pass == 2) { - size_t packet_sz = vp8_firstpass_stats_sz(cpi->common.MBs); + size_t packet_sz = sizeof(FIRSTPASS_STATS); int packets = oxcf->two_pass_stats_in.sz / packet_sz; cpi->stats_in = oxcf->two_pass_stats_in.buf; @@ -3509,6 +3235,89 @@ static BOOL recode_loop_test( VP8_COMP *cpi, return force_recode; } +void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) +{ + if (cm->no_lpf) + { + cm->filter_level = 0; + } + else + { + struct vpx_usec_timer timer; + + vp8_clear_system_state(); + + vpx_usec_timer_start(&timer); + if (cpi->sf.auto_filter == 0) + vp8cx_pick_filter_level_fast(cpi->Source, cpi); + + else + vp8cx_pick_filter_level(cpi->Source, cpi); + + vpx_usec_timer_mark(&timer); + cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); + } + +#if CONFIG_MULTITHREAD + sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ +#endif + + if (cm->filter_level > 0) + { + vp8cx_set_alt_lf_level(cpi, cm->filter_level); + vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level); + 
cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + } + + vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show); + + { + YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx]; + YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; + YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx]; + YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx]; + // At this point the new frame has been encoded. + // If any buffer copy / swapping is signaled it should be done here. + if (cm->frame_type == KEY_FRAME) + { + vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12); + vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12); + } + else // For non key frames + { + // Code to copy between reference buffers + if (cm->copy_buffer_to_arf) + { + if (cm->copy_buffer_to_arf == 1) + { + if (cm->refresh_last_frame) + // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. + vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12); + else + vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12); + } + else if (cm->copy_buffer_to_arf == 2) + vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12); + } + + if (cm->copy_buffer_to_gf) + { + if (cm->copy_buffer_to_gf == 1) + { + if (cm->refresh_last_frame) + // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. 
+ vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12); + else + vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12); + } + else if (cm->copy_buffer_to_gf == 2) + vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12); + } + } + } +} + static void encode_frame_to_data_rate ( VP8_COMP *cpi, @@ -3542,6 +3351,7 @@ static void encode_frame_to_data_rate int drop_mark50 = drop_mark / 4; int drop_mark25 = drop_mark / 8; + // Clear down mmx registers to allow floating point in what follows vp8_clear_system_state(); @@ -3862,11 +3672,12 @@ static void encode_frame_to_data_rate } } - // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames - // to prevent bits just going to waste. + // If CBR and the buffer is as full then it is reasonable to allow + // higher quality on the frames to prevent bits just going to waste. if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - // Note that the use of >= here elliminates the risk of a devide by 0 error in the else if clause + // Note that the use of >= here elliminates the risk of a devide + // by 0 error in the else if clause if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) cpi->active_best_quality = cpi->best_quality; @@ -3879,6 +3690,20 @@ static void encode_frame_to_data_rate } } } + // Make sure constrained quality mode limits are adhered to for the first + // few frames of one pass encodes + else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) + { + if ( (cm->frame_type == KEY_FRAME) || + cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame ) + { + cpi->active_best_quality = cpi->best_quality; + } + else if (cpi->active_best_quality < cpi->cq_target_quality) + { + cpi->active_best_quality = cpi->cq_target_quality; + } + } // Clip the active best and worst quality values to limits if (cpi->active_worst_quality > cpi->worst_quality) @@ -4058,8 +3883,8 @@ static void encode_frame_to_data_rate vp8_setup_key_frame(cpi); // transform / motion compensation build reconstruction frame - 
vp8_encode_frame(cpi); + cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi); cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0; @@ -4408,92 +4233,43 @@ static void encode_frame_to_data_rate else cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; - if (cm->no_lpf) + +#if CONFIG_MULTITHREAD + if (cpi->b_multi_threaded) { - cm->filter_level = 0; + sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */ } else +#endif { - struct vpx_usec_timer timer; - - vpx_usec_timer_start(&timer); - - if (cpi->sf.auto_filter == 0) - vp8cx_pick_filter_level_fast(cpi->Source, cpi); - else - vp8cx_pick_filter_level(cpi->Source, cpi); - - vpx_usec_timer_mark(&timer); - - cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); - } - - if (cm->filter_level > 0) - { - vp8cx_set_alt_lf_level(cpi, cm->filter_level); - vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level); - cm->last_filter_type = cm->filter_type; - cm->last_sharpness_level = cm->sharpness_level; + loopfilter_frame(cpi, cm); } - /* Move storing frame_type out of the above loop since it is also - * needed in motion search besides loopfilter */ - cm->last_frame_type = cm->frame_type; - - vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show); - if (cpi->oxcf.error_resilient_mode == 1) { cm->refresh_entropy_probs = 0; } +#if CONFIG_MULTITHREAD + /* wait that filter_level is picked so that we can continue with stream packing */ + if (cpi->b_multi_threaded) + sem_wait(&cpi->h_event_end_lpf); +#endif + // build the bitstream vp8_pack_bitstream(cpi, dest, size); +#if CONFIG_MULTITHREAD + /* wait for loopfilter thread done */ + if (cpi->b_multi_threaded) { - YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx]; - YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; - YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx]; - YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx]; - // At this point the new frame has been encoded coded. 
- // If any buffer copy / swaping is signalled it should be done here. - if (cm->frame_type == KEY_FRAME) - { - vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12); - vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12); - } - else // For non key frames - { - // Code to copy between reference buffers - if (cm->copy_buffer_to_arf) - { - if (cm->copy_buffer_to_arf == 1) - { - if (cm->refresh_last_frame) - // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. - vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12); - else - vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12); - } - else if (cm->copy_buffer_to_arf == 2) - vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12); - } - - if (cm->copy_buffer_to_gf) - { - if (cm->copy_buffer_to_gf == 1) - { - if (cm->refresh_last_frame) - // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. - vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12); - else - vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12); - } - else if (cm->copy_buffer_to_gf == 2) - vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12); - } - } + sem_wait(&cpi->h_event_end_lpf); } +#endif + + /* Move storing frame_type out of the above loop since it is also + * needed in motion search besides loopfilter */ + cm->last_frame_type = cm->frame_type; // Update rate control heuristics cpi->total_byte_count += (*size); @@ -5328,7 +5104,9 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) + { generate_psnr_packet(cpi); + } #if CONFIG_PSNR @@ -5344,12 +5122,35 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if (cpi->b_calculate_psnr) { double y, u, v; - double sq_error; - double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error); - - cpi->total_y += y; - cpi->total_u 
+= u; - cpi->total_v += v; + double ye,ue,ve; + double frame_psnr; + YV12_BUFFER_CONFIG *orig = cpi->Source; + YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer; + int y_samples = orig->y_height * orig->y_width ; + int uv_samples = orig->uv_height * orig->uv_width ; + int t_samples = y_samples + 2 * uv_samples; + long long sq_error; + + ye = calc_plane_error(orig->y_buffer, orig->y_stride, + recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height, + IF_RTCD(&cpi->rtcd.variance)); + + ue = calc_plane_error(orig->u_buffer, orig->uv_stride, + recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + ve = calc_plane_error(orig->v_buffer, orig->uv_stride, + recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + sq_error = ye + ue + ve; + + frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error); + + cpi->total_y += vp8_mse2psnr(y_samples, 255.0, ye); + cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, ue); + cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve); cpi->total_sq_error += sq_error; cpi->total += frame_psnr; { @@ -5358,17 +5159,35 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc)); vp8_clear_system_state(); - frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error); - frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight); - cpi->summed_quality += frame_ssim2 * weight; - cpi->summed_weights += weight; + ye = calc_plane_error(orig->y_buffer, orig->y_stride, + pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height, + IF_RTCD(&cpi->rtcd.variance)); - cpi->totalp_y += y2; - cpi->totalp_u += u2; - cpi->totalp_v += v2; - cpi->totalp += frame_psnr2; + ue = calc_plane_error(orig->u_buffer, orig->uv_stride, + 
pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + ve = calc_plane_error(orig->v_buffer, orig->uv_stride, + pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + sq_error = ye + ue + ve; + + frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error); + + cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye); + cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue); + cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve); cpi->total_sq_error2 += sq_error; + cpi->totalp += frame_psnr2; + + frame_ssim2 = vp8_calc_ssim(cpi->Source, + &cm->post_proc_buffer, 1, &weight, + IF_RTCD(&cpi->rtcd.variance)); + + cpi->summed_quality += frame_ssim2 * weight; + cpi->summed_weights += weight; } } diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index b66131d15..0e53f6803 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -99,6 +99,7 @@ typedef struct double pcnt_inter; double pcnt_motion; double pcnt_second_ref; + double pcnt_neutral; double MVr; double mvr_abs; double MVc; @@ -495,11 +496,6 @@ typedef struct struct vpx_codec_pkt_list *output_pkt_list; int first_pass_done; -#if !(CONFIG_REALTIME_ONLY) - unsigned char *fp_motion_map; - unsigned char *fp_motion_map_stats, *fp_motion_map_stats_save; -#endif - #if 0 // Experimental code for lagged and one pass ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS]; @@ -603,12 +599,17 @@ typedef struct int encoding_thread_count; pthread_t *h_encoding_thread; + pthread_t h_filter_thread; + MB_ROW_COMP *mb_row_ei; ENCODETHREAD_DATA *en_thread_data; + LPFTHREAD_DATA lpf_thread_data; //events sem_t *h_event_start_encoding; sem_t h_event_end_encoding; + sem_t h_event_start_lpf; + sem_t h_event_end_lpf; #endif TOKENLIST *tplist; @@ -641,8 +642,6 @@ typedef struct YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; int fixed_divide[512]; #endif - // Flag to indicate temporal filter method - int use_weighted_temporal_filter; #if 
CONFIG_PSNR int count; diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index bfffe43d9..9797f5f25 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -842,7 +842,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) { int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100; - if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level)) + if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || + (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level)) { int percent_low = 0; @@ -851,9 +852,12 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) // If we are are below the optimal buffer fullness level and adherence // to buffering contraints is important to the end useage then adjust // the per frame target. - if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level)) + if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + (cpi->buffer_level < cpi->oxcf.optimal_buffer_level)) { - percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits; + percent_low = + (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / + one_percent_bits; if (percent_low > 100) percent_low = 100; @@ -864,7 +868,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) else if (cpi->bits_off_target < 0) { // Adjust per frame data target downwards to compensate. - percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8)); + percent_low = (int)(100 * -cpi->bits_off_target / + (cpi->total_byte_count * 8)); if (percent_low > 100) percent_low = 100; @@ -873,39 +878,60 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) } // lower the target bandwidth for this frame. 
- cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100; + cpi->this_frame_target = + (cpi->this_frame_target * (100 - (percent_low / 2))) / 100; - // Are we using allowing control of active_worst_allowed_q according to buffer level. + // Are we using allowing control of active_worst_allowed_q + // according to buffer level. if (cpi->auto_worst_q) { int critical_buffer_level; - // For streaming applications the most important factor is cpi->buffer_level as this takes - // into account the specified short term buffering constraints. However, hitting the long - // term clip data rate target is also important. + // For streaming applications the most important factor is + // cpi->buffer_level as this takes into account the + // specified short term buffering constraints. However, + // hitting the long term clip data rate target is also + // important. if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - // Take the smaller of cpi->buffer_level and cpi->bits_off_target - critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target; + // Take the smaller of cpi->buffer_level and + // cpi->bits_off_target + critical_buffer_level = + (cpi->buffer_level < cpi->bits_off_target) + ? cpi->buffer_level : cpi->bits_off_target; } - // For local file playback short term buffering contraints are less of an issue + // For local file playback short term buffering contraints + // are less of an issue else { - // Consider only how we are doing for the clip as a whole + // Consider only how we are doing for the clip as a + // whole critical_buffer_level = cpi->bits_off_target; } - // Set the active worst quality based upon the selected buffer fullness number. + // Set the active worst quality based upon the selected + // buffer fullness number. 
if (critical_buffer_level < cpi->oxcf.optimal_buffer_level) { - if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4)) + if ( critical_buffer_level > + (cpi->oxcf.optimal_buffer_level >> 2) ) { - int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi; - int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4)); - - // Step active worst quality down from cpi->ni_av_qi when (critical_buffer_level == cpi->optimal_buffer_level) - // to cpi->oxcf.worst_allowed_q when (critical_buffer_level == cpi->optimal_buffer_level/4) - cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4)); + INT64 qadjustment_range = + cpi->worst_quality - cpi->ni_av_qi; + INT64 above_base = + (critical_buffer_level - + (cpi->oxcf.optimal_buffer_level >> 2)); + + // Step active worst quality down from + // cpi->ni_av_qi when (critical_buffer_level == + // cpi->optimal_buffer_level) to + // cpi->worst_quality when + // (critical_buffer_level == + // cpi->optimal_buffer_level >> 2) + cpi->active_worst_quality = + cpi->worst_quality - + ((qadjustment_range * above_base) / + (cpi->oxcf.optimal_buffer_level*3>>2)); } else { @@ -965,6 +991,15 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) // Set the active worst quality cpi->active_worst_quality = cpi->worst_quality; } + + // Special trap for constrained quality mode + // "active_worst_quality" may never drop below cq level + // for any frame type. 
+ if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && + cpi->active_worst_quality < cpi->cq_target_quality) + { + cpi->active_worst_quality = cpi->cq_target_quality; + } } // Test to see if we have to drop a frame diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 8aaca0917..b0dcfe0a4 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1968,7 +1968,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int else cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; } - else if (vp8_ref_frame_order[mode_index] == SPLITMV) + else if (vp8_mode_order[mode_index] == SPLITMV) cpi->zbin_mode_boost = 0; else cpi->zbin_mode_boost = MV_ZBIN_BOOST; diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index 4ebcba1a1..64d67c6dd 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -11,298 +11,13 @@ #include "vpx_scale/yv12config.h" #include "math.h" +#include "onyx_int.h" -#define C1 (float)(64 * 64 * 0.01*255*0.01*255) -#define C2 (float)(64 * 64 * 0.03*255*0.03*255) - -static int width_y; -static int height_y; -static int height_uv; -static int width_uv; -static int stride_uv; -static int stride; -static int lumimask; -static int luminance; -static double plane_summed_weights = 0; - -static short img12_sum_block[8*4096*4096*2] ; - -static short img1_sum[8*4096*2]; -static short img2_sum[8*4096*2]; -static int img1_sq_sum[8*4096*2]; -static int img2_sq_sum[8*4096*2]; -static int img12_mul_sum[8*4096*2]; - - -double vp8_similarity -( - int mu_x, - int mu_y, - int pre_mu_x2, - int pre_mu_y2, - int pre_mu_xy2 -) -{ - int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy; - - mu_x2 = mu_x * mu_x; - mu_y2 = mu_y * mu_y; - mu_xy = mu_x * mu_y; - - theta_x2 = 64 * pre_mu_x2 - mu_x2; - theta_y2 = 64 * pre_mu_y2 - mu_y2; - theta_xy = 64 * pre_mu_xy2 - mu_xy; - - return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2)); -} - -double vp8_ssim -( - const unsigned char *img1, - const unsigned char 
*img2, - int stride_img1, - int stride_img2, - int width, - int height -) -{ - int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp; - - double plane_quality, weight, mean; - - short *img1_sum_ptr1, *img1_sum_ptr2; - short *img2_sum_ptr1, *img2_sum_ptr2; - int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2; - int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2; - int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2; - - plane_quality = 0; - - if (lumimask) - plane_summed_weights = 0.0f; - else - plane_summed_weights = (height - 7) * (width - 7); - - //some prologue for the main loop - temp = 8 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum[x] = img1[x]; - img2_sum[x] = img2[x]; - img1_sq_sum[x] = img1[x] * img1[x]; - img2_sq_sum[x] = img2[x] * img2[x]; - img12_mul_sum[x] = img1[x] * img2[x]; - - img1_sum_ptr1[x] = 0; - img2_sum_ptr1[x] = 0; - img1_sq_sum_ptr1[x] = 0; - img2_sq_sum_ptr1[x] = 0; - img12_mul_sum_ptr1[x] = 0; - } - - //the main loop - for (y = 1; y < height; y++) - { - img1 += stride_img1; - img2 += stride_img2; - - temp = (y - 1) % 9 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - temp = y % 9 * width; - - img1_sum_ptr2 = img1_sum + temp; - img2_sum_ptr2 = img2_sum + temp; - img1_sq_sum_ptr2 = img1_sq_sum + temp; - img2_sq_sum_ptr2 = img2_sq_sum + temp; - img12_mul_sum_ptr2 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum_ptr2[x] = img1_sum_ptr1[x] + img1[x]; - img2_sum_ptr2[x] = img2_sum_ptr1[x] + img2[x]; - img1_sq_sum_ptr2[x] = img1_sq_sum_ptr1[x] + img1[x] * img1[x]; - img2_sq_sum_ptr2[x] = img2_sq_sum_ptr1[x] + img2[x] * img2[x]; - 
img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x]; - } - - if (y > 6) - { - //calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum - temp = (y + 1) % 9 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum_ptr1[x] = img1_sum_ptr2[x] - img1_sum_ptr1[x]; - img2_sum_ptr1[x] = img2_sum_ptr2[x] - img2_sum_ptr1[x]; - img1_sq_sum_ptr1[x] = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x]; - img2_sq_sum_ptr1[x] = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x]; - img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x]; - } - - //here we calculate the sum over the 8x8 block of pixels - //this is done by sliding a window across the column sums for the last 8 lines - //each time adding the new column sum, and subtracting the one which fell out of the window - img1_block = 0; - img2_block = 0; - img1_sq_block = 0; - img2_sq_block = 0; - img12_mul_block = 0; - - //prologue, and calculation of simularity measure from the first 8 column sums - for (x = 0; x < 8; x++) - { - img1_block += img1_sum_ptr1[x]; - img2_block += img2_sum_ptr1[x]; - img1_sq_block += img1_sq_sum_ptr1[x]; - img2_sq_block += img2_sq_sum_ptr1[x]; - img12_mul_block += img12_mul_sum_ptr1[x]; - } - - if (lumimask) - { - y2 = y - 7; - x2 = 0; - - if (luminance) - { - mean = (img2_block + img1_block) / 128.0f; - - if (!(y2 % 2 || x2 % 2)) - *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; - } - else - { - mean = *(img12_sum_block + y2 * width_uv + x2); - mean += *(img12_sum_block + y2 * width_uv + x2 + 4); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4); - - mean /= 512.0f; - } - - weight = mean < 40 ? 0.0f : - (mean < 50 ? 
(mean - 40.0f) / 10.0f : 1.0f); - plane_summed_weights += weight; - - plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - else - plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - - //and for the rest - for (x = 8; x < width; x++) - { - img1_block = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8]; - img2_block = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8]; - img1_sq_block = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8]; - img2_sq_block = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8]; - img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8]; - - if (lumimask) - { - y2 = y - 7; - x2 = x - 7; - - if (luminance) - { - mean = (img2_block + img1_block) / 128.0f; - - if (!(y2 % 2 || x2 % 2)) - *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; - } - else - { - mean = *(img12_sum_block + y2 * width_uv + x2); - mean += *(img12_sum_block + y2 * width_uv + x2 + 4); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4); - - mean /= 512.0f; - } - - weight = mean < 40 ? 0.0f : - (mean < 50 ? 
(mean - 40.0f) / 10.0f : 1.0f); - plane_summed_weights += weight; - - plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - else - plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - } - } - - if (plane_summed_weights == 0) - return 1.0f; - else - return plane_quality / plane_summed_weights; -} - -double vp8_calc_ssim -( - YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, - int lumamask, - double *weight -) -{ - double a, b, c; - double frame_weight; - double ssimv; - - width_y = source->y_width; - height_y = source->y_height; - height_uv = source->uv_height; - width_uv = source->uv_width; - stride_uv = dest->uv_stride; - stride = dest->y_stride; - - lumimask = lumamask; - - luminance = 1; - a = vp8_ssim(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, source->y_width, source->y_height); - luminance = 0; - - frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7)); - - if (frame_weight == 0) - a = b = c = 1.0f; - else - { - b = vp8_ssim(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); - - c = vp8_ssim(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); - } - - ssimv = a * .8 + .1 * (b + c); - - *weight = frame_weight; - - return ssimv; -} - +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif // Google version of SSIM // SSIM #define KERNEL 3 @@ -520,3 +235,174 @@ double vp8_calc_ssimg *ssim_v /= uvsize; return ssim_all; } + + +void ssim_parms_c +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +) +{ + int i,j; + for(i=0;i<16;i++,s+=sp,r+=rp) + { + for(j=0;j<16;j++) + { + *sum_s += s[j]; + *sum_r += r[j]; + 
*sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +void ssim_parms_8x8_c +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +) +{ + int i,j; + for(i=0;i<8;i++,s+=sp,r+=rp) + { + for(j=0;j<8;j++) + { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +const static long long c1 = 426148; // (256^2*(.01*255)^2 +const static long long c2 = 3835331; //(256^2*(.03*255)^2 + +static double similarity +( + unsigned long sum_s, + unsigned long sum_r, + unsigned long sum_sq_s, + unsigned long sum_sq_r, + unsigned long sum_sxr, + int count +) +{ + long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2); + + long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ; + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256); +} +static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); +} + +// TODO: (jbb) tried to scale this function such that we may be able to use it +// for distortion metric in mode selection code ( provided we do a reconstruction) +long dssim(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long 
sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + double ssim3; + long long ssim_n; + long long ssim_d; + + rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2); + + ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ; + + ssim3 = 256 * (ssim_d-ssim_n) / ssim_d; + return (long)( 256*ssim3 * ssim3 ); +} +// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels +// such that the window regions overlap block boundaries to penalize blocking +// artifacts. + +double vp8_ssim2 +( + unsigned char *img1, + unsigned char *img2, + int stride_img1, + int stride_img2, + int width, + int height, + const vp8_variance_rtcd_vtable_t *rtcd +) +{ + int i,j; + + double ssim_total=0; + + // we can sample points as frequently as we like start with 1 per 8x8 + for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8) + { + for(j=0; j < width; j+=8 ) + { + ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd); + } + } + ssim_total /= (width/8 * height /8); + return ssim_total; + +} +double vp8_calc_ssim +( + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + int lumamask, + double *weight, + const vp8_variance_rtcd_vtable_t *rtcd +) +{ + double a, b, c; + double ssimv; + + a = vp8_ssim2(source->y_buffer, dest->y_buffer, + source->y_stride, dest->y_stride, source->y_width, + source->y_height, rtcd); + + b = vp8_ssim2(source->u_buffer, dest->u_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + + c = vp8_ssim2(source->v_buffer, dest->v_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c index 0f8e654a0..fd36b22eb 100644 --- a/vp8/encoder/temporal_filter.c +++ 
b/vp8/encoder/temporal_filter.c @@ -287,8 +287,7 @@ static void vp8_temporal_filter_iterate_c int byte; int frame; int mb_col, mb_row; - unsigned int filter_weight[MAX_LAG_BUFFERS]; - unsigned char *mm_ptr = cpi->fp_motion_map; + unsigned int filter_weight; int mb_cols = cpi->common.mb_cols; int mb_rows = cpi->common.mb_rows; int MBs = cpi->common.MBs; @@ -306,13 +305,6 @@ static void vp8_temporal_filter_iterate_c unsigned char *u_buffer = mbd->pre.u_buffer; unsigned char *v_buffer = mbd->pre.v_buffer; - if (!cpi->use_weighted_temporal_filter) - { - // Temporal filtering is unweighted - for (frame = 0; frame < frame_count; frame++) - filter_weight[frame] = 1; - } - for (mb_row = 0; mb_row < mb_rows; mb_row++) { #if ALT_REF_MC_ENABLED @@ -338,34 +330,9 @@ static void vp8_temporal_filter_iterate_c + (VP8BORDERINPIXELS - 19); #endif - // Read & process macroblock weights from motion map - if (cpi->use_weighted_temporal_filter) - { - weight_cap = 2; - - for (frame = alt_ref_index-1; frame >= 0; frame--) - { - w = *(mm_ptr + (frame+1)*MBs); - filter_weight[frame] = w < weight_cap ? w : weight_cap; - weight_cap = w; - } - - filter_weight[alt_ref_index] = 2; - - weight_cap = 2; - - for (frame = alt_ref_index+1; frame < frame_count; frame++) - { - w = *(mm_ptr + frame*MBs); - filter_weight[frame] = w < weight_cap ? w : weight_cap; - weight_cap = w; - } - - } - for (frame = 0; frame < frame_count; frame++) { - int err; + int err = 0; if (cpi->frames[frame] == NULL) continue; @@ -374,28 +341,25 @@ static void vp8_temporal_filter_iterate_c mbd->block[0].bmi.mv.as_mv.col = 0; #if ALT_REF_MC_ENABLED - //if (filter_weight[frame] == 0) - { #define THRESH_LOW 10000 #define THRESH_HIGH 20000 - // Correlation has been lost try MC - err = vp8_temporal_filter_find_matching_mb_c - (cpi, - cpi->frames[alt_ref_index], - cpi->frames[frame], - mb_y_offset, - THRESH_LOW); - - if (filter_weight[frame] < 2) - { - // Set weight depending on error - filter_weight[frame] = err<THRESH_LOW - ? 
2 : err<THRESH_HIGH ? 1 : 0; - } - } + // Find best match in this frame by MC + err = vp8_temporal_filter_find_matching_mb_c + (cpi, + cpi->frames[alt_ref_index], + cpi->frames[frame], + mb_y_offset, + THRESH_LOW); + #endif - if (filter_weight[frame] != 0) + // Assign higher weight to matching MB if it's error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + filter_weight = err<THRESH_LOW + ? 2 : err<THRESH_HIGH ? 1 : 0; + + if (filter_weight != 0) { // Construct the predictors vp8_temporal_filter_predictors_mb_c @@ -415,7 +379,7 @@ static void vp8_temporal_filter_iterate_c predictor, 16, strength, - filter_weight[frame], + filter_weight, accumulator, count); @@ -425,7 +389,7 @@ static void vp8_temporal_filter_iterate_c predictor + 256, 8, strength, - filter_weight[frame], + filter_weight, accumulator + 256, count + 256); @@ -435,7 +399,7 @@ static void vp8_temporal_filter_iterate_c predictor + 320, 8, strength, - filter_weight[frame], + filter_weight, accumulator + 320, count + 320); } @@ -491,7 +455,6 @@ static void vp8_temporal_filter_iterate_c byte += stride - 8; } - mm_ptr++; mb_y_offset += 16; mb_uv_offset += 8; } diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h index 5befd3b86..bf17ea8b6 100644 --- a/vp8/encoder/variance.h +++ b/vp8/encoder/variance.h @@ -85,6 +85,19 @@ unsigned int *sse \ ); +#define prototype_ssimpf(sym) \ + void (sym) \ + ( \ + unsigned char *s, \ + int sp, \ + unsigned char *r, \ + int rp, \ + unsigned long *sum_s, \ + unsigned long *sum_r, \ + unsigned long *sum_sq_s, \ + unsigned long *sum_sq_r, \ + unsigned long *sum_sxr \ + ); #define prototype_getmbss(sym) unsigned int (sym)(const short *) @@ -306,6 +319,15 @@ extern prototype_variance2(vp8_variance_get16x16var); #endif extern prototype_sad(vp8_variance_get4x4sse_cs); +#ifndef vp8_ssimpf +#define vp8_ssimpf ssim_parms_c +#endif +extern prototype_ssimpf(vp8_ssimpf) + +#ifndef vp8_ssimpf_8x8 +#define vp8_ssimpf_8x8 
ssim_parms_8x8_c +#endif +extern prototype_ssimpf(vp8_ssimpf_8x8) typedef prototype_sad(*vp8_sad_fn_t); typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t); @@ -315,6 +337,10 @@ typedef prototype_variance(*vp8_variance_fn_t); typedef prototype_variance2(*vp8_variance2_fn_t); typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t); typedef prototype_getmbss(*vp8_getmbss_fn_t); + +typedef prototype_ssimpf(*vp8_ssimpf_fn_t) + + typedef struct { vp8_sad_fn_t sad4x4; @@ -365,6 +391,11 @@ typedef struct vp8_sad_multi_d_fn_t sad8x8x4d; vp8_sad_multi_d_fn_t sad4x4x4d; +#if CONFIG_PSNR + vp8_ssimpf_fn_t ssimpf_8x8; + vp8_ssimpf_fn_t ssimpf; +#endif + } vp8_variance_rtcd_vtable_t; typedef struct @@ -378,6 +409,7 @@ typedef struct vp8_sad_multi_fn_t sdx3f; vp8_sad_multi1_fn_t sdx8f; vp8_sad_multi_d_fn_t sdx4df; + } vp8_variance_fn_ptr_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm index 21e2e5007..03ecec4b3 100644 --- a/vp8/encoder/x86/sad_sse4.asm +++ b/vp8/encoder/x86/sad_sse4.asm @@ -186,7 +186,7 @@ sym(vp8_sad16x16x8_sse4): PROCESS_16X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -224,7 +224,7 @@ sym(vp8_sad16x8x8_sse4): PROCESS_16X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -262,7 +262,7 @@ sym(vp8_sad8x8x8_sse4): PROCESS_8X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -303,7 +303,7 @@ sym(vp8_sad8x16x8_sse4): PROCESS_8X2X8 0 PROCESS_8X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -339,7 +339,7 @@ sym(vp8_sad4x4x8_sse4): PROCESS_4X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi diff --git a/vp8/encoder/x86/ssim_opt.asm 
b/vp8/encoder/x86/ssim_opt.asm new file mode 100644 index 000000000..c267cdb54 --- /dev/null +++ b/vp8/encoder/x86/ssim_opt.asm @@ -0,0 +1,215 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddq xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddq xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddq xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void ssim_parms_sse3( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection 
code. +global sym(vp8_ssim_parms_16x16_sse3) +sym(vp8_ssim_parms_16x16_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movq [rdi], xmm15; + mov rdi,arg(5) + movq [rdi], xmm14; + mov rdi,arg(6) + movq [rdi], xmm13; + mov rdi,arg(7) + movq [rdi], xmm12; + mov rdi,arg(8) + movq [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse3( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
+global sym(vp8_ssim_parms_8x8_sse3) +sym(vp8_ssim_parms_8x8_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +NextRow2: + + ;grab source and reference pixels + movq xmm5, [rsi] + movq xmm6, [rdi] + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz NextRow2 + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movq [rdi], xmm15; + mov rdi,arg(5) + movq [rdi], xmm14; + mov rdi,arg(6) + movq [rdi], xmm13; + mov rdi,arg(7) + movq [rdi], xmm12; + mov rdi,arg(8) + movq [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index 6cdc47bc9..c2c30deb2 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -627,6 +627,10 @@ filter_block2d_bil_var_sse2_loop: filter_block2d_bil_var_sse2_sp_only: movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 + je filter_block2d_bil_var_sse2_full_pixel + shl rdx, 5 lea rdx, [rdx + rcx] ; VFilter @@ -671,6 +675,35 @@ filter_block2d_bil_sp_only_loop: jmp filter_block2d_bil_variance +filter_block2d_bil_var_sse2_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 ; + +filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD 
PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movq xmm2, QWORD PTR [rdi] ; + punpcklbw xmm2, xmm0 ; + + psubw xmm1, xmm2 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_full_pixel_loop ; + + jmp filter_block2d_bil_variance + filter_block2d_bil_var_sse2_fp_only: mov rsi, arg(0) ;ref_ptr mov rdi, arg(2) ;src_ptr @@ -757,7 +790,7 @@ filter_block2d_bil_variance: ret -;void vp8_half_horiz_vert_variance16x_h_sse2 +;void vp8_half_horiz_vert_variance8x_h_sse2 ;( ; unsigned char *ref_ptr, ; int ref_pixels_per_line, @@ -767,8 +800,8 @@ filter_block2d_bil_variance: ; int *sum, ; unsigned int *sumsquared ;) -global sym(vp8_half_horiz_vert_variance16x_h_sse2) -sym(vp8_half_horiz_vert_variance16x_h_sse2): +global sym(vp8_half_horiz_vert_variance8x_h_sse2) +sym(vp8_half_horiz_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -802,7 +835,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2): add rsi, r8 %endif -vp8_half_horiz_vert_variance16x_h_1: +vp8_half_horiz_vert_variance8x_h_1: movq xmm1, QWORD PTR [rsi] ; movq xmm2, QWORD PTR [rsi+1] ; @@ -830,7 +863,7 @@ vp8_half_horiz_vert_variance16x_h_1: %endif sub rcx, 1 ; - jnz vp8_half_horiz_vert_variance16x_h_1 ; + jnz vp8_half_horiz_vert_variance8x_h_1 ; movdq2q mm6, xmm6 ; movdq2q mm7, xmm7 ; @@ -877,8 +910,123 @@ vp8_half_horiz_vert_variance16x_h_1: pop rbp ret +;void vp8_half_horiz_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_vert_variance16x_h_sse2) +sym(vp8_half_horiz_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog -;void vp8_half_vert_variance16x_h_sse2 + pxor xmm6, xmm6 ; error 
accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + + movdqu xmm5, XMMWORD PTR [rsi] + movdqu xmm3, XMMWORD PTR [rsi+1] + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 + + lea rsi, [rsi + rax] + +vp8_half_horiz_vert_variance16x_h_1: + movdqu xmm1, XMMWORD PTR [rsi] ; + movdqu xmm2, XMMWORD PTR [rsi+1] ; + pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 + + pavgb xmm5, xmm1 ; xmm = vertical average of the above + + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm4, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + psubw xmm5, xmm3 ; xmm5 -= xmm3 + + movq xmm3, QWORD PTR [rdi+8] + punpcklbw xmm3, xmm0 + psubw xmm4, xmm3 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm1 ; save xmm1 for use on the next row + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz vp8_half_horiz_vert_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void 
vp8_half_vert_variance8x_h_sse2 ;( ; unsigned char *ref_ptr, ; int ref_pixels_per_line, @@ -888,8 +1036,8 @@ vp8_half_horiz_vert_variance16x_h_1: ; int *sum, ; unsigned int *sumsquared ;) -global sym(vp8_half_vert_variance16x_h_sse2) -sym(vp8_half_vert_variance16x_h_sse2): +global sym(vp8_half_vert_variance8x_h_sse2) +sym(vp8_half_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -912,7 +1060,7 @@ sym(vp8_half_vert_variance16x_h_sse2): movsxd rax, dword ptr arg(1) ;ref_pixels_per_line pxor xmm0, xmm0 ; -vp8_half_vert_variance16x_h_1: +vp8_half_vert_variance8x_h_1: movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9 @@ -936,7 +1084,7 @@ vp8_half_vert_variance16x_h_1: %endif sub rcx, 1 ; - jnz vp8_half_vert_variance16x_h_1 ; + jnz vp8_half_vert_variance8x_h_1 ; movdq2q mm6, xmm6 ; movdq2q mm7, xmm7 ; @@ -983,8 +1131,115 @@ vp8_half_vert_variance16x_h_1: pop rbp ret +;void vp8_half_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_vert_variance16x_h_sse2) +sym(vp8_half_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr + + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + movdqu xmm5, XMMWORD PTR [rsi] + lea rsi, [rsi + rax ] + pxor xmm0, xmm0 + +vp8_half_vert_variance16x_h_1: + movdqu xmm3, XMMWORD PTR [rsi] + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 + punpckhbw xmm4, xmm0 + + movq xmm2, QWORD PTR [rdi] + punpcklbw xmm2, xmm0 + psubw xmm5, xmm2 + movq xmm2, QWORD PTR 
[rdi+8] + punpcklbw xmm2, xmm0 + psubw xmm4, xmm2 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm3 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 + jnz vp8_half_vert_variance16x_h_1 + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + -;void vp8_half_horiz_variance16x_h_sse2 +;void vp8_half_horiz_variance8x_h_sse2 ;( ; unsigned char *ref_ptr, ; int ref_pixels_per_line, @@ -994,8 +1249,8 @@ vp8_half_vert_variance16x_h_1: ; int *sum, ; unsigned int *sumsquared ;) -global sym(vp8_half_horiz_variance16x_h_sse2) -sym(vp8_half_horiz_variance16x_h_sse2): +global sym(vp8_half_horiz_variance8x_h_sse2) +sym(vp8_half_horiz_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -1017,7 +1272,7 @@ sym(vp8_half_horiz_variance16x_h_sse2): movsxd rcx, dword ptr arg(4) ;Height ; pxor xmm0, xmm0 ; -vp8_half_horiz_variance16x16_1: +vp8_half_horiz_variance8x_h_1: movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 @@ -1040,7 +1295,7 @@ vp8_half_horiz_variance16x16_1: add rdi, r9 %endif sub rcx, 1 ; - jnz vp8_half_horiz_variance16x16_1 ; + jnz vp8_half_horiz_variance8x_h_1 ; movdq2q mm6, xmm6 ; movdq2q mm7, xmm7 ; @@ -1087,6 +1342,109 @@ 
vp8_half_horiz_variance16x16_1: pop rbp ret +;void vp8_half_horiz_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_variance16x_h_sse2) +sym(vp8_half_horiz_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + +vp8_half_horiz_variance16x_h_1: + movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 + movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm1, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm1, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + psubw xmm1, xmm2 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm1 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm1, xmm1 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz vp8_half_horiz_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, 
xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret SECTION_RODATA ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm new file mode 100644 index 000000000..b1976328d --- /dev/null +++ b/vp8/encoder/x86/variance_impl_ssse3.asm @@ -0,0 +1,348 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + + +;void vp8_filter_block2d_bil_var_ssse3 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int xoffset, +; int yoffset, +; int *sum, +; unsigned int *sumsquared;; +; +;) +;Note: The filter coefficient at offset=0 is 128. Since the second register +;for Pmaddubsw is signed bytes, we must calculate zero offset seperately. 
+global sym(vp8_filter_block2d_bil_var_ssse3) +sym(vp8_filter_block2d_bil_var_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + push rbx + ; end prolog + + pxor xmm6, xmm6 + pxor xmm7, xmm7 + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je filter_block2d_bil_var_ssse3_sp_only + + shl rax, 4 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je filter_block2d_bil_var_ssse3_fp_only + + shl rdx, 4 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi+1] + movdqa xmm2, xmm0 + + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, [rax] + pmaddubsw xmm2, [rax] + + paddw xmm0, [GLOBAL(xmm_bi_rd)] + paddw xmm2, [GLOBAL(xmm_bi_rd)] + psraw xmm0, xmm_filter_shift + psraw xmm2, xmm_filter_shift + + packuswb xmm0, xmm2 + + movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line + lea rsi, [rsi + rbx] +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + +filter_block2d_bil_var_ssse3_loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rsi+1] + movdqa xmm3, xmm1 + + punpcklbw xmm1, xmm2 + punpckhbw xmm3, xmm2 + pmaddubsw xmm1, [rax] + pmaddubsw xmm3, [rax] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm3, xmm_filter_shift + packuswb xmm1, xmm3 + + movdqa xmm2, xmm0 + movdqa xmm0, xmm1 + movdqa xmm3, xmm2 + + punpcklbw xmm2, xmm1 + punpckhbw xmm3, xmm1 + pmaddubsw xmm2, [rdx] + pmaddubsw xmm3, [rdx] + + paddw xmm2, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm2, xmm_filter_shift + psraw xmm3, xmm_filter_shift + + movq xmm1, QWORD PTR [rdi] + 
pxor xmm4, xmm4 + punpcklbw xmm1, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm2, xmm1 + psubw xmm3, xmm5 + paddw xmm6, xmm2 + paddw xmm6, xmm3 + pmaddwd xmm2, xmm2 + pmaddwd xmm3, xmm3 + paddd xmm7, xmm2 + paddd xmm7, xmm3 + + lea rsi, [rsi + rbx] ;ref_pixels_per_line +%if ABI_IS_32BIT + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 + jnz filter_block2d_bil_var_ssse3_loop + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_ssse3_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; Both xoffset =0 and yoffset=0 + je filter_block2d_bil_var_ssse3_full_pixel + + shl rdx, 4 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + movdqu xmm1, XMMWORD PTR [rsi] + movdqa xmm0, xmm1 + + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + lea rsi, [rsi + rax] + +filter_block2d_bil_sp_only_loop: + movdqu xmm3, XMMWORD PTR [rsi] + movdqa xmm2, xmm1 + movdqa xmm0, xmm3 + + punpcklbw xmm1, xmm3 + punpckhbw xmm2, xmm3 + pmaddubsw xmm1, [rdx] + pmaddubsw xmm2, [rdx] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm2, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm2, xmm_filter_shift + + movq xmm3, QWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm3, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm1, xmm3 + psubw xmm2, xmm5 + paddw xmm6, xmm1 + paddw xmm6, xmm2 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm7, xmm1 + paddd xmm7, xmm2 + + movdqa xmm1, xmm0 + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 + jnz filter_block2d_bil_sp_only_loop + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_ssse3_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) 
;ref_pixels_per_line + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 + +filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD PTR [rsi] + punpcklbw xmm1, xmm0 + movq xmm2, QWORD PTR [rsi+8] + punpcklbw xmm2, xmm0 + + movq xmm3, QWORD PTR [rdi] + punpcklbw xmm3, xmm0 + movq xmm4, QWORD PTR [rdi+8] + punpcklbw xmm4, xmm0 + + psubw xmm1, xmm3 + psubw xmm2, xmm4 + paddw xmm6, xmm1 + paddw xmm6, xmm2 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm7, xmm1 + paddd xmm7, xmm2 + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + sub rcx, 1 + jnz filter_block2d_bil_full_pixel_loop + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_ssse3_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + +filter_block2d_bil_fp_only_loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rsi+1] + movdqa xmm3, xmm1 + + punpcklbw xmm1, xmm2 + punpckhbw xmm3, xmm2 + pmaddubsw xmm1, [rax] + pmaddubsw xmm3, [rax] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm3, xmm_filter_shift + + movq xmm2, XMMWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm2, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm1, xmm2 + psubw xmm3, xmm5 + paddw xmm6, xmm1 + paddw xmm6, xmm3 + pmaddwd xmm1, xmm1 + pmaddwd xmm3, xmm3 + paddd xmm7, xmm1 + paddd xmm7, xmm3 + + lea rsi, [rsi + rdx] + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 + jnz filter_block2d_bil_fp_only_loop + + jmp filter_block2d_bil_variance + +filter_block2d_bil_variance: + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq 
xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(7) ;[Sum] + mov rdi, arg(8) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rbx + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +xmm_bi_rd: + times 8 dw 64 +align 16 +vp8_bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 112, 16 + times 8 db 96, 32 + times 8 db 80, 48 + times 8 db 64, 64 + times 8 db 48, 80 + times 8 db 32, 96 + times 8 db 16, 112 diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c index 6eed98e07..07358c0c7 100644 --- a/vp8/encoder/x86/variance_mmx.c +++ b/vp8/encoder/x86/variance_mmx.c @@ -456,146 +456,6 @@ unsigned int vp8_sub_pixel_variance8x16_mmx return (xxsum - ((xsum * xsum) >> 7)); } -unsigned int vp8_i_variance16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - *sse = var; - return (var - ((avg * avg) >> 8)); - -} - -unsigned int vp8_i_variance8x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, 
sum1, avg; - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - - *sse = var; - return (var - ((avg * avg) >> 7)); - -} - -unsigned int vp8_i_sub_pixel_variance16x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - int f2soffset = (src_pixels_per_line >> 1); - int f2doffset = (dst_pixels_per_line >> 1); - - - vp8_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - - vp8_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - vp8_filter_block2d_bil_var_mmx( - src_ptr + f2soffset, src_pixels_per_line, - dst_ptr + f2doffset, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - vp8_filter_block2d_bil_var_mmx( - src_ptr + f2soffset + 8, src_pixels_per_line, - dst_ptr + f2doffset + 8, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - *sse = xxsum0; - return (xxsum0 - ((xsum0 * xsum0) >> 8)); -} - - -unsigned int vp8_i_sub_pixel_variance8x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - 
int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - int f2soffset = (src_pixels_per_line >> 1); - int f2doffset = (dst_pixels_per_line >> 1); - - - vp8_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - - vp8_filter_block2d_bil_var_mmx( - src_ptr + f2soffset, src_pixels_per_line, - dst_ptr + f2doffset, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - *sse = xxsum0; - return (xxsum0 - ((xsum0 * xsum0) >> 7)); -} - unsigned int vp8_variance_halfpixvar16x16_h_mmx( const unsigned char *src_ptr, diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index 7cf6a6308..0edda3062 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -81,6 +81,16 @@ void vp8_filter_block2d_bil_var_sse2 int *sum, unsigned int *sumsquared ); +void vp8_half_horiz_vert_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); void vp8_half_horiz_vert_variance16x_h_sse2 ( const unsigned char *ref_ptr, @@ -91,6 +101,16 @@ void vp8_half_horiz_vert_variance16x_h_sse2 int *sum, unsigned int *sumsquared ); +void vp8_half_horiz_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); void vp8_half_horiz_variance16x_h_sse2 ( const unsigned char *ref_ptr, @@ -101,6 +121,16 @@ void vp8_half_horiz_variance16x_h_sse2 int *sum, unsigned int *sumsquared ); +void vp8_half_vert_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int 
src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); void vp8_half_vert_variance16x_h_sse2 ( const unsigned char *ref_ptr, @@ -262,21 +292,21 @@ unsigned int vp8_sub_pixel_variance8x8_wmt if (xoffset == 4 && yoffset == 0) { - vp8_half_horiz_variance16x_h_sse2( + vp8_half_horiz_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum, &xxsum); } else if (xoffset == 0 && yoffset == 4) { - vp8_half_vert_variance16x_h_sse2( + vp8_half_vert_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum, &xxsum); } else if (xoffset == 4 && yoffset == 4) { - vp8_half_horiz_vert_variance16x_h_sse2( + vp8_half_horiz_vert_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum, &xxsum); @@ -317,11 +347,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - - vp8_half_horiz_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); } else if (xoffset == 0 && yoffset == 4) { @@ -329,11 +354,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - - vp8_half_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); } else if (xoffset == 4 && yoffset == 4) { @@ -341,11 +361,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); } else { @@ -356,17 +371,16 @@ unsigned int vp8_sub_pixel_variance16x16_wmt &xsum0, &xxsum0 ); - vp8_filter_block2d_bil_var_sse2( src_ptr + 8, src_pixels_per_line, dst_ptr + 8, dst_pixels_per_line, 16, xoffset, yoffset, &xsum1, &xxsum1 ); + xsum0 += 
xsum1; + xxsum0 += xxsum1; } - xsum0 += xsum1; - xxsum0 += xxsum1; *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 8)); } @@ -406,11 +420,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum0, &xxsum0); - - vp8_half_horiz_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - &xsum1, &xxsum1); } else if (xoffset == 0 && yoffset == 4) { @@ -418,11 +427,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum0, &xxsum0); - - vp8_half_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - &xsum1, &xxsum1); } else if (xoffset == 4 && yoffset == 4) { @@ -430,11 +434,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum0, &xxsum0); - - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - &xsum1, &xxsum1); } else { @@ -449,11 +448,10 @@ unsigned int vp8_sub_pixel_variance16x8_wmt dst_ptr + 8, dst_pixels_per_line, 8, xoffset, yoffset, &xsum1, &xxsum1); + xsum0 += xsum1; + xxsum0 += xxsum1; } - xsum0 += xsum1; - xxsum0 += xxsum1; - *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 7)); } @@ -474,21 +472,21 @@ unsigned int vp8_sub_pixel_variance8x16_wmt if (xoffset == 4 && yoffset == 0) { - vp8_half_horiz_variance16x_h_sse2( + vp8_half_horiz_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum, &xxsum); } else if (xoffset == 0 && yoffset == 4) { - vp8_half_vert_variance16x_h_sse2( + vp8_half_vert_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum, &xxsum); } else if (xoffset == 4 && yoffset == 4) { - vp8_half_horiz_vert_variance16x_h_sse2( + vp8_half_horiz_vert_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum, &xxsum); 
@@ -506,81 +504,6 @@ unsigned int vp8_sub_pixel_variance8x16_wmt return (xxsum - ((xsum * xsum) >> 7)); } -unsigned int vp8_i_variance16x16_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ; - vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - - *sse = var; - return (var - ((avg * avg) >> 8)); - -} - -unsigned int vp8_i_variance8x16_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - - *sse = var; - return (var - ((avg * avg) >> 7)); - -} - - -unsigned int vp8_i_sub_pixel_variance16x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); -} - - -unsigned int vp8_i_sub_pixel_variance8x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned 
char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - - return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); -} - unsigned int vp8_variance_halfpixvar16x16_h_wmt( const unsigned char *src_ptr, @@ -589,21 +512,14 @@ unsigned int vp8_variance_halfpixvar16x16_h_wmt( int dst_pixels_per_line, unsigned int *sse) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; + int xsum0; + unsigned int xxsum0; vp8_half_horiz_variance16x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - vp8_half_horiz_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 8)); } @@ -616,21 +532,13 @@ unsigned int vp8_variance_halfpixvar16x16_v_wmt( int dst_pixels_per_line, unsigned int *sse) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - + int xsum0; + unsigned int xxsum0; vp8_half_vert_variance16x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - vp8_half_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 8)); } @@ -643,21 +551,14 @@ unsigned int vp8_variance_halfpixvar16x16_hv_wmt( int dst_pixels_per_line, unsigned int *sse) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; + int xsum0; + unsigned int xxsum0; vp8_half_horiz_vert_variance16x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 8)); } diff --git 
a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c new file mode 100644 index 000000000..eb5d486bf --- /dev/null +++ b/vp8/encoder/x86/variance_ssse3.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp8/encoder/variance.h" +#include "vp8/common/pragmas.h" +#include "vpx_ports/mem.h" + +extern unsigned int vp8_get16x16var_sse2 +( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +extern void vp8_half_horiz_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_half_horiz_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_half_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_filter_block2d_bil_var_ssse3 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int xoffset, + int yoffset, + int *sum, + unsigned int *sumsquared +); + +unsigned int vp8_sub_pixel_variance16x16_ssse3 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int 
yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0; + unsigned int xxsum0; + + // note we could avoid these if statements if the calling function + // just called the appropriate functions inside. + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_ssse3( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum0, &xxsum0); + } + + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 8)); +} + +unsigned int vp8_sub_pixel_variance16x8_ssse3 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse + +) +{ + int xsum0; + unsigned int xxsum0; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_ssse3( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum0, &xxsum0); + } + + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 
7)); +} diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h index 6bea15ebc..3560f7413 100644 --- a/vp8/encoder/x86/variance_x86.h +++ b/vp8/encoder/x86/variance_x86.h @@ -286,6 +286,8 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); #if HAVE_SSSE3 extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3); extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_ssse3); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_variance_sad16x16x3 @@ -294,6 +296,12 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); #undef vp8_variance_sad16x8x3 #define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3 +#undef vp8_variance_subpixvar16x8 +#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_ssse3 + +#undef vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3 + #endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 61c603229..5ab364147 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -176,6 +176,25 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) d->dqcoeff ); } +#if CONFIG_PSNR +#if ARCH_X86_64 +typedef void ssimpf +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +); + +extern ssimpf vp8_ssim_parms_16x16_sse3; +extern ssimpf vp8_ssim_parms_8x8_sse3; +#endif +#endif #endif @@ -280,6 +299,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2; cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2; cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2; + + /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */; cpi->rtcd.fdct.short4x4 = 
vp8_short_fdct4x4_sse2; @@ -334,11 +355,23 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_ssse3; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3; + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; +#if CONFIG_PSNR +#if ARCH_X86_64 + cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3; + cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3; +#endif +#endif + } #endif + + #if HAVE_SSE4_1 if (SSE4_1Enabled) { diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 4daadee32..ba9caa7ce 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -24,6 +24,7 @@ VP8_COMMON_SRCS-yes += common/entropymode.c VP8_COMMON_SRCS-yes += common/entropymv.c VP8_COMMON_SRCS-yes += common/extend.c VP8_COMMON_SRCS-yes += common/filter.c +VP8_COMMON_SRCS-yes += common/filter.h VP8_COMMON_SRCS-yes += common/findnearmv.c VP8_COMMON_SRCS-yes += common/generic/systemdependent.c VP8_COMMON_SRCS-yes += common/idctllm.c diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index a45a37912..2622738ec 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -199,7 +199,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, { int mb_r = (cfg->g_h + 15) / 16; int mb_c = (cfg->g_w + 15) / 16; - size_t packet_sz = vp8_firstpass_stats_sz(mb_r * mb_c); + size_t packet_sz = sizeof(FIRSTPASS_STATS); int n_packets = cfg->rc_twopass_stats_in.sz / packet_sz; FIRSTPASS_STATS *stats; diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index ed2feddae..8f0681fb9 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -110,10 +110,13 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm +VP8_CX_SRCS-$(HAVE_SSSE3) += 
encoder/x86/variance_ssse3.c +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm +VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm @@ -163,8 +163,8 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) if (!stats->buf.buf) { - fprintf(stderr, "Failed to allocate first-pass stats buffer (%d bytes)\n", - stats->buf_alloc_sz); + fprintf(stderr, "Failed to allocate first-pass stats buffer (%lu bytes)\n", + (unsigned long)stats->buf_alloc_sz); exit(EXIT_FAILURE); } |