diff options
36 files changed, 1286 insertions, 806 deletions
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh index becd95827..c2ef44a9b 100755 --- a/build/make/gen_msvs_proj.sh +++ b/build/make/gen_msvs_proj.sh @@ -33,6 +33,7 @@ Options: --proj-guid=GUID GUID to use for the project --module-def=filename File containing export definitions (for DLLs) --ver=version Version (7,8,9) of visual studio to generate for + --src-path-bare=dir Path to root of source tree -Ipath/to/include Additional include directories -DFLAG[=value] Preprocessor macros to define -Lpath/to/lib Additional library search paths @@ -191,6 +192,8 @@ for opt in "$@"; do ;; --lib) proj_kind="lib" ;; + --src-path-bare=*) src_path_bare="$optval" + ;; --static-crt) use_static_runtime=true ;; --ver=*) @@ -335,6 +338,35 @@ generate_vcproj() { case "$target" in x86*) case "$name" in + obj_int_extract) + tag Tool \ + Name="VCCLCompilerTool" \ + Optimization="0" \ + AdditionalIncludeDirectories="$incs" \ + PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \ + RuntimeLibrary="$debug_runtime" \ + WarningLevel="3" \ + Detect64BitPortabilityProblems="true" \ + DebugInformationFormat="1" \ + ;; + vpx) + tag Tool \ + Name="VCPreBuildEventTool" \ + CommandLine="call obj_int_extract.bat $src_path_bare" \ + + tag Tool \ + Name="VCCLCompilerTool" \ + Optimization="0" \ + AdditionalIncludeDirectories="$incs" \ + PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \ + RuntimeLibrary="$debug_runtime" \ + UsePrecompiledHeader="0" \ + WarningLevel="3" \ + DebugInformationFormat="1" \ + Detect64BitPortabilityProblems="true" \ + + $uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="1" + ;; *) tag Tool \ Name="VCCLCompilerTool" \ @@ -358,6 +390,12 @@ generate_vcproj() { case "$target" in x86*) case "$name" in + obj_int_extract) + tag Tool \ + Name="VCLinkerTool" \ + OutputFile="${name}.exe" \ + GenerateDebugInformation="true" \ + ;; *) tag Tool \ Name="VCLinkerTool" \ @@ -406,6 +444,34 @@ generate_vcproj() { case "$target" in x86*) case "$name" in + obj_int_extract) + tag Tool \ + Name="VCCLCompilerTool" \ + AdditionalIncludeDirectories="$incs" \ + PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \ + RuntimeLibrary="$release_runtime" \ + UsePrecompiledHeader="0" \ + WarningLevel="3" \ + Detect64BitPortabilityProblems="true" \ + DebugInformationFormat="0" \ + ;; + vpx) + tag Tool \ + Name="VCPreBuildEventTool" \ + CommandLine="call obj_int_extract.bat $src_path_bare" \ + + tag Tool \ + Name="VCCLCompilerTool" \ + AdditionalIncludeDirectories="$incs" \ + PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \ + RuntimeLibrary="$release_runtime" \ + UsePrecompiledHeader="0" \ + WarningLevel="3" \ + DebugInformationFormat="0" \ + Detect64BitPortabilityProblems="true" \ + + $uses_asm && tag Tool Name="YASM" IncludePaths="$incs" + ;; *) tag Tool \ Name="VCCLCompilerTool" \ @@ -428,6 +494,12 @@ generate_vcproj() { case "$target" in x86*) case "$name" in + obj_int_extract) + tag Tool \ + Name="VCLinkerTool" \ + OutputFile="${name}.exe" \ + GenerateDebugInformation="true" \ + ;; *) tag Tool \ Name="VCLinkerTool" \ diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c index 22c5cf2ab..01b3129d7 100644 --- a/build/make/obj_int_extract.c +++ b/build/make/obj_int_extract.c @@ -840,9 +840,18 @@ int parse_coff(unsigned __int8 *buf, size_t sz) strtab_ptr = symtab_ptr + symtab_sz * 18; if (nsections > 96) - goto bail; + { + log_msg("Too many sections\n"); + return 1; + } - sectionlist = malloc(nsections * sizeof * sectionlist); + sectionlist = malloc(nsections * sizeof(sectionlist)); + + if (sectionlist == NULL) + { + log_msg("Allocating first level of section list failed\n"); + return 1; + } //log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections); @@ -860,6 +869,12 @@ int parse_coff(unsigned __int8 *buf, size_t sz) //log_msg("COFF: Parsing section %s\n",sectionname); sectionlist[i] = malloc(strlen(sectionname) + 1); + + if (sectionlist[i] == NULL) + { + log_msg("Allocating storage for %s failed\n", sectionname); + goto bail; + } strcpy(sectionlist[i], sectionname); if (!strcmp(sectionname, ".data")) sectionrawdata_ptr = get_le32(ptr + 20); @@ -903,12 +918,14 @@ int parse_coff(unsigned __int8 *buf, size_t sz) char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; strncpy(name, ptr, 8); //log_msg("COFF: Parsing symbol %s\n",name); + /* +1 to avoid printing leading underscore */ printf("%-40s EQU ", name + 1); } else { //log_msg("COFF: Parsing symbol %s\n", // buf + strtab_ptr + get_le32(ptr+4)); + /* +1 to avoid printing leading underscore */ printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4) + 1); } diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat new file mode 100644 index 000000000..1bb865331 --- /dev/null +++ b/build/x86-msvs/obj_int_extract.bat @@ -0,0 +1,15 @@ +REM Copyright (c) 2011 The WebM project authors. All Rights Reserved. +REM +REM Use of this source code is governed by a BSD-style license +REM that can be found in the LICENSE file in the root of the source +REM tree. An additional intellectual property rights grant can be found +REM in the file PATENTS. All contributing project authors may +REM be found in the AUTHORS file in the root of the source tree. +echo on + +cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c" +cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c" +cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c" +obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm" +obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm" +obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm" @@ -9,7 +9,13 @@ ## -ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm) +# ARM assembly files are written in RVCT-style. We use some make magic to +# filter those files to allow GCC compilation +ifeq ($(ARCH_ARM),yes) + ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm) +else + ASM:=.asm +endif CODEC_SRCS-yes += libs.mk @@ -126,6 +132,23 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS) ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) +obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c + @cp $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat . + @echo " [CREATE] $@" + $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ + --exe \ + --target=$(TOOLCHAIN) \ + --name=obj_int_extract \ + --ver=$(CONFIG_VS_VERSION) \ + --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$@ $^ \ + -I. \ + -I"$(SRC_PATH_BARE)" \ + +PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.vcproj +PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat + vpx.def: $(call enabled,CODEC_EXPORTS) @echo " [CREATE] $@" $(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\ @@ -135,15 +158,16 @@ CLEAN-OBJS += vpx.def vpx.vcproj: $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" - $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\ - --lib\ - --target=$(TOOLCHAIN)\ + $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ + --lib \ + --target=$(TOOLCHAIN) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ - --name=vpx\ - --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74\ - --module-def=vpx.def\ - --ver=$(CONFIG_VS_VERSION)\ - --out=$@ $(CFLAGS) $^\ + --name=vpx \ + --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \ + --module-def=vpx.def \ + --ver=$(CONFIG_VS_VERSION) \ + --out=$@ $(CFLAGS) $^ \ + --src-path-bare="$(SRC_PATH_BARE)" \ PROJECTS-$(BUILD_LIBVPX) += vpx.vcproj @@ -207,36 +231,38 @@ endif # # Add assembler dependencies for configuration and offsets # -$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm -$(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm +$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm +$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm # # Calculate platform- and compiler-specific offsets for hand coded assembly # -ifeq ($(ARCH_ARM), yes) - asm_com_offsets.asm: obj_int_extract - asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o +ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat + ifeq ($(ARCH_ARM), yes) + asm_com_offsets.asm: obj_int_extract + asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o ./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o - CLEAN-OBJS += asm_com_offsets.asm - $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm + OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o + CLEAN-OBJS += asm_com_offsets.asm + $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm - ifeq ($(CONFIG_VP8_ENCODER), yes) - asm_enc_offsets.asm: obj_int_extract - asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o + ifeq ($(CONFIG_VP8_ENCODER), yes) + asm_enc_offsets.asm: obj_int_extract + asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o ./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o - CLEAN-OBJS += asm_enc_offsets.asm - $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm - endif - - ifeq ($(CONFIG_VP8_DECODER), yes) - asm_dec_offsets.asm: obj_int_extract - asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o + OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o + CLEAN-OBJS += asm_enc_offsets.asm + $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm + endif + + ifeq ($(CONFIG_VP8_DECODER), yes) + asm_dec_offsets.asm: obj_int_extract + asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o ./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o - CLEAN-OBJS += asm_dec_offsets.asm - $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm + OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o + CLEAN-OBJS += asm_dec_offsets.asm + $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm + endif endif endif diff --git a/solution.mk b/solution.mk index bef00883f..782150fd9 100644 --- a/solution.mk +++ b/solution.mk @@ -13,8 +13,9 @@ vpx.sln: $(wildcard *.vcproj) @echo " [CREATE] $@" $(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \ $(if $(filter %vpx.vcproj,$^),\ - $(foreach vcp,$(filter-out %vpx.vcproj,$^),\ + $(foreach vcp,$(filter-out %vpx.vcproj %obj_int_extract.vcproj,$^),\ --dep=$(vcp:.vcproj=):vpx)) \ + --dep=vpx:obj_int_extract \ --ver=$(CONFIG_VS_VERSION)\ --out=$@ $^ vpx.sln.mk: vpx.sln diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index 69e1bdff4..bd5c0759d 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -19,14 +19,6 @@ #include "vp8/common/idct.h" #include "vp8/common/onyxc_int.h" -extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x); - -extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x); - void vp8_arch_arm_common_init(VP8_COMMON *ctx) { #if CONFIG_RUNTIME_CPU_DETECT @@ -106,31 +98,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->recon.recon2 = vp8_recon2b_neon; rtcd->recon.recon4 = vp8_recon4b_neon; rtcd->recon.recon_mb = vp8_recon_mb_neon; - + rtcd->recon.build_intra_predictors_mby = + vp8_build_intra_predictors_mby_neon; + rtcd->recon.build_intra_predictors_mby_s = + vp8_build_intra_predictors_mby_s_neon; } #endif #endif - -#if HAVE_ARMV6 -#if CONFIG_RUNTIME_CPU_DETECT - if (has_media) -#endif - { - vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby; - vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s; - } -#endif - -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (has_neon) -#endif - { - vp8_build_intra_predictors_mby_ptr = - vp8_build_intra_predictors_mby_neon; - vp8_build_intra_predictors_mby_s_ptr = - vp8_build_intra_predictors_mby_s_neon; - } -#endif } diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h index b46b7fc7d..377cb2a07 100644 --- a/vp8/common/arm/recon_arm.h +++ b/vp8/common/arm/recon_arm.h @@ -53,6 +53,9 @@ extern prototype_copy_block(vp8_copy_mem16x16_neon); extern prototype_recon_macroblock(vp8_recon_mb_neon); +extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon); +extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon); + #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_recon_recon #define vp8_recon_recon vp8_recon_b_neon @@ -74,6 +77,13 @@ extern prototype_recon_macroblock(vp8_recon_mb_neon); #undef vp8_recon_recon_mb #define vp8_recon_recon_mb vp8_recon_mb_neon + +#undef vp8_recon_build_intra_predictors_mby +#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon + +#undef vp8_recon_build_intra_predictors_mby_s +#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon + #endif #endif diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index c843d86fe..5c6464772 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -20,12 +20,6 @@ extern void vp8_arch_x86_common_init(VP8_COMMON *ctx); extern void vp8_arch_arm_common_init(VP8_COMMON *ctx); -void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x); - -void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); - void vp8_machine_specific_config(VP8_COMMON *ctx) { #if CONFIG_RUNTIME_CPU_DETECT @@ -45,6 +39,10 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) rtcd->recon.recon4 = vp8_recon4b_c; rtcd->recon.recon_mb = vp8_recon_mb_c; rtcd->recon.recon_mby = vp8_recon_mby_c; + rtcd->recon.build_intra_predictors_mby = + vp8_build_intra_predictors_mby; + rtcd->recon.build_intra_predictors_mby_s = + vp8_build_intra_predictors_mby_s; rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c; rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c; @@ -75,9 +73,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) #endif #endif - /* Pure C: */ - vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby; - vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s; #if ARCH_X86 || ARCH_X86_64 vp8_arch_x86_common_init(ctx); diff --git a/vp8/common/recon.h b/vp8/common/recon.h index e7df90a71..e608f218c 100644 --- a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -23,6 +23,9 @@ #define prototype_recon_macroblock(sym) \ void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x) +#define prototype_build_intra_predictors(sym) \ + void sym(MACROBLOCKD *x) + struct vp8_recon_rtcd_vtable; #if ARCH_X86 || ARCH_X86_64 @@ -73,9 +76,23 @@ extern prototype_recon_macroblock(vp8_recon_recon_mb); #endif extern prototype_recon_macroblock(vp8_recon_recon_mby); +#ifndef vp8_recon_build_intra_predictors_mby +#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby +#endif +extern prototype_build_intra_predictors\ + (vp8_recon_build_intra_predictors_mby); + +#ifndef vp8_recon_build_intra_predictors_mby_s +#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s +#endif +extern prototype_build_intra_predictors\ + (vp8_recon_build_intra_predictors_mby_s); + + typedef prototype_copy_block((*vp8_copy_block_fn_t)); typedef prototype_recon_block((*vp8_recon_fn_t)); typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t)); +typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t)); typedef struct vp8_recon_rtcd_vtable { vp8_copy_block_fn_t copy16x16; @@ -86,6 +103,8 @@ typedef struct vp8_recon_rtcd_vtable vp8_recon_fn_t recon4; vp8_recon_mb_fn_t recon_mb; vp8_recon_mb_fn_t recon_mby; + vp8_build_intra_pred_fn_t build_intra_predictors_mby_s; + vp8_build_intra_pred_fn_t build_intra_predictors_mby; } vp8_recon_rtcd_vtable_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h index 988b43a77..4025a5307 100644 --- a/vp8/common/reconintra.h +++ b/vp8/common/reconintra.h @@ -14,13 +14,6 @@ extern void init_intra_left_above_pixels(MACROBLOCKD *x); -extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x); -extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x); - extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x); extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x); diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index c454bbc70..3d4d9b961 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -115,8 +115,8 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) { vp8_build_intra_predictors_mbuv_s(xd); - vp8_build_intra_predictors_mby_s_ptr(xd); - + RECON_INVOKE(&pbi->common.rtcd.recon, + build_intra_predictors_mby_s)(xd); } else { @@ -214,7 +214,8 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) if (xd->mode_info_context->mbmi.mode != B_PRED) { - vp8_build_intra_predictors_mby_ptr(xd); + RECON_INVOKE(&pbi->common.rtcd.recon, + build_intra_predictors_mby)(xd); } else { vp8_intra_prediction_down_copy(xd); } diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index afd43042d..5ba14f375 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -71,8 +71,8 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi) cpi->rtcd.encodemb.submby = vp8_subtract_mby_c; cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;*/ - /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; - cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;*/ + /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/ + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6; } #endif diff --git a/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm new file mode 100644 index 000000000..ae2f6030d --- /dev/null +++ b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm @@ -0,0 +1,224 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_fast_quantize_b_armv6| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 BLOCK *b +; r1 BLOCKD *d +|vp8_fast_quantize_b_armv6| PROC + stmfd sp!, {r1, r4-r11, lr} + + ldr r3, [r0, #vp8_block_coeff] ; coeff + ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast + ldr r5, [r0, #vp8_block_round] ; round + ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff + ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff + ldr r8, [r1, #vp8_blockd_dequant] ; dequant + + ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction + ; is used to update the counter so that + ; it can be used to mark nonzero + ; quantized coefficient pairs. + + mov r1, #0 ; flags for quantized coeffs + + ; PART 1: quantization and dequantization loop +loop + ldr r9, [r3], #4 ; [z1 | z0] + ldr r10, [r5], #4 ; [r1 | r0] + ldr r11, [r4], #4 ; [q1 | q0] + + ssat16 lr, #1, r9 ; [sz1 | sz0] + eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0] + ssub16 r9, r9, lr ; x = (z ^ sz) - sz + sadd16 r9, r9, r10 ; [x1+r1 | x0+r0] + + ldr r12, [r3], #4 ; [z3 | z2] + + smulbb r0, r9, r11 ; [(x0+r0)*q0] + smultt r9, r9, r11 ; [(x1+r1)*q1] + + ldr r10, [r5], #4 ; [r3 | r2] + + ssat16 r11, #1, r12 ; [sz3 | sz2] + eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2] + pkhtb r0, r9, r0, asr #16 ; [y1 | y0] + ldr r9, [r4], #4 ; [q3 | q2] + ssub16 r12, r12, r11 ; x = (z ^ sz) - sz + + sadd16 r12, r12, r10 ; [x3+r3 | x2+r2] + + eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)] + + smulbb r10, r12, r9 ; [(x2+r2)*q2] + smultt r12, r12, r9 ; [(x3+r3)*q3] + + ssub16 r0, r0, lr ; x = (y ^ sz) - sz + + cmp r0, #0 ; check if zero + orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs + + str r0, [r6], #4 ; *qcoeff++ = x + ldr r9, [r8], #4 ; [dq1 | dq0] + + pkhtb r10, r12, r10, asr #16 ; [y3 | y2] + eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)] + ssub16 r10, r10, r11 ; x = (y ^ sz) - sz + + cmp r10, #0 ; check if zero + orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs + + str r10, [r6], #4 ; *qcoeff++ = x + ldr r11, [r8], #4 ; [dq3 | dq2] + + smulbb r12, r0, r9 ; [x0*dq0] + smultt r0, r0, r9 ; [x1*dq1] + + smulbb r9, r10, r11 ; [x2*dq2] + smultt r10, r10, r11 ; [x3*dq3] + + lsls r2, r2, #2 ; update loop counter + strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0] + strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1] + strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2] + strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3] + add r7, r7, #8 ; dqcoeff += 8 + bne loop + + ; PART 2: check position for eob... + mov lr, #0 ; init eob + cmp r1, #0 ; coeffs after quantization? + ldr r11, [sp, #0] ; restore BLOCKD pointer + beq end ; skip eob calculations if all zero + + ldr r0, [r11, #vp8_blockd_qcoeff] + + ; check shortcut for nonzero qcoeffs + tst r1, #0x80 + bne quant_coeff_15_14 + tst r1, #0x20 + bne quant_coeff_13_11 + tst r1, #0x8 + bne quant_coeff_12_7 + tst r1, #0x40 + bne quant_coeff_10_9 + tst r1, #0x10 + bne quant_coeff_8_3 + tst r1, #0x2 + bne quant_coeff_6_5 + tst r1, #0x4 + bne quant_coeff_4_2 + b quant_coeff_1_0 + +quant_coeff_15_14 + ldrh r2, [r0, #30] ; rc=15, i=15 + mov lr, #16 + cmp r2, #0 + bne end + + ldrh r3, [r0, #28] ; rc=14, i=14 + mov lr, #15 + cmp r3, #0 + bne end + +quant_coeff_13_11 + ldrh r2, [r0, #22] ; rc=11, i=13 + mov lr, #14 + cmp r2, #0 + bne end + +quant_coeff_12_7 + ldrh r3, [r0, #14] ; rc=7, i=12 + mov lr, #13 + cmp r3, #0 + bne end + + ldrh r2, [r0, #20] ; rc=10, i=11 + mov lr, #12 + cmp r2, #0 + bne end + +quant_coeff_10_9 + ldrh r3, [r0, #26] ; rc=13, i=10 + mov lr, #11 + cmp r3, #0 + bne end + + ldrh r2, [r0, #24] ; rc=12, i=9 + mov lr, #10 + cmp r2, #0 + bne end + +quant_coeff_8_3 + ldrh r3, [r0, #18] ; rc=9, i=8 + mov lr, #9 + cmp r3, #0 + bne end + + ldrh r2, [r0, #12] ; rc=6, i=7 + mov lr, #8 + cmp r2, #0 + bne end + +quant_coeff_6_5 + ldrh r3, [r0, #6] ; rc=3, i=6 + mov lr, #7 + cmp r3, #0 + bne end + + ldrh r2, [r0, #4] ; rc=2, i=5 + mov lr, #6 + cmp r2, #0 + bne end + +quant_coeff_4_2 + ldrh r3, [r0, #10] ; rc=5, i=4 + mov lr, #5 + cmp r3, #0 + bne end + + ldrh r2, [r0, #16] ; rc=8, i=3 + mov lr, #4 + cmp r2, #0 + bne end + + ldrh r3, [r0, #8] ; rc=4, i=2 + mov lr, #3 + cmp r3, #0 + bne end + +quant_coeff_1_0 + ldrh r2, [r0, #2] ; rc=1, i=1 + mov lr, #2 + cmp r2, #0 + bne end + + mov lr, #1 ; rc=0, i=0 + +end + str lr, [r11, #vp8_blockd_eob] + ldmfd sp!, {r1, r4-r11, pc} + + ENDP + +loop_count + DCD 0x1000000 + + END + diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h index 5f9155eb1..0c6adf4c2 100644 --- a/vp8/encoder/arm/quantize_arm.h +++ b/vp8/encoder/arm/quantize_arm.h @@ -12,6 +12,16 @@ #ifndef QUANTIZE_ARM_H #define QUANTIZE_ARM_H +#if HAVE_ARMV6 + +extern prototype_quantize_block(vp8_fast_quantize_b_armv6); + +#undef vp8_quantize_fastquantb +#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6 + +#endif /* HAVE_ARMV6 */ + + #if HAVE_ARMV7 extern prototype_quantize_block(vp8_fast_quantize_b_neon); diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c index cd4953227..fcf77756a 100644 --- a/vp8/encoder/asm_enc_offsets.c +++ b/vp8/encoder/asm_enc_offsets.c @@ -65,6 +65,17 @@ DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST)); DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows)); +// offsets from BLOCK structure +DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff)); +DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast)); +DEFINE(vp8_block_round, offsetof(BLOCK, round)); + +// offsets from BLOCKD structure +DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff)); +DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff)); +DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant)); +DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob)); + // These two sizes are used in vp8cx_pack_tokens. They are hard coded // so if the size changes this will have to be adjusted. #if HAVE_ARMV5TE diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 0ced6e7b0..0613b9070 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -1184,7 +1184,8 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) int distortion2; x->e_mbd.mode_info_context->mbmi.mode = mode; - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); rate2 = x->mbmode_cost[x->e_mbd.frame_type][mode]; this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index cd66016cc..7b81c8d95 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -80,7 +80,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { int b; - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd); ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride); diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 12d5f66d3..5c607a0cb 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -24,6 +24,35 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x); extern void vp8_build_block_offsets(MACROBLOCK *x); extern void vp8_setup_block_ptrs(MACROBLOCK *x); +#if CONFIG_MULTITHREAD + +extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm); + +static THREAD_FUNCTION loopfilter_thread(void *p_data) +{ + VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1); + VP8_COMMON *cm = &cpi->common; + + while (1) + { + if (cpi->b_multi_threaded == 0) + break; + + if (sem_wait(&cpi->h_event_start_lpf) == 0) + { + if (cpi->b_multi_threaded == FALSE) // we're shutting down + break; + + loopfilter_frame(cpi, cm); + + sem_post(&cpi->h_event_end_lpf); + } + } + + return 0; +} +#endif + static THREAD_FUNCTION thread_encoding_proc(void *p_data) { @@ -479,6 +508,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi) pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd); } + { + LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data; + + sem_init(&cpi->h_event_start_lpf, 0, 0); + sem_init(&cpi->h_event_end_lpf, 0, 0); + + lpfthd->ptr1 = (void *)cpi; + pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd); + } } } @@ -500,9 +538,14 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) sem_destroy(&cpi->h_event_start_encoding[i]); } + + sem_post(&cpi->h_event_start_lpf); + pthread_join(cpi->h_filter_thread, 0); } sem_destroy(&cpi->h_event_end_encoding); + sem_destroy(&cpi->h_event_end_lpf); + sem_destroy(&cpi->h_event_start_lpf); //free thread related resources vpx_free(cpi->h_event_start_encoding); diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index d2cc8482e..774d9b6b5 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -281,21 +281,6 @@ int frame_max_bits(VP8_COMP *cpi) } -extern size_t vp8_firstpass_stats_sz(unsigned int mb_count) -{ - /* Calculate the size of a stats packet, which is dependent on the frame - * resolution. The FIRSTPASS_STATS struct has a single element array, - * motion_map, which is virtually expanded to have one element per - * macroblock. - */ - size_t stats_sz; - - stats_sz = sizeof(FIRSTPASS_STATS) + mb_count; - stats_sz = (stats_sz + 7) & ~7; - return stats_sz; -} - - void vp8_output_stats(const VP8_COMP *cpi, struct vpx_codec_pkt_list *pktlist, FIRSTPASS_STATS *stats) @@ -303,18 +288,19 @@ void vp8_output_stats(const VP8_COMP *cpi, struct vpx_codec_cx_pkt pkt; pkt.kind = VPX_CODEC_STATS_PKT; pkt.data.twopass_stats.buf = stats; - pkt.data.twopass_stats.sz = vp8_firstpass_stats_sz(cpi->common.MBs); + pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); vpx_codec_pkt_list_add(pktlist, &pkt); // TEMP debug code #if OUTPUT_FPF + { FILE *fpfile; fpfile = fopen("firstpass.stt", "a"); fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f" - " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f" - " %12.4f\n", + " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f" + " %12.0f %12.4f\n", stats->frame, stats->intra_error, stats->coded_error, @@ -333,24 +319,17 @@ void vp8_output_stats(const VP8_COMP *cpi, stats->count, stats->duration); fclose(fpfile); - - - fpfile = fopen("fpmotionmap.stt", "a"); - if(fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, fpfile)); - fclose(fpfile); } #endif } int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps) { - size_t stats_sz = vp8_firstpass_stats_sz(cpi->common.MBs); - if (cpi->stats_in >= cpi->stats_in_end) return EOF; *fps = *cpi->stats_in; - cpi->stats_in = (void*)((char *)cpi->stats_in + stats_sz); + cpi->stats_in = (void*)((char *)cpi->stats_in + sizeof(FIRSTPASS_STATS)); return 1; } @@ -416,57 +395,9 @@ void vp8_avg_stats(FIRSTPASS_STATS *section) section->duration /= section->count; } -unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi) -{ - return cpi->fp_motion_map_stats; -} -void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos) -{ - cpi->fp_motion_map_stats = target_pos; -} - -void vp8_advance_fpmm(VP8_COMP *cpi, int count) -{ - cpi->fp_motion_map_stats = (void*)((char*)cpi->fp_motion_map_stats + - count * vp8_firstpass_stats_sz(cpi->common.MBs)); -} - -void vp8_input_fpmm(VP8_COMP *cpi) -{ - unsigned char *fpmm = cpi->fp_motion_map; - int MBs = cpi->common.MBs; - int max_frames = cpi->active_arnr_frames; - int i; - - for (i=0; i<max_frames; i++) - { - char *motion_map = (char*)cpi->fp_motion_map_stats - + sizeof(FIRSTPASS_STATS); - - memcpy(fpmm, motion_map, MBs); - fpmm += MBs; - vp8_advance_fpmm(cpi, 1); - } - - // Flag the use of weights in the temporal filter - cpi->use_weighted_temporal_filter = 1; -} - void vp8_init_first_pass(VP8_COMP *cpi) { vp8_zero_stats(cpi->total_stats); - -// TEMP debug code -#ifdef OUTPUT_FPF - { - FILE *fpfile; - fpfile = fopen("firstpass.stt", "w"); - fclose(fpfile); - fpfile = fopen("fpmotionmap.stt", "wb"); - fclose(fpfile); - } -#endif - } void vp8_end_first_pass(VP8_COMP *cpi) @@ -583,8 +514,6 @@ void vp8_first_pass(VP8_COMP *cpi) MV zero_ref_mv = {0, 0}; - unsigned char *fp_motion_map_ptr = cpi->fp_motion_map; - vp8_clear_system_state(); //__asm emms; x->src = * cpi->Source; @@ -636,7 +565,6 @@ void vp8_first_pass(VP8_COMP *cpi) for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { int this_error; - int zero_error; int zz_to_best_ratio; int gf_motion_error = INT_MAX; int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); @@ -658,9 +586,6 @@ void vp8_first_pass(VP8_COMP *cpi) // Cumulative intra error total intra_error += (long long)this_error; - // Indicate default assumption of intra in the motion map - *fp_motion_map_ptr = 0; - // Set up limit values for motion vectors to prevent them extending outside the UMV borders x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); @@ -679,9 +604,6 @@ void vp8_first_pass(VP8_COMP *cpi) d->bmi.mv.as_mv.row = 0; d->bmi.mv.as_mv.col = 0; - // Save (0,0) error for later use - zero_error = motion_error; - // Test last reference frame using the previous best mv as the // starting point (best reference) for the search vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, @@ -796,25 +718,6 @@ void vp8_first_pass(VP8_COMP *cpi) else if (d->bmi.mv.as_mv.col < 0) sum_in_vectors--; } - - // Compute how close (0,0) predictor is to best - // predictor in terms of their prediction error - zz_to_best_ratio = (10*zero_error + this_error/2) - / (this_error+!this_error); - - if ((zero_error < 50000) && - (zz_to_best_ratio <= 11) ) - *fp_motion_map_ptr = 1; - else - *fp_motion_map_ptr = 0; - } - else - { - // 0,0 mv was best - if( zero_error<50000 ) - *fp_motion_map_ptr = 2; - else - *fp_motion_map_ptr = 1; } } } @@ -828,9 +731,6 @@ void vp8_first_pass(VP8_COMP *cpi) recon_yoffset += 16; recon_uvoffset += 8; - - // Update the motion map - fp_motion_map_ptr++; } // adjust to the next row of mbs @@ -892,13 +792,10 @@ void vp8_first_pass(VP8_COMP *cpi) // than the full time between subsequent cpi->source_time_stamp s . fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp; - // don't want to do outputstats with a stack variable! + // don't want to do output stats with a stack variable! memcpy(cpi->this_frame_stats, &fps, sizeof(FIRSTPASS_STATS)); - memcpy((char*)cpi->this_frame_stats + sizeof(FIRSTPASS_STATS), - cpi->fp_motion_map, - sizeof(cpi->fp_motion_map[0]) * cpi->common.MBs); vp8_output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats); vp8_accumulate_stats(cpi->total_stats, &fps); } @@ -944,10 +841,10 @@ void vp8_first_pass(VP8_COMP *cpi) extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; #define BASE_ERRPERMB 150 -static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width) +static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; double err_per_mb = section_err / num_mbs; @@ -1044,10 +941,10 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_ return Q; } -static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width) +static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; double err_per_mb = section_err / num_mbs; @@ -1095,10 +992,10 @@ static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_band } // Estimate a worst case Q for a KF group -static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio) +static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs; int bits_per_mb_at_this_q; @@ -1193,11 +1090,10 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta // For cq mode estimate a cq level that matches the observed // complexity and data rate. -static int estimate_cq(VP8_COMP *cpi, double section_err, - int section_target_bandwitdh, int Height, int Width) +static int estimate_cq(VP8_COMP *cpi, double section_err, int section_target_bandwitdh) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; double err_per_mb = section_err / num_mbs; @@ -1351,8 +1247,6 @@ void vp8_init_second_pass(VP8_COMP *cpi) cpi->clip_bpe = cpi->bits_left / DOUBLE_DIVIDE_CHECK(cpi->modified_error_total); cpi->observed_bpe = cpi->clip_bpe; - - cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in; } void vp8_end_second_pass(VP8_COMP *cpi) @@ -1360,7 +1254,7 @@ void vp8_end_second_pass(VP8_COMP *cpi) } // This function gives and estimate of how badly we believe -// the predicition quality is decaying from frame to frame. +// the prediction quality is decaying from frame to frame. double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) { double prediction_decay_rate; @@ -1472,8 +1366,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) int max_bits = frame_max_bits(cpi); // Max for a single frame - unsigned char *fpmm_pos; - unsigned int allow_alt_ref = cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames; @@ -1482,8 +1374,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) vp8_clear_system_state(); //__asm emms; - fpmm_pos = vp8_fpmm_get_pos(cpi); - start_pos = cpi->stats_in; vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean @@ -1717,7 +1607,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks)); // Estimate if there are enough bits available to make worthwhile use of an arf. - tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width); + tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits); // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames. if (tmp_q < cpi->worst_quality) @@ -1780,20 +1670,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) } cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd; - - { - // Advance to & read in the motion map for those frames - // to be considered for filtering based on the position - // of the ARF - vp8_fpmm_reset_pos(cpi, cpi->fp_motion_map_stats_save); - - // Position at the 'earliest' frame to be filtered - vp8_advance_fpmm(cpi, - cpi->baseline_gf_interval - frames_bwd); - - // Read / create a motion map for the region of interest - vp8_input_fpmm(cpi); - } } else { @@ -2023,9 +1899,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) reset_fpf_position(cpi, start_pos); } - - // Reset the First pass motion map file position - vp8_fpmm_reset_pos(cpi, fpmm_pos); } // Allocate bits to a normal frame that is neither a gf an arf or a key frame. @@ -2107,13 +1980,6 @@ void vp8_second_pass(VP8_COMP *cpi) if (EOF == vp8_input_stats(cpi, &this_frame)) return; - vpx_memset(cpi->fp_motion_map, 0, - cpi->oxcf.arnr_max_frames*cpi->common.MBs); - cpi->fp_motion_map_stats_save = vp8_fpmm_get_pos(cpi); - - // Step over this frame's first pass motion map - vp8_advance_fpmm(cpi, 1); - this_frame_error = this_frame.ssim_weighted_pred_err; this_frame_intra_error = this_frame.intra_error; this_frame_coded_error = this_frame.coded_error; @@ -2245,8 +2111,7 @@ void vp8_second_pass(VP8_COMP *cpi) est_cq = estimate_cq( cpi, (cpi->total_coded_error_left / frames_left), - (int)(cpi->bits_left / frames_left), - cpi->common.Height, cpi->common.Width); + (int)(cpi->bits_left / frames_left)); cpi->cq_target_quality = cpi->oxcf.cq_level; if ( est_cq > cpi->cq_target_quality ) @@ -2258,9 +2123,7 @@ void vp8_second_pass(VP8_COMP *cpi) cpi->maxq_min_limit = cpi->best_quality; tmp_q = estimate_max_q( cpi, (cpi->total_coded_error_left / frames_left), - (int)(cpi->bits_left / frames_left), - cpi->common.Height, - cpi->common.Width); + (int)(cpi->bits_left / frames_left)); // Limit the maxq value returned subsequently. // This increases the risk of overspend or underspend if the initial @@ -2288,7 +2151,7 @@ void vp8_second_pass(VP8_COMP *cpi) if (frames_left < 1) frames_left = 1; - tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width); + tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left)); // Move active_worst_quality but in a damped way if (tmp_q > cpi->active_worst_quality) @@ -2897,7 +2760,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); // Work out if spatial resampling is necessary - kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio); + kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, group_iiratio); // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section projected_bits_perframe = bits_per_frame; @@ -2968,7 +2831,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0; // Now try again and see what Q we get with the smaller image size - kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio); + kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, group_iiratio); if (0) { diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index fc0580d55..81108fe96 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -103,6 +103,10 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) // Pure C: vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame; +#if CONFIG_PSNR + cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c; + cpi->rtcd.variance.ssimpf = ssim_parms_c; +#endif #if ARCH_X86 || ARCH_X86_64 vp8_arch_x86_encoder_init(cpi); diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 33aaa2ca9..c210c1de2 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1415,7 +1415,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er int col_min = ref_col - distance; int col_max = ref_col + distance; - unsigned short sad_array8[8]; + DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8); unsigned int sad_array[3]; // Work out the mid point for the search diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 797e18b30..8965634fe 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -86,9 +86,11 @@ extern double vp8_calc_ssim YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int lumamask, - double *weight + double *weight, + const vp8_variance_rtcd_vtable_t *rtcd ); + extern double vp8_calc_ssimg ( YV12_BUFFER_CONFIG *source, @@ -281,12 +283,6 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi) vpx_free(cpi->active_map); cpi->active_map = 0; -#if !(CONFIG_REALTIME_ONLY) - // Delete first pass motion map - vpx_free(cpi->fp_motion_map); - cpi->fp_motion_map = 0; -#endif - vp8_de_alloc_frame_buffers(&cpi->common); vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf); @@ -1360,11 +1356,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) #if !(CONFIG_REALTIME_ONLY) vpx_free(cpi->total_stats); - cpi->total_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs)); + cpi->total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); vpx_free(cpi->this_frame_stats); - cpi->this_frame_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs)); + cpi->this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); if(!cpi->total_stats || !cpi->this_frame_stats) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, @@ -1462,8 +1458,7 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; - if (!cpi) - return; + cpi->oxcf = *oxcf; cpi->auto_gold = 1; cpi->auto_adjust_gold_quantizer = 1; @@ -1475,50 +1470,15 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->version = oxcf->Version; vp8_setup_version(cm); - if (oxcf == 0) - { - cpi->pass = 0; - - cpi->auto_worst_q = 0; - cpi->oxcf.best_allowed_q = MINQ; - cpi->oxcf.worst_allowed_q = MAXQ; - cpi->oxcf.cq_level = MINQ; - - cpi->oxcf.end_usage = USAGE_STREAM_FROM_SERVER; - cpi->oxcf.starting_buffer_level = 4000; - cpi->oxcf.optimal_buffer_level = 5000; - cpi->oxcf.maximum_buffer_size = 6000; - cpi->oxcf.under_shoot_pct = 90; - cpi->oxcf.allow_df = 0; - cpi->oxcf.drop_frames_water_mark = 20; - - cpi->oxcf.allow_spatial_resampling = 0; - cpi->oxcf.resample_down_water_mark = 40; - cpi->oxcf.resample_up_water_mark = 60; - - cpi->oxcf.fixed_q = cpi->interquantizer; - - cpi->filter_type = NORMAL_LOOPFILTER; - - if (cm->simpler_lpf) - cpi->filter_type = SIMPLE_LOOPFILTER; - - cpi->compressor_speed = 1; - cpi->horiz_scale = 0; - cpi->vert_scale = 0; - cpi->oxcf.two_pass_vbrbias = 50; - cpi->oxcf.two_pass_vbrmax_section = 400; - cpi->oxcf.two_pass_vbrmin_section = 0; - - cpi->oxcf.Sharpness = 0; - cpi->oxcf.noise_sensitivity = 0; - } - else - cpi->oxcf = *oxcf; + // change includes all joint functionality + vp8_change_config(ptr, oxcf); + // Initialize active best and worst q and average q values. + cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; - // Convert target bandwidth from Kbit/s to Bit/s - cpi->oxcf.target_bandwidth *= 1000; + // Initialise the starting buffer levels cpi->oxcf.starting_buffer_level = rescale(cpi->oxcf.starting_buffer_level, cpi->oxcf.target_bandwidth, 1000); @@ -1526,10 +1486,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->buffer_level = cpi->oxcf.starting_buffer_level; cpi->bits_off_target = cpi->oxcf.starting_buffer_level; - cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; - cpi->active_best_quality = cpi->oxcf.best_allowed_q; - cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; - cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; @@ -1538,13 +1494,7 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->total_actual_bits = 0; cpi->total_target_vs_actual = 0; - // change includes all joint functionality - vp8_change_config(ptr, oxcf); - #if VP8_TEMPORAL_ALT_REF - - cpi->use_weighted_temporal_filter = 0; - { int i; @@ -1668,7 +1618,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) } - cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; + cpi->baseline_gf_interval = + cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG; @@ -1679,7 +1630,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->refresh_entropy_probs = 1; if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3) - cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions; + cm->multi_token_partition = + (TOKEN_PARTITION) cpi->oxcf.token_partitions; setup_features(cpi); @@ -1700,12 +1652,12 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->oxcf.starting_buffer_level = 60000; cpi->oxcf.optimal_buffer_level = 60000; cpi->oxcf.maximum_buffer_size = 240000; - } // Convert target bandwidth from Kbit/s to Bit/s cpi->oxcf.target_bandwidth *= 1000; + // Set or reset optimal and maximum buffer levels. if (cpi->oxcf.optimal_buffer_level == 0) cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; else @@ -1720,7 +1672,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) rescale(cpi->oxcf.maximum_buffer_size, cpi->oxcf.target_bandwidth, 1000); + // Set up frame rate and related parameters rate control values. vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); + + // Set absolute upper and lower quality limits cpi->worst_quality = cpi->oxcf.worst_allowed_q; cpi->best_quality = cpi->oxcf.best_allowed_q; @@ -1749,9 +1704,9 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->cq_target_quality = cpi->oxcf.cq_level; // Only allow dropped frames in buffered mode - cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; + cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; - cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; + cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; if (!cm->use_bilinear_mc_filter) cm->mcomp_filter_type = SIXTAP; @@ -1766,7 +1721,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->horiz_scale = cpi->horiz_scale; cm->vert_scale = cpi->vert_scale ; - cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8 + // As per VP8 + cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) if (cpi->oxcf.Sharpness > 7) @@ -1787,8 +1743,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; } - if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width || - ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height || + if (((cm->Width + 15) & 0xfffffff0) != + cm->yv12_fb[cm->lst_fb_idx].y_width || + ((cm->Height + 15) & 0xfffffff0) != + cm->yv12_fb[cm->lst_fb_idx].y_height || cm->yv12_fb[cm->lst_fb_idx].y_width == 0) { alloc_raw_frame_buffers(cpi); @@ -1927,12 +1885,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols)); cpi->active_map_enabled = 0; -#if !(CONFIG_REALTIME_ONLY) - // Create the first pass motion map structure and set to 0 - // Allocate space for maximum of 15 buffers - CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(15*cpi->common.MBs, 1)); -#endif - #if 0 // Experimental code for lagged and one pass // Initialise one_pass GF frames stats @@ -2082,7 +2034,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) } else if (cpi->pass == 2) { - size_t packet_sz = vp8_firstpass_stats_sz(cpi->common.MBs); + size_t packet_sz = sizeof(FIRSTPASS_STATS); int packets = oxcf->two_pass_stats_in.sz / packet_sz; cpi->stats_in = oxcf->two_pass_stats_in.buf; @@ -3283,6 +3235,89 @@ static BOOL recode_loop_test( VP8_COMP *cpi, return force_recode; } +void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) +{ + if (cm->no_lpf) + { + cm->filter_level = 0; + } + else + { + struct vpx_usec_timer timer; + + vp8_clear_system_state(); + + vpx_usec_timer_start(&timer); + if (cpi->sf.auto_filter == 0) + vp8cx_pick_filter_level_fast(cpi->Source, cpi); + + else + vp8cx_pick_filter_level(cpi->Source, cpi); + + vpx_usec_timer_mark(&timer); + cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); + } + +#if CONFIG_MULTITHREAD + sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ +#endif + + if (cm->filter_level > 0) + { + vp8cx_set_alt_lf_level(cpi, cm->filter_level); + vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level); + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + } + + vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show); + + { + YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx]; + YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; + YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx]; + YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx]; + // At this point the new frame has been encoded. + // If any buffer copy / swapping is signaled it should be done here. + if (cm->frame_type == KEY_FRAME) + { + vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12); + vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12); + } + else // For non key frames + { + // Code to copy between reference buffers + if (cm->copy_buffer_to_arf) + { + if (cm->copy_buffer_to_arf == 1) + { + if (cm->refresh_last_frame) + // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. + vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12); + else + vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12); + } + else if (cm->copy_buffer_to_arf == 2) + vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12); + } + + if (cm->copy_buffer_to_gf) + { + if (cm->copy_buffer_to_gf == 1) + { + if (cm->refresh_last_frame) + // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. + vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12); + else + vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12); + } + else if (cm->copy_buffer_to_gf == 2) + vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12); + } + } + } +} + static void encode_frame_to_data_rate ( VP8_COMP *cpi, @@ -3637,11 +3672,12 @@ static void encode_frame_to_data_rate } } - // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames - // to prevent bits just going to waste. + // If CBR and the buffer is as full then it is reasonable to allow + // higher quality on the frames to prevent bits just going to waste. if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - // Note that the use of >= here elliminates the risk of a devide by 0 error in the else if clause + // Note that the use of >= here elliminates the risk of a devide + // by 0 error in the else if clause if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) cpi->active_best_quality = cpi->best_quality; @@ -3654,6 +3690,20 @@ static void encode_frame_to_data_rate } } } + // Make sure constrained quality mode limits are adhered to for the first + // few frames of one pass encodes + else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) + { + if ( (cm->frame_type == KEY_FRAME) || + cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame ) + { + cpi->active_best_quality = cpi->best_quality; + } + else if (cpi->active_best_quality < cpi->cq_target_quality) + { + cpi->active_best_quality = cpi->cq_target_quality; + } + } // Clip the active best and worst quality values to limits if (cpi->active_worst_quality > cpi->worst_quality) @@ -3833,8 +3883,8 @@ static void encode_frame_to_data_rate vp8_setup_key_frame(cpi); // transform / motion compensation build reconstruction frame - vp8_encode_frame(cpi); + cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi); cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0; @@ -4183,92 +4233,43 @@ static void encode_frame_to_data_rate else cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; - if (cm->no_lpf) + +#if CONFIG_MULTITHREAD + if (cpi->b_multi_threaded) { - cm->filter_level = 0; + sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */ } else +#endif { - struct vpx_usec_timer timer; - - vpx_usec_timer_start(&timer); - - if (cpi->sf.auto_filter == 0) - vp8cx_pick_filter_level_fast(cpi->Source, cpi); - else - vp8cx_pick_filter_level(cpi->Source, cpi); - - vpx_usec_timer_mark(&timer); - - cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); - } - - if (cm->filter_level > 0) - { - vp8cx_set_alt_lf_level(cpi, cm->filter_level); - vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level); - cm->last_filter_type = cm->filter_type; - cm->last_sharpness_level = cm->sharpness_level; + loopfilter_frame(cpi, cm); } - /* Move storing frame_type out of the above loop since it is also - * needed in motion search besides loopfilter */ - cm->last_frame_type = cm->frame_type; - - vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show); - if (cpi->oxcf.error_resilient_mode == 1) { cm->refresh_entropy_probs = 0; } +#if CONFIG_MULTITHREAD + /* wait that filter_level is picked so that we can continue with stream packing */ + if (cpi->b_multi_threaded) + sem_wait(&cpi->h_event_end_lpf); +#endif + // build the bitstream vp8_pack_bitstream(cpi, dest, size); +#if CONFIG_MULTITHREAD + /* wait for loopfilter thread done */ + if (cpi->b_multi_threaded) { - YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx]; - YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; - YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx]; - YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx]; - // At this point the new frame has been encoded coded. - // If any buffer copy / swaping is signalled it should be done here. - if (cm->frame_type == KEY_FRAME) - { - vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12); - vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12); - } - else // For non key frames - { - // Code to copy between reference buffers - if (cm->copy_buffer_to_arf) - { - if (cm->copy_buffer_to_arf == 1) - { - if (cm->refresh_last_frame) - // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. - vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12); - else - vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12); - } - else if (cm->copy_buffer_to_arf == 2) - vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12); - } - - if (cm->copy_buffer_to_gf) - { - if (cm->copy_buffer_to_gf == 1) - { - if (cm->refresh_last_frame) - // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. - vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12); - else - vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12); - } - else if (cm->copy_buffer_to_gf == 2) - vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12); - } - } + sem_wait(&cpi->h_event_end_lpf); } +#endif + + /* Move storing frame_type out of the above loop since it is also + * needed in motion search besides loopfilter */ + cm->last_frame_type = cm->frame_type; // Update rate control heuristics cpi->total_byte_count += (*size); @@ -5103,7 +5104,9 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) + { generate_psnr_packet(cpi); + } #if CONFIG_PSNR @@ -5119,12 +5122,35 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if (cpi->b_calculate_psnr) { double y, u, v; - double sq_error; - double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error); - - cpi->total_y += y; - cpi->total_u += u; - cpi->total_v += v; + double ye,ue,ve; + double frame_psnr; + YV12_BUFFER_CONFIG *orig = cpi->Source; + YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer; + int y_samples = orig->y_height * orig->y_width ; + int uv_samples = orig->uv_height * orig->uv_width ; + int t_samples = y_samples + 2 * uv_samples; + long long sq_error; + + ye = calc_plane_error(orig->y_buffer, orig->y_stride, + recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height, + IF_RTCD(&cpi->rtcd.variance)); + + ue = calc_plane_error(orig->u_buffer, orig->uv_stride, + recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + ve = calc_plane_error(orig->v_buffer, orig->uv_stride, + recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + sq_error = ye + ue + ve; + + frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error); + + cpi->total_y += vp8_mse2psnr(y_samples, 255.0, ye); + cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, ue); + cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve); cpi->total_sq_error += sq_error; cpi->total += frame_psnr; { @@ -5133,17 +5159,35 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc)); vp8_clear_system_state(); - frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error); - frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight); - cpi->summed_quality += frame_ssim2 * weight; - cpi->summed_weights += weight; + ye = calc_plane_error(orig->y_buffer, orig->y_stride, + pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height, + IF_RTCD(&cpi->rtcd.variance)); - cpi->totalp_y += y2; - cpi->totalp_u += u2; - cpi->totalp_v += v2; - cpi->totalp += frame_psnr2; + ue = calc_plane_error(orig->u_buffer, orig->uv_stride, + pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + ve = calc_plane_error(orig->v_buffer, orig->uv_stride, + pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + sq_error = ye + ue + ve; + + frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error); + + cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye); + cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue); + cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve); cpi->total_sq_error2 += sq_error; + cpi->totalp += frame_psnr2; + + frame_ssim2 = vp8_calc_ssim(cpi->Source, + &cm->post_proc_buffer, 1, &weight, + IF_RTCD(&cpi->rtcd.variance)); + + cpi->summed_quality += frame_ssim2 * weight; + cpi->summed_weights += weight; } } diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 6d0cbd9fc..0e53f6803 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -496,11 +496,6 @@ typedef struct struct vpx_codec_pkt_list *output_pkt_list; int first_pass_done; -#if !(CONFIG_REALTIME_ONLY) - unsigned char *fp_motion_map; - unsigned char *fp_motion_map_stats, *fp_motion_map_stats_save; -#endif - #if 0 // Experimental code for lagged and one pass ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS]; @@ -604,12 +599,17 @@ typedef struct int encoding_thread_count; pthread_t *h_encoding_thread; + pthread_t h_filter_thread; + MB_ROW_COMP *mb_row_ei; ENCODETHREAD_DATA *en_thread_data; + LPFTHREAD_DATA lpf_thread_data; //events sem_t *h_event_start_encoding; sem_t h_event_end_encoding; + sem_t h_event_start_lpf; + sem_t h_event_end_lpf; #endif TOKENLIST *tplist; @@ -642,8 +642,6 @@ typedef struct YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; int fixed_divide[512]; #endif - // Flag to indicate temporal filter method - int use_weighted_temporal_filter; #if CONFIG_PSNR int count; diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 6ab85adbc..0790d3517 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -664,7 +664,8 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec case V_PRED: case H_PRED: case TM_PRED: - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode]; this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index bfffe43d9..9797f5f25 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -842,7 +842,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) { int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100; - if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level)) + if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || + (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level)) { int percent_low = 0; @@ -851,9 +852,12 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) // If we are are below the optimal buffer fullness level and adherence // to buffering contraints is important to the end useage then adjust // the per frame target. - if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level)) + if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + (cpi->buffer_level < cpi->oxcf.optimal_buffer_level)) { - percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits; + percent_low = + (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / + one_percent_bits; if (percent_low > 100) percent_low = 100; @@ -864,7 +868,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) else if (cpi->bits_off_target < 0) { // Adjust per frame data target downwards to compensate. - percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8)); + percent_low = (int)(100 * -cpi->bits_off_target / + (cpi->total_byte_count * 8)); if (percent_low > 100) percent_low = 100; @@ -873,39 +878,60 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) } // lower the target bandwidth for this frame. - cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100; + cpi->this_frame_target = + (cpi->this_frame_target * (100 - (percent_low / 2))) / 100; - // Are we using allowing control of active_worst_allowed_q according to buffer level. + // Are we using allowing control of active_worst_allowed_q + // according to buffer level. if (cpi->auto_worst_q) { int critical_buffer_level; - // For streaming applications the most important factor is cpi->buffer_level as this takes - // into account the specified short term buffering constraints. However, hitting the long - // term clip data rate target is also important. + // For streaming applications the most important factor is + // cpi->buffer_level as this takes into account the + // specified short term buffering constraints. However, + // hitting the long term clip data rate target is also + // important. if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - // Take the smaller of cpi->buffer_level and cpi->bits_off_target - critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target; + // Take the smaller of cpi->buffer_level and + // cpi->bits_off_target + critical_buffer_level = + (cpi->buffer_level < cpi->bits_off_target) + ? cpi->buffer_level : cpi->bits_off_target; } - // For local file playback short term buffering contraints are less of an issue + // For local file playback short term buffering contraints + // are less of an issue else { - // Consider only how we are doing for the clip as a whole + // Consider only how we are doing for the clip as a + // whole critical_buffer_level = cpi->bits_off_target; } - // Set the active worst quality based upon the selected buffer fullness number. + // Set the active worst quality based upon the selected + // buffer fullness number. if (critical_buffer_level < cpi->oxcf.optimal_buffer_level) { - if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4)) + if ( critical_buffer_level > + (cpi->oxcf.optimal_buffer_level >> 2) ) { - int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi; - int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4)); - - // Step active worst quality down from cpi->ni_av_qi when (critical_buffer_level == cpi->optimal_buffer_level) - // to cpi->oxcf.worst_allowed_q when (critical_buffer_level == cpi->optimal_buffer_level/4) - cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4)); + INT64 qadjustment_range = + cpi->worst_quality - cpi->ni_av_qi; + INT64 above_base = + (critical_buffer_level - + (cpi->oxcf.optimal_buffer_level >> 2)); + + // Step active worst quality down from + // cpi->ni_av_qi when (critical_buffer_level == + // cpi->optimal_buffer_level) to + // cpi->worst_quality when + // (critical_buffer_level == + // cpi->optimal_buffer_level >> 2) + cpi->active_worst_quality = + cpi->worst_quality - + ((qadjustment_range * above_base) / + (cpi->oxcf.optimal_buffer_level*3>>2)); } else { @@ -965,6 +991,15 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) // Set the active worst quality cpi->active_worst_quality = cpi->worst_quality; } + + // Special trap for constrained quality mode + // "active_worst_quality" may never drop below cq level + // for any frame type. + if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && + cpi->active_worst_quality < cpi->cq_target_quality) + { + cpi->active_worst_quality = cpi->cq_target_quality; + } } // Test to see if we have to drop a frame diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index b0dcfe0a4..c706c575f 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -745,7 +745,8 @@ int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, { x->e_mbd.mode_info_context->mbmi.mode = mode; - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); macro_block_yrd(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd.encodemb)); rate = ratey + x->mbmode_cost[x->e_mbd.frame_type] @@ -2038,7 +2039,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int case H_PRED: case TM_PRED: x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); macro_block_yrd(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ; rate2 += rate_y; distortion2 += distortion; diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index 4ebcba1a1..64d67c6dd 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -11,298 +11,13 @@ #include "vpx_scale/yv12config.h" #include "math.h" +#include "onyx_int.h" -#define C1 (float)(64 * 64 * 0.01*255*0.01*255) -#define C2 (float)(64 * 64 * 0.03*255*0.03*255) - -static int width_y; -static int height_y; -static int height_uv; -static int width_uv; -static int stride_uv; -static int stride; -static int lumimask; -static int luminance; -static double plane_summed_weights = 0; - -static short img12_sum_block[8*4096*4096*2] ; - -static short img1_sum[8*4096*2]; -static short img2_sum[8*4096*2]; -static int img1_sq_sum[8*4096*2]; -static int img2_sq_sum[8*4096*2]; -static int img12_mul_sum[8*4096*2]; - - -double vp8_similarity -( - int mu_x, - int mu_y, - int pre_mu_x2, - int pre_mu_y2, - int pre_mu_xy2 -) -{ - int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy; - - mu_x2 = mu_x * mu_x; - mu_y2 = mu_y * mu_y; - mu_xy = mu_x * mu_y; - - theta_x2 = 64 * pre_mu_x2 - mu_x2; - theta_y2 = 64 * pre_mu_y2 - mu_y2; - theta_xy = 64 * pre_mu_xy2 - mu_xy; - - return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2)); -} - -double vp8_ssim -( - const unsigned char *img1, - const unsigned char *img2, - int stride_img1, - int stride_img2, - int width, - int height -) -{ - int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp; - - double plane_quality, weight, mean; - - short *img1_sum_ptr1, *img1_sum_ptr2; - short *img2_sum_ptr1, *img2_sum_ptr2; - int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2; - int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2; - int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2; - - plane_quality = 0; - - if (lumimask) - plane_summed_weights = 0.0f; - else - plane_summed_weights = (height - 7) * (width - 7); - - //some prologue for the main loop - temp = 8 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum[x] = img1[x]; - img2_sum[x] = img2[x]; - img1_sq_sum[x] = img1[x] * img1[x]; - img2_sq_sum[x] = img2[x] * img2[x]; - img12_mul_sum[x] = img1[x] * img2[x]; - - img1_sum_ptr1[x] = 0; - img2_sum_ptr1[x] = 0; - img1_sq_sum_ptr1[x] = 0; - img2_sq_sum_ptr1[x] = 0; - img12_mul_sum_ptr1[x] = 0; - } - - //the main loop - for (y = 1; y < height; y++) - { - img1 += stride_img1; - img2 += stride_img2; - - temp = (y - 1) % 9 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - temp = y % 9 * width; - - img1_sum_ptr2 = img1_sum + temp; - img2_sum_ptr2 = img2_sum + temp; - img1_sq_sum_ptr2 = img1_sq_sum + temp; - img2_sq_sum_ptr2 = img2_sq_sum + temp; - img12_mul_sum_ptr2 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum_ptr2[x] = img1_sum_ptr1[x] + img1[x]; - img2_sum_ptr2[x] = img2_sum_ptr1[x] + img2[x]; - img1_sq_sum_ptr2[x] = img1_sq_sum_ptr1[x] + img1[x] * img1[x]; - img2_sq_sum_ptr2[x] = img2_sq_sum_ptr1[x] + img2[x] * img2[x]; - img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x]; - } - - if (y > 6) - { - //calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum - temp = (y + 1) % 9 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum_ptr1[x] = img1_sum_ptr2[x] - img1_sum_ptr1[x]; - img2_sum_ptr1[x] = img2_sum_ptr2[x] - img2_sum_ptr1[x]; - img1_sq_sum_ptr1[x] = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x]; - img2_sq_sum_ptr1[x] = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x]; - img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x]; - } - - //here we calculate the sum over the 8x8 block of pixels - //this is done by sliding a window across the column sums for the last 8 lines - //each time adding the new column sum, and subtracting the one which fell out of the window - img1_block = 0; - img2_block = 0; - img1_sq_block = 0; - img2_sq_block = 0; - img12_mul_block = 0; - - //prologue, and calculation of simularity measure from the first 8 column sums - for (x = 0; x < 8; x++) - { - img1_block += img1_sum_ptr1[x]; - img2_block += img2_sum_ptr1[x]; - img1_sq_block += img1_sq_sum_ptr1[x]; - img2_sq_block += img2_sq_sum_ptr1[x]; - img12_mul_block += img12_mul_sum_ptr1[x]; - } - - if (lumimask) - { - y2 = y - 7; - x2 = 0; - - if (luminance) - { - mean = (img2_block + img1_block) / 128.0f; - - if (!(y2 % 2 || x2 % 2)) - *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; - } - else - { - mean = *(img12_sum_block + y2 * width_uv + x2); - mean += *(img12_sum_block + y2 * width_uv + x2 + 4); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4); - - mean /= 512.0f; - } - - weight = mean < 40 ? 0.0f : - (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f); - plane_summed_weights += weight; - - plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - else - plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - - //and for the rest - for (x = 8; x < width; x++) - { - img1_block = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8]; - img2_block = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8]; - img1_sq_block = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8]; - img2_sq_block = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8]; - img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8]; - - if (lumimask) - { - y2 = y - 7; - x2 = x - 7; - - if (luminance) - { - mean = (img2_block + img1_block) / 128.0f; - - if (!(y2 % 2 || x2 % 2)) - *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; - } - else - { - mean = *(img12_sum_block + y2 * width_uv + x2); - mean += *(img12_sum_block + y2 * width_uv + x2 + 4); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4); - - mean /= 512.0f; - } - - weight = mean < 40 ? 0.0f : - (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f); - plane_summed_weights += weight; - - plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - else - plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - } - } - - if (plane_summed_weights == 0) - return 1.0f; - else - return plane_quality / plane_summed_weights; -} - -double vp8_calc_ssim -( - YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, - int lumamask, - double *weight -) -{ - double a, b, c; - double frame_weight; - double ssimv; - - width_y = source->y_width; - height_y = source->y_height; - height_uv = source->uv_height; - width_uv = source->uv_width; - stride_uv = dest->uv_stride; - stride = dest->y_stride; - - lumimask = lumamask; - - luminance = 1; - a = vp8_ssim(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, source->y_width, source->y_height); - luminance = 0; - - frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7)); - - if (frame_weight == 0) - a = b = c = 1.0f; - else - { - b = vp8_ssim(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); - - c = vp8_ssim(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); - } - - ssimv = a * .8 + .1 * (b + c); - - *weight = frame_weight; - - return ssimv; -} - +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif // Google version of SSIM // SSIM #define KERNEL 3 @@ -520,3 +235,174 @@ double vp8_calc_ssimg *ssim_v /= uvsize; return ssim_all; } + + +void ssim_parms_c +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +) +{ + int i,j; + for(i=0;i<16;i++,s+=sp,r+=rp) + { + for(j=0;j<16;j++) + { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +void ssim_parms_8x8_c +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +) +{ + int i,j; + for(i=0;i<8;i++,s+=sp,r+=rp) + { + for(j=0;j<8;j++) + { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +const static long long c1 = 426148; // (256^2*(.01*255)^2 +const static long long c2 = 3835331; //(256^2*(.03*255)^2 + +static double similarity +( + unsigned long sum_s, + unsigned long sum_r, + unsigned long sum_sq_s, + unsigned long sum_sq_r, + unsigned long sum_sxr, + int count +) +{ + long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2); + + long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ; + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256); +} +static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); +} + +// TODO: (jbb) tried to scale this function such that we may be able to use it +// for distortion metric in mode selection code ( provided we do a reconstruction) +long dssim(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + double ssim3; + long long ssim_n; + long long ssim_d; + + rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2); + + ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ; + + ssim3 = 256 * (ssim_d-ssim_n) / ssim_d; + return (long)( 256*ssim3 * ssim3 ); +} +// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels +// such that the window regions overlap block boundaries to penalize blocking +// artifacts. + +double vp8_ssim2 +( + unsigned char *img1, + unsigned char *img2, + int stride_img1, + int stride_img2, + int width, + int height, + const vp8_variance_rtcd_vtable_t *rtcd +) +{ + int i,j; + + double ssim_total=0; + + // we can sample points as frequently as we like start with 1 per 8x8 + for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8) + { + for(j=0; j < width; j+=8 ) + { + ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd); + } + } + ssim_total /= (width/8 * height /8); + return ssim_total; + +} +double vp8_calc_ssim +( + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + int lumamask, + double *weight, + const vp8_variance_rtcd_vtable_t *rtcd +) +{ + double a, b, c; + double ssimv; + + a = vp8_ssim2(source->y_buffer, dest->y_buffer, + source->y_stride, dest->y_stride, source->y_width, + source->y_height, rtcd); + + b = vp8_ssim2(source->u_buffer, dest->u_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + + c = vp8_ssim2(source->v_buffer, dest->v_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c index 0f8e654a0..fd36b22eb 100644 --- a/vp8/encoder/temporal_filter.c +++ b/vp8/encoder/temporal_filter.c @@ -287,8 +287,7 @@ static void vp8_temporal_filter_iterate_c int byte; int frame; int mb_col, mb_row; - unsigned int filter_weight[MAX_LAG_BUFFERS]; - unsigned char *mm_ptr = cpi->fp_motion_map; + unsigned int filter_weight; int mb_cols = cpi->common.mb_cols; int mb_rows = cpi->common.mb_rows; int MBs = cpi->common.MBs; @@ -306,13 +305,6 @@ static void vp8_temporal_filter_iterate_c unsigned char *u_buffer = mbd->pre.u_buffer; unsigned char *v_buffer = mbd->pre.v_buffer; - if (!cpi->use_weighted_temporal_filter) - { - // Temporal filtering is unweighted - for (frame = 0; frame < frame_count; frame++) - filter_weight[frame] = 1; - } - for (mb_row = 0; mb_row < mb_rows; mb_row++) { #if ALT_REF_MC_ENABLED @@ -338,34 +330,9 @@ static void vp8_temporal_filter_iterate_c + (VP8BORDERINPIXELS - 19); #endif - // Read & process macroblock weights from motion map - if (cpi->use_weighted_temporal_filter) - { - weight_cap = 2; - - for (frame = alt_ref_index-1; frame >= 0; frame--) - { - w = *(mm_ptr + (frame+1)*MBs); - filter_weight[frame] = w < weight_cap ? w : weight_cap; - weight_cap = w; - } - - filter_weight[alt_ref_index] = 2; - - weight_cap = 2; - - for (frame = alt_ref_index+1; frame < frame_count; frame++) - { - w = *(mm_ptr + frame*MBs); - filter_weight[frame] = w < weight_cap ? w : weight_cap; - weight_cap = w; - } - - } - for (frame = 0; frame < frame_count; frame++) { - int err; + int err = 0; if (cpi->frames[frame] == NULL) continue; @@ -374,28 +341,25 @@ static void vp8_temporal_filter_iterate_c mbd->block[0].bmi.mv.as_mv.col = 0; #if ALT_REF_MC_ENABLED - //if (filter_weight[frame] == 0) - { #define THRESH_LOW 10000 #define THRESH_HIGH 20000 - // Correlation has been lost try MC - err = vp8_temporal_filter_find_matching_mb_c - (cpi, - cpi->frames[alt_ref_index], - cpi->frames[frame], - mb_y_offset, - THRESH_LOW); - - if (filter_weight[frame] < 2) - { - // Set weight depending on error - filter_weight[frame] = err<THRESH_LOW - ? 2 : err<THRESH_HIGH ? 1 : 0; - } - } + // Find best match in this frame by MC + err = vp8_temporal_filter_find_matching_mb_c + (cpi, + cpi->frames[alt_ref_index], + cpi->frames[frame], + mb_y_offset, + THRESH_LOW); + #endif - if (filter_weight[frame] != 0) + // Assign higher weight to matching MB if it's error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + filter_weight = err<THRESH_LOW + ? 2 : err<THRESH_HIGH ? 1 : 0; + + if (filter_weight != 0) { // Construct the predictors vp8_temporal_filter_predictors_mb_c @@ -415,7 +379,7 @@ static void vp8_temporal_filter_iterate_c predictor, 16, strength, - filter_weight[frame], + filter_weight, accumulator, count); @@ -425,7 +389,7 @@ static void vp8_temporal_filter_iterate_c predictor + 256, 8, strength, - filter_weight[frame], + filter_weight, accumulator + 256, count + 256); @@ -435,7 +399,7 @@ static void vp8_temporal_filter_iterate_c predictor + 320, 8, strength, - filter_weight[frame], + filter_weight, accumulator + 320, count + 320); } @@ -491,7 +455,6 @@ static void vp8_temporal_filter_iterate_c byte += stride - 8; } - mm_ptr++; mb_y_offset += 16; mb_uv_offset += 8; } diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h index 5befd3b86..bf17ea8b6 100644 --- a/vp8/encoder/variance.h +++ b/vp8/encoder/variance.h @@ -85,6 +85,19 @@ unsigned int *sse \ ); +#define prototype_ssimpf(sym) \ + void (sym) \ + ( \ + unsigned char *s, \ + int sp, \ + unsigned char *r, \ + int rp, \ + unsigned long *sum_s, \ + unsigned long *sum_r, \ + unsigned long *sum_sq_s, \ + unsigned long *sum_sq_r, \ + unsigned long *sum_sxr \ + ); #define prototype_getmbss(sym) unsigned int (sym)(const short *) @@ -306,6 +319,15 @@ extern prototype_variance2(vp8_variance_get16x16var); #endif extern prototype_sad(vp8_variance_get4x4sse_cs); +#ifndef vp8_ssimpf +#define vp8_ssimpf ssim_parms_c +#endif +extern prototype_ssimpf(vp8_ssimpf) + +#ifndef vp8_ssimpf_8x8 +#define vp8_ssimpf_8x8 ssim_parms_8x8_c +#endif +extern prototype_ssimpf(vp8_ssimpf_8x8) typedef prototype_sad(*vp8_sad_fn_t); typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t); @@ -315,6 +337,10 @@ typedef prototype_variance(*vp8_variance_fn_t); typedef prototype_variance2(*vp8_variance2_fn_t); typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t); typedef prototype_getmbss(*vp8_getmbss_fn_t); + +typedef prototype_ssimpf(*vp8_ssimpf_fn_t) + + typedef struct { vp8_sad_fn_t sad4x4; @@ -365,6 +391,11 @@ typedef struct vp8_sad_multi_d_fn_t sad8x8x4d; vp8_sad_multi_d_fn_t sad4x4x4d; +#if CONFIG_PSNR + vp8_ssimpf_fn_t ssimpf_8x8; + vp8_ssimpf_fn_t ssimpf; +#endif + } vp8_variance_rtcd_vtable_t; typedef struct @@ -378,6 +409,7 @@ typedef struct vp8_sad_multi_fn_t sdx3f; vp8_sad_multi1_fn_t sdx8f; vp8_sad_multi_d_fn_t sdx4df; + } vp8_variance_fn_ptr_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm index 21e2e5007..03ecec4b3 100644 --- a/vp8/encoder/x86/sad_sse4.asm +++ b/vp8/encoder/x86/sad_sse4.asm @@ -186,7 +186,7 @@ sym(vp8_sad16x16x8_sse4): PROCESS_16X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -224,7 +224,7 @@ sym(vp8_sad16x8x8_sse4): PROCESS_16X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -262,7 +262,7 @@ sym(vp8_sad8x8x8_sse4): PROCESS_8X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -303,7 +303,7 @@ sym(vp8_sad8x16x8_sse4): PROCESS_8X2X8 0 PROCESS_8X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -339,7 +339,7 @@ sym(vp8_sad4x4x8_sse4): PROCESS_4X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm new file mode 100644 index 000000000..c267cdb54 --- /dev/null +++ b/vp8/encoder/x86/ssim_opt.asm @@ -0,0 +1,215 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddq xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddq xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddq xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void ssim_parms_sse3( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(vp8_ssim_parms_16x16_sse3) +sym(vp8_ssim_parms_16x16_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movq [rdi], xmm15; + mov rdi,arg(5) + movq [rdi], xmm14; + mov rdi,arg(6) + movq [rdi], xmm13; + mov rdi,arg(7) + movq [rdi], xmm12; + mov rdi,arg(8) + movq [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse3( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(vp8_ssim_parms_8x8_sse3) +sym(vp8_ssim_parms_8x8_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +NextRow2: + + ;grab source and reference pixels + movq xmm5, [rsi] + movq xmm6, [rdi] + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz NextRow2 + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movq [rdi], xmm15; + mov rdi,arg(5) + movq [rdi], xmm14; + mov rdi,arg(6) + movq [rdi], xmm13; + mov rdi,arg(7) + movq [rdi], xmm12; + mov rdi,arg(8) + movq [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 3158ac12b..5ab364147 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -176,6 +176,25 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) d->dqcoeff ); } +#if CONFIG_PSNR +#if ARCH_X86_64 +typedef void ssimpf +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +); + +extern ssimpf vp8_ssim_parms_16x16_sse3; +extern ssimpf vp8_ssim_parms_8x8_sse3; +#endif +#endif #endif @@ -280,6 +299,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2; cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2; cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2; + + /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */; cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2; @@ -339,9 +360,18 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; +#if CONFIG_PSNR +#if ARCH_X86_64 + cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3; + cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3; +#endif +#endif + } #endif + + #if HAVE_SSE4_1 if (SSE4_1Enabled) { diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index a45a37912..2622738ec 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -199,7 +199,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, { int mb_r = (cfg->g_h + 15) / 16; int mb_c = (cfg->g_w + 15) / 16; - size_t packet_sz = vp8_firstpass_stats_sz(mb_r * mb_c); + size_t packet_sz = sizeof(FIRSTPASS_STATS); int n_packets = cfg->rc_twopass_stats_in.sz / packet_sz; FIRSTPASS_STATS *stats; diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index c0ae250f5..8f0681fb9 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -116,6 +116,7 @@ VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm +VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index 429898a61..b07ee8ffb 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -34,6 +34,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar #File list for armv6 # encoder +VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM) diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm index a872b280e..be64cd7c7 100644 --- a/vpx_ports/x86_abi_support.asm +++ b/vpx_ports/x86_abi_support.asm @@ -168,15 +168,10 @@ %macro GET_GOT 1 push %1 call %%get_got - %%sub_offset: - jmp %%exitGG %%get_got: - mov %1, [esp] - add %1, fake_got - %%sub_offset - ret - %%exitGG: + pop %1 %undef GLOBAL - %define GLOBAL(x) x + %1 - fake_got + %define GLOBAL(x) x + %1 - %%get_got %undef RESTORE_GOT %define RESTORE_GOT pop %1 %endmacro @@ -289,7 +284,6 @@ %elifidn __OUTPUT_FORMAT__,macho32 %macro SECTION_RODATA 0 section .text -fake_got: %endmacro %else %define SECTION_RODATA section .rodata |