135 files changed, 8335 insertions, 4635 deletions
@@ -1,4 +1,4 @@ -Copyright (c) 2010, Google Inc. All rights reserved. +Copyright (c) 2010, The WebM Project authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -12,9 +12,10 @@ met: the documentation and/or other materials provided with the distribution. - * Neither the name of Google nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + * Neither the name of Google, nor the WebM Project, nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -57,7 +57,7 @@ int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) } else if (def->long_name) { - int name_len = strlen(def->long_name); + const size_t name_len = strlen(def->long_name); if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index 388133aa2..cea967f93 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -126,15 +126,14 @@ while (<STDIN>) # ALIGN directive s/ALIGN/.balign/g; - # Strip ARM - s/\sARM/@ ARM/g; + # ARM code + s/\sARM/.arm/g; - # Strip REQUIRE8 - #s/\sREQUIRE8/@ REQUIRE8/g; - s/\sREQUIRE8/@ /g; #EQU cause problem + # REQUIRE8 Stack is required to be 8-byte aligned + s/\sREQUIRE8/.eabi_attribute Tag_ABI_align_needed, 1/g; - # Strip PRESERVE8 - s/\sPRESERVE8/@ PRESERVE8/g; + # PRESERVE8 Stack 8-byte align is preserved + s/\sPRESERVE8/.eabi_attribute Tag_ABI_align_preserved, 1/g; # Use PROC and ENDP to give the symbols a .size directive. # This makes them show up properly in debugging tools like gdb and valgrind. 
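The arg_match() hunk above changes name_len from int to const size_t so it matches the return type of strlen(). A minimal sketch of the long-option check that comparison feeds, written as a hypothetical standalone helper (matches_long_name is not a function in the tree; it only illustrates the guard shown in the hunk):

#include <string.h>

/* Hypothetical sketch: a token matches "--long_name" when it is at
 * least name_len + 2 characters ("--" plus the name), starts with
 * "--", and the name is followed by '=' or the end of the token.
 * name_len is size_t, matching what strlen() returns. */
static int matches_long_name(const char *token, const char *long_name) {
    const size_t name_len = strlen(long_name);
    return strlen(token) >= name_len + 2 &&
           token[0] == '-' && token[1] == '-' &&
           strncmp(token + 2, long_name, name_len) == 0 &&
           (token[name_len + 2] == '\0' || token[name_len + 2] == '=');
}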
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index 78f4a97f5..81280bf78 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -30,6 +30,8 @@ my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8 my @incoming_array; +my @imported_functions; + # Perl trim function to remove whitespace from the start and end of the string sub trim($) { @@ -132,7 +134,18 @@ while (<STDIN>) # Make function visible to linker, and make additional symbol with # prepended underscore s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/; - s/IMPORT\s+\|([\$\w]*)\|/.globl $1/; + + # Prepend imported functions with _ + if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/) + { + $function = trim($1); + push(@imported_functions, $function); + } + + foreach $function (@imported_functions) + { + s/$function/_$function/; + } # No vertical bars required; make additional symbol with prepended # underscore @@ -157,8 +170,8 @@ while (<STDIN>) s/\sPRESERVE8/@ PRESERVE8/g; # Strip PROC and ENDPROC - s/PROC/@/g; - s/ENDP/@/g; + s/\bPROC\b/@/g; + s/\bENDP\b/@/g; # EQU directive s/(.*)EQU(.*)/.set $1, $2/; diff --git a/build/make/configure.sh b/build/make/configure.sh index 1279f781a..0426f9220 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -561,6 +561,10 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=darwin10 ;; + *darwin11*) + tgt_isa=x86_64 + tgt_os=darwin11 + ;; *mingw32*|*cygwin*) [ -z "$tgt_isa" ] && tgt_isa=x86 tgt_os=win32 @@ -617,6 +621,9 @@ process_common_toolchain() { if [ -d "/Developer/SDKs/MacOSX10.6.sdk" ]; then osx_sdk_dir="/Developer/SDKs/MacOSX10.6.sdk" fi + if [ -d "/Developer/SDKs/MacOSX10.7.sdk" ]; then + osx_sdk_dir="/Developer/SDKs/MacOSX10.7.sdk" + fi case ${toolchain} in *-darwin8-*) @@ -637,6 +644,12 @@ process_common_toolchain() { add_ldflags "-isysroot ${osx_sdk_dir}" add_ldflags "-mmacosx-version-min=10.6" ;; + *-darwin11-*) + add_cflags "-isysroot ${osx_sdk_dir}" + add_cflags "-mmacosx-version-min=10.7" + add_ldflags "-isysroot ${osx_sdk_dir}" + add_ldflags "-mmacosx-version-min=10.7" + ;; esac # Handle Solaris variants. Solaris 10 needs -lposix4 @@ -732,7 +745,7 @@ process_common_toolchain() { TOOLCHAIN_PATH=${SDK_PATH}/usr/bin CC=${TOOLCHAIN_PATH}/gcc AR=${TOOLCHAIN_PATH}/ar - LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1 + LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-llvm-gcc-4.2 AS=${TOOLCHAIN_PATH}/as STRIP=${TOOLCHAIN_PATH}/strip NM=${TOOLCHAIN_PATH}/nm @@ -746,13 +759,13 @@ process_common_toolchain() { add_cflags -arch ${tgt_isa} add_ldflags -arch_only ${tgt_isa} - add_cflags "-isysroot ${SDK_PATH}/SDKs/iPhoneOS4.3.sdk" + add_cflags "-isysroot ${SDK_PATH}/SDKs/iPhoneOS5.0.sdk" # This should be overridable - alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.3.sdk + alt_libc=${SDK_PATH}/SDKs/iPhoneOS5.0.sdk # Add the paths for the alternate libc - for d in usr/include usr/include/gcc/darwin/4.2/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do + for d in usr/include; do try_dir="${alt_libc}/${d}" [ -d "${try_dir}" ] && add_cflags -I"${try_dir}" done @@ -35,7 +35,7 @@ Advanced options: ${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders) ${toggle_mem_tracker} track memory usage ${toggle_postproc} postprocessing - ${toggle_multithread} multithreaded encoding and decoding. 
+ ${toggle_multithread} multithreaded encoding and decoding ${toggle_spatial_resampling} spatial sampling (scaling) support ${toggle_realtime_only} enable this option while building for real-time encoding ${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses @@ -44,6 +44,7 @@ Advanced options: ${toggle_static} static library support ${toggle_small} favor smaller size over speed ${toggle_postproc_visualizer} macro block / block level visualizers + ${toggle_multi_res_encoding} enable multiple-resolution encoding Codecs: Codecs can be selectively enabled or disabled individually, or by family: @@ -118,9 +119,11 @@ all_platforms="${all_platforms} x86-win32-vs8" all_platforms="${all_platforms} x86-win32-vs9" all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" +all_platforms="${all_platforms} x86_64-darwin11-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-solaris-gcc" +all_platforms="${all_platforms} x86_64-win64-gcc" all_platforms="${all_platforms} x86_64-win64-vs8" all_platforms="${all_platforms} x86_64-win64-vs9" all_platforms="${all_platforms} universal-darwin8-gcc" @@ -261,6 +264,7 @@ CONFIG_LIST=" postproc_visualizer os_support unit_tests + multi_res_encoding " CMDLINE_SELECT=" extra_warnings @@ -303,6 +307,7 @@ CMDLINE_SELECT=" small postproc_visualizer unit_tests + multi_res_encoding " process_cmdline() { diff --git a/examples.mk b/examples.mk index 8088d3217..f6c904588 100644 --- a/examples.mk +++ b/examples.mk @@ -96,6 +96,17 @@ GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame +# C file is provided, not generated automatically. +GEN_EXAMPLES-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c +vp8_multi_resolution_encoder.SRCS \ + += third_party/libyuv/include/libyuv/basic_types.h \ + third_party/libyuv/include/libyuv/cpu_id.h \ + third_party/libyuv/include/libyuv/scale.h \ + third_party/libyuv/source/row.h \ + third_party/libyuv/source/scale.c \ + third_party/libyuv/source/cpu_id.c +vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de +vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding # Handle extra library flags depending on codec configuration diff --git a/third_party/libyuv/README.webm b/third_party/libyuv/README.webm new file mode 100644 index 000000000..d3495caa1 --- /dev/null +++ b/third_party/libyuv/README.webm @@ -0,0 +1,17 @@ +Name: libyuv +URL: http://code.google.com/p/libyuv/ +Version: 102 +License: BSD +License File: LICENSE + +Description: +libyuv is an open source project that includes YUV conversion and scaling +functionality. + +The optimized scaler in libyuv is used in the multiple-resolution encoder example, +which down-samples the original input video (e.g. 1280x720) a number of times +in order to encode multiple resolution bit streams. + +Local Modifications: +Modified the original scaler code from C++ to C to fit in our current build +system. This is a temporary solution, and will be improved later.
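As the README above notes, the example uses libyuv's scaler to down-sample the input frame. A minimal sketch of one such downscale through the I420Scale() API this commit imports (declared in scale.h below); the 1280x720 dimensions, contiguous packed planes, and the helper name downscale_half are illustrative, not taken from the example itself:

#include "third_party/libyuv/include/libyuv/scale.h"

/* Illustrative 2x downscale of one I420 frame (1280x720 -> 640x360).
 * Assumes caller-allocated planes with stride == plane width.
 * Returns 0 on success, per the I420Scale() contract. */
static int downscale_half(const uint8* src_y, const uint8* src_u,
                          const uint8* src_v, uint8* dst_y,
                          uint8* dst_u, uint8* dst_v) {
    const int sw = 1280, sh = 720;        /* source dimensions (example) */
    const int dw = sw / 2, dh = sh / 2;   /* destination dimensions      */
    return I420Scale(src_y, sw,           /* Y plane, full-width stride  */
                     src_u, sw / 2,       /* U plane, half-width stride  */
                     src_v, sw / 2,       /* V plane, half-width stride  */
                     sw, sh,
                     dst_y, dw,
                     dst_u, dw / 2,
                     dst_v, dw / 2,
                     dw, dh,
                     kFilterBox);         /* highest-quality filter mode */
}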
\ No newline at end of file diff --git a/third_party/libyuv/include/libyuv/basic_types.h b/third_party/libyuv/include/libyuv/basic_types.h new file mode 100644 index 000000000..30504ce66 --- /dev/null +++ b/third_party/libyuv/include/libyuv/basic_types.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include <stddef.h> // for NULL, size_t + +#if !(defined(_MSC_VER) && (_MSC_VER < 1600)) +#include <stdint.h> // for uintptr_t +#endif + +#ifndef INT_TYPES_DEFINED +#define INT_TYPES_DEFINED +#ifdef COMPILER_MSVC +typedef unsigned __int64 uint64; +typedef __int64 int64; +#ifndef INT64_C +#define INT64_C(x) x ## I64 +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UI64 +#endif +#define INT64_F "I64" +#else // COMPILER_MSVC +#ifdef __LP64__ +typedef unsigned long uint64; +typedef long int64; +#ifndef INT64_C +#define INT64_C(x) x ## L +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UL +#endif +#define INT64_F "l" +#else // __LP64__ +typedef unsigned long long uint64; +typedef long long int64; +#ifndef INT64_C +#define INT64_C(x) x ## LL +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## ULL +#endif +#define INT64_F "ll" +#endif // __LP64__ +#endif // COMPILER_MSVC +typedef unsigned int uint32; +typedef int int32; +typedef unsigned short uint16; +typedef short int16; +typedef unsigned char uint8; +typedef char int8; +#endif // INT_TYPES_DEFINED + +// Detect compiler is for x86 or x64. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) +#define CPU_X86 1 +#endif + +#define ALIGNP(p, t) \ + ((uint8*)((((uintptr_t)(p) + \ + ((t)-1)) & ~((t)-1)))) + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h new file mode 100644 index 000000000..4a53b5bef --- /dev/null +++ b/third_party/libyuv/include/libyuv/cpu_id.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#define INCLUDE_LIBYUV_CPU_ID_H_ + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// These flags are only valid on x86 processors +static const int kCpuHasSSE2 = 1; +static const int kCpuHasSSSE3 = 2; + +// These flags are only valid on ARM processors +static const int kCpuHasNEON = 4; + +// Internal flag to indicate cpuid is initialized. +static const int kCpuInitialized = 8; + +// Detect CPU has SSE2 etc. +// test_flag parameter should be one of kCpuHas constants above +// returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + extern int cpu_info_; + extern int InitCpuFlags(); + return (cpu_info_ ? 
cpu_info_ : InitCpuFlags()) & test_flag; +} + +// For testing, allow CPU flags to be disabled. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. +// -1 to enable all cpu specific optimizations. +// 0 to disable all cpu specific optimizations. +void MaskCpuFlags(int enable_flags); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h new file mode 100644 index 000000000..21fe360ce --- /dev/null +++ b/third_party/libyuv/include/libyuv/scale.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported filtering +typedef enum { + kFilterNone = 0, // Point sample; Fastest + kFilterBilinear = 1, // Faster than box, but lower quality scaling down. + kFilterBox = 2 // Highest quality +}FilterMode; + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce ever better +// quality image, at further expense of speed. +// Returns 0 if successful. + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering); + +// Legacy API. Deprecated +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + int interpolate); + +// Legacy API. Deprecated +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + int interpolate); + +// For testing, allow disabling of optimizations. +void SetUseReferenceImpl(int use); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/third_party/libyuv/source/cpu_id.c b/third_party/libyuv/source/cpu_id.c new file mode 100644 index 000000000..fccf3dd44 --- /dev/null +++ b/third_party/libyuv/source/cpu_id.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/cpu_id.h" + +#ifdef _MSC_VER +#include <intrin.h> +#endif +#ifdef __ANDROID__ +#include <cpu-features.h> +#endif + +#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86 + +// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. +#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) +static inline void __cpuid(int cpu_info[4], int info_type) { + asm volatile ( + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#elif defined(__i386__) || defined(__x86_64__) +static inline void __cpuid(int cpu_info[4], int info_type) { + asm volatile ( + "cpuid \n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// CPU detect function for SIMD instruction sets. +int cpu_info_ = 0; + +int InitCpuFlags() { +#ifdef CPU_X86 + int cpu_info[4]; + __cpuid(cpu_info, 1); + cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) | + (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) | + kCpuInitialized; +#elif defined(__ANDROID__) && defined(__ARM_NEON__) + uint64_t features = android_getCpuFeatures(); + cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) | + kCpuInitialized; +#elif defined(__ARM_NEON__) + // gcc -mfpu=neon defines __ARM_NEON__ + // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags + // to disable Neon on devices that do not have it. + cpu_info_ = kCpuHasNEON | kCpuInitialized; +#else + cpu_info_ = kCpuInitialized; +#endif + return cpu_info_; +} + +void MaskCpuFlags(int enable_flags) { + InitCpuFlags(); + cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row.h b/third_party/libyuv/source/row.h new file mode 100644 index 000000000..eabe18094 --- /dev/null +++ b/third_party/libyuv/source/row.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef LIBYUV_SOURCE_ROW_H_ +#define LIBYUV_SOURCE_ROW_H_ + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +#define kMaxStride (2048 * 4) +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) + +#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR) +#define YUV_DISABLE_ASM +#endif + +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_FASTCONVERTYUVTOARGBROW_NEON +void FastConvertYUVToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#define HAS_FASTCONVERTYUVTOBGRAROW_NEON +void FastConvertYUVToBGRARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#define HAS_FASTCONVERTYUVTOABGRROW_NEON +void FastConvertYUVToABGRRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#endif + +// The following are available on all x86 platforms +#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(YUV_DISABLE_ASM) +#define HAS_ABGRTOARGBROW_SSSE3 +#define HAS_BGRATOARGBROW_SSSE3 +#define HAS_BG24TOARGBROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOUVROW_SSSE3 +#define HAS_RAWTOUVROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_FASTCONVERTYTOARGBROW_SSE2 +#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3 +#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3 +#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3 +#define HAS_REVERSE_ROW_SSSE3 +#endif + +// The following are available on Neon platforms +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_REVERSE_ROW_NEON +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3) +#define HASRGB24TOYROW_SSSE3 +#endif +#ifdef HASRGB24TOYROW_SSSE3 +void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#ifdef HAS_REVERSE_ROW_SSSE3 +void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width); +#endif +#ifdef HAS_REVERSE_ROW_NEON +void ReverseRow_NEON(const uint8* src, uint8* dst, int width); +#endif +void ReverseRow_C(const uint8* src, uint8* dst, int width); + +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); 
+void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + +#ifdef HAS_BG24TOARGBROW_SSSE3 +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix); +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +#endif +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix); +void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +#endif +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +typedef __declspec(align(16)) signed char vec8[16]; +typedef __declspec(align(16)) unsigned char uvec8[16]; +typedef __declspec(align(16)) signed short vec16[8]; +#else // __GNUC__ +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +typedef signed char __attribute__((vector_size(16))) vec8; +typedef unsigned char __attribute__((vector_size(16))) uvec8; +typedef signed short __attribute__((vector_size(16))) vec16; +#endif + +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); + +void FastConvertYUVToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToARGBRow_C(const uint8* y_buf, + uint8* rgb_buf, + int width); + +#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2 +void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void 
FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); +#endif + +#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +#endif + +#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 +void FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // LIBYUV_SOURCE_ROW_H_ diff --git a/third_party/libyuv/source/scale.c b/third_party/libyuv/source/scale.c new file mode 100644 index 000000000..930a7ae09 --- /dev/null +++ b/third_party/libyuv/source/scale.c @@ -0,0 +1,3884 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "third_party/libyuv/include/libyuv/cpu_id.h" +#include "third_party/libyuv/source/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +/* + * Note: Defining YUV_DISABLE_ASM allows to use c version. + */ +//#define YUV_DISABLE_ASM + +#if defined(_MSC_VER) +#define ALIGN16(var) __declspec(align(16)) var +#else +#define ALIGN16(var) var __attribute__((aligned(16))) +#endif + +// Note: A Neon reference manual +// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html +// Note: Some SSE2 reference manuals +// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf + +// Set the following flag to true to revert to only +// using the reference implementation ScalePlaneBox(), and +// NOT the optimized versions. Useful for debugging and +// when comparing the quality of the resulting YUV planes +// as produced by the optimized and non-optimized versions. + +static int use_reference_impl_ = 0; + +void SetUseReferenceImpl(int use) { + use_reference_impl_ = use; +} + +// ScaleRowDown2Int also used by planar functions + +/** + * NEON downscalers with interpolation. + * + * Provided by Fritz Koenig + * + */ + +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_SCALEROWDOWN2_NEON +void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 + "vst1.u8 {q0}, [%1]! \n" // store even pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "add %1, %0 \n" // change the stride to row 2 pointer + "1: \n" + "vld1.u8 {q0,q1}, [%0]! 
\n" // load row 1 and post increment + "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.u8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define HAS_SCALEROWDOWN4_NEON +static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {d0, d1}, [%0]! \n" + "vtrn.u8 d1, d0 \n" + "vshrn.u16 d0, q0, #8 \n" + "vst1.u32 {d0[1]}, [%1]! \n" + + "subs %2, #4 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "add r4, %0, %3 \n" + "add r5, r4, %3 \n" + "add %3, r5, %3 \n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data + "vld1.u8 {q1}, [r4]! \n" + "vld1.u8 {q2}, [r5]! \n" + "vld1.u8 {q3}, [%3]! \n" + + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + + "vpaddl.u16 q0, q0 \n" + + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + + "vmovn.u16 d0, q0 \n" + "vst1.u32 {d0[0]}, [%1]! \n" + + "subs %2, #4 \n" + "bhi 1b \n" + + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(src_stride) // %3 + : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN34_NEON +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vmov d2, d3 \n" // order needs to be d0, d1, d2 + "vst3.u8 {d0, d1, d2}, [%1]! \n" + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +const uint8 shuf38[16] __attribute__ ((aligned(16))) = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +const uint8 shuf38_2[16] __attribute__ ((aligned(16))) = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +static void ScaleRowDown38_NEON(const uint8* src_ptr, int, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u8 {q3}, [%3] \n" + "1: \n" + "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.u8 {d4}, [%1]! \n" + "vst1.u32 {d5[0]}, [%1]! 
\n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(shuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "vld1.u8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(mult38_div6), // %4 + "r"(shuf38_2), // %5 + "r"(mult38_div9) // %6 + : "r4", "q0", "q1", "q2", "q3", "q8", "q9", + "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! 
\n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(mult38_div6), // %4 + "r"(shuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +/** + * SSE2 downscalers with interpolation. + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ + +// Constants for SSE2 code +#elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \ + !defined(YUV_DISABLE_ASM) +#if defined(_MSC_VER) +#define TALIGN16(t, var) __declspec(align(16)) t _ ## var +#elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#else +#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) +#endif + +#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \ + defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".globl _" #name " \n" \ +"_" #name ": \n" +#else +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".global " #name " \n" \ +#name ": \n" +#endif + + +// Offsets for source bytes 0 to 9 +//extern "C" +TALIGN16(const uint8, shuf0[16]) = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +//extern "C" +TALIGN16(const uint8, shuf1[16]) = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+//extern "C" +TALIGN16(const uint8, shuf2[16]) = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +//extern "C" +TALIGN16(const uint8, shuf01[16]) = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +//extern "C" +TALIGN16(const uint8, shuf11[16]) = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +//extern "C" +TALIGN16(const uint8, shuf21[16]) = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +//extern "C" +TALIGN16(const uint8, madd01[16]) = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +//extern "C" +TALIGN16(const uint8, madd11[16]) = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +//extern "C" +TALIGN16(const uint8, madd21[16]) = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +//extern "C" +TALIGN16(const int16, round34[8]) = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +//extern "C" +TALIGN16(const uint8, shuf38a[16]) = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +//extern "C" +TALIGN16(const uint8, shuf38b[16]) = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +//extern "C" +TALIGN16(const uint8, shufac0[16]) = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +//extern "C" +TALIGN16(const uint8, shufac3[16]) = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +//extern "C" +TALIGN16(const uint16, scaleac3[8]) = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +//extern "C" +TALIGN16(const uint8, shufab0[16]) = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +//extern "C" +TALIGN16(const uint8, shufab1[16]) = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +//extern "C" +TALIGN16(const uint8, shufab2[16]) = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +//extern "C" +TALIGN16(const uint16, scaleab2[8]) = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +#endif + +#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) + +#define HAS_SCALEROWDOWN2_SSE2 +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + ret + } +} +// Blends 32x2 rectangle to 16x1. 
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) +void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + pop esi + ret + } +} + +#define HAS_SCALEROWDOWN4_SSE2 +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + psrld xmm5, 24 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + lea edx, [ebx + ebx * 2] // src_stride * 3 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN8_SSE2 +// Point samples 32 pixels to 4 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 
+__declspec(naked) +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes + psrlq xmm5, 56 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 // 32->16 + packuswb xmm0, xmm0 // 16->8 + packuswb xmm0, xmm0 // 8->4 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +// Blends 32x8 rectangle to 4x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. +__declspec(naked) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + lea edx, [ebx + ebx * 2] // src_stride * 3 + pxor xmm7, xmm7 + + wloop: + movdqa xmm0, [esi] // average 8 rows to 1 + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea ebp, [esi + ebx * 4] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, [ebp] + movdqa xmm3, [ebp + 16] + movdqa xmm4, [ebp + ebx] + movdqa xmm5, [ebp + ebx + 16] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + movdqa xmm4, [ebp + ebx * 2] + movdqa xmm5, [ebp + ebx * 2 + 16] + movdqa xmm6, [ebp + edx] + pavgb xmm4, xmm6 + movdqa xmm6, [ebp + edx + 16] + pavgb xmm5, xmm6 + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + psadbw xmm0, xmm7 // average 32 pixels to 4 + psadbw xmm1, xmm7 + pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 + pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx + por xmm0, xmm1 // -> 3201 + psrlw xmm0, 3 + packuswb xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN34_SSSE3 +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm3, _shuf0 + movdqa xmm4, _shuf1 + movdqa xmm5, _shuf2 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edi], xmm0 + movq qword ptr [edi + 8], xmm1 + movq qword ptr [edi + 16], xmm2 + lea edi, [edi + 24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. 
+ +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 round34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN38_SSSE3 +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shuf38a + movdqa xmm5, _shuf38b + + xloop: + movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 + lea esi, [esi + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + movq 
qword ptr [edi], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edi + 8], xmm1 + lea edi, [edi + 12] + sub ecx, 12 + ja xloop + + popad + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufac0 + movdqa xmm5, _shufac3 + movdqa xmm6, _scaleac3 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 + movdqa xmm2, [esi + edx] + movhlps xmm1, xmm0 + movhlps xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + movdqa xmm2, [esi + edx * 2] + lea esi, [esi + 16] + movhlps xmm3, xmm2 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + + movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + pshufb xmm2, xmm4 + + movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + pshufb xmm3, xmm5 + paddusw xmm2, xmm3 + + pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 + packuswb xmm2, xmm2 + + movd [edi], xmm2 // write 6 pixels + pextrw eax, xmm2, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufab0 + movdqa xmm5, _shufab1 + movdqa xmm6, _shufab2 + movdqa xmm7, _scaleab2 + + xloop: + movdqa xmm2, [esi] // average 2 rows into xmm2 + pavgb xmm2, [esi + edx] + lea esi, [esi + 16] + + movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 + pshufb xmm0, xmm4 + movdqa xmm1, xmm2 + pshufb xmm1, xmm5 + paddusw xmm0, xmm1 + pshufb xmm2, xmm6 + paddusw xmm0, xmm2 + + pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 + packuswb xmm0, xmm0 + + movd [edi], xmm0 // write 6 pixels + pextrw eax, xmm0, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +#define HAS_SCALEADDROWS_SSE2 + +// Reads 8xN bytes and produces 16 shorts at a time. 
+__declspec(naked) +static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + mov ebx, [esp + 32 + 20] // height + pxor xmm5, xmm5 + dec ebx + + xloop: + // first row + movdqa xmm2, [esi] + lea eax, [esi + edx] + movhlps xmm3, xmm2 + mov ebp, ebx + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + // sum remaining rows + yloop: + movdqa xmm0, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movhlps xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + paddusw xmm2, xmm0 // sum 16 words + paddusw xmm3, xmm1 + sub ebp, 1 + ja yloop + + movdqa [edi], xmm2 + movdqa [edi + 16], xmm3 + lea edi, [edi + 32] + lea esi, [esi + 16] + + sub ecx, 16 + ja xloop + + popad + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. +#define HAS_SCALEFILTERROWS_SSE2 +__declspec(naked) +static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + movd xmm6, eax // xmm6 = y fraction + punpcklwd xmm6, xmm6 + pshufd xmm6, xmm6, 0 + neg eax // xmm5 = 256 - y fraction + add eax, 256 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm2, xmm7 + punpckhbw xmm1, xmm7 + punpckhbw xmm3, xmm7 + pmullw xmm0, xmm5 // scale row 0 + pmullw xmm1, xmm5 + pmullw xmm2, xmm6 // scale row 1 + pmullw xmm3, xmm6 + paddusw xmm0, xmm2 // sum rows + paddusw xmm1, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. 
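+// Instead of widening to 16 bits and blending with two pmullw per register
+// as the SSE2 version above does, this variant interleaves the two rows
+// bytewise and blends with a single pmaddubsw. The fraction is halved to
+// 7 bits so the pair (128 - f, f) fits in two unsigned bytes; per pixel:
+//
+//   dst[i] = (src0[i] * (128 - f) + src1[i] * f) >> 7;  // f = fraction / 2
+//
+// For even fractions this is exactly the SSE2 result.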
+#define HAS_SCALEFILTERROWS_SSSE3 +__declspec(naked) +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + shr eax, 1 + mov ah,al + neg al + add al, 128 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + movdqa xmm1, _round34 + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _madd21 + + wloop: + movdqa xmm0, [eax] // pixels 0..7 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax+8] // pixels 8..15 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+8], xmm0 + movdqa xmm0, [eax+16] // pixels 16..23 + lea eax, [eax+32] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xmm7 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + ja wloop + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) + +// GCC versions of row functions are verbatim conversions from Visual C. 
+// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt +#define HAS_SCALEROWDOWN2_SSE2 +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +); +} + +#define HAS_SCALEROWDOWN4_SSE2 +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t temp = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea (%4,%4,2),%3 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%4,2),%%xmm2 \n" + "movdqa 0x10(%0,%4,2),%%xmm3 \n" + "movdqa (%0,%3,1),%%xmm4 \n" + "movdqa 0x10(%0,%3,1),%%xmm5 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(temp) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__x86_64__) + , "xmm6", "xmm7" +#endif +); +} + +#define HAS_SCALEROWDOWN8_SSE2 +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, 
int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlq $0x38,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +#if defined(__i386__) +void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown8Int_SSE2) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "lea (%ebx,%ebx,2),%edx \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "movdqa (%esi,%ebx,1),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,1),%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa (%esi,%ebx,2),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,2),%xmm3 \n" + "movdqa (%esi,%edx,1),%xmm4 \n" + "movdqa 0x10(%esi,%edx,1),%xmm5 \n" + "lea (%esi,%ebx,4),%ebp \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa 0x0(%ebp),%xmm2 \n" + "movdqa 0x10(%ebp),%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n" + "movdqa 0x0(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm4 \n" + "movdqa 0x10(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "psadbw %xmm7,%xmm0 \n" + "psadbw %xmm7,%xmm1 \n" + "pshufd $0xd8,%xmm0,%xmm0 \n" + "pshufd $0x8d,%xmm1,%xmm1 \n" + "por %xmm1,%xmm0 \n" + "psrlw $0x3,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "lea 0x4(%edi),%edi \n" + "sub $0x4,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +// fpic is used for magiccam plugin +#if !defined(__PIC__) +#define HAS_SCALEROWDOWN34_SSSE3 +void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown34_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf0,%xmm3 \n" + "movdqa _shuf1,%xmm4 \n" + "movdqa _shuf2,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm2 \n" + "lea 0x20(%esi),%esi \n" + "movdqa %xmm2,%xmm1 \n" + "palignr $0x8,%xmm0,%xmm1 \n" + "pshufb %xmm3,%xmm0 \n" + "pshufb %xmm4,%xmm1 \n" + "pshufb %xmm5,%xmm2 \n" + "movq %xmm0,(%edi) \n" + "movq %xmm1,0x8(%edi) \n" + "movq %xmm2,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw 
%xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + "movdqa 0x10(%esi,%ebp),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + + "popa \n" + "ret \n" +); + +void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + "movdqa 0x10(%esi,%ebp,1),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +#define HAS_SCALEROWDOWN38_SSSE3 +void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown38_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf38a ,%xmm4 \n" + "movdqa _shuf38b ,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pshufb %xmm4,%xmm0 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusb %xmm1,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movhlps %xmm0,%xmm1 \n" + "movd %xmm1,0x8(%edi) \n" + "lea 0xc(%edi),%edi \n" + "sub $0xc,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufac0,%xmm4 \n" + "movdqa _shufac3,%xmm5 \n" + "movdqa _scaleac3,%xmm6 \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "movhlps %xmm0,%xmm1 \n" + "movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm1 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw 
%xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa (%esi,%edx,2),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "pshufb %xmm4,%xmm2 \n" + "movdqa %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "pshufb %xmm5,%xmm3 \n" + "paddusw %xmm3,%xmm2 \n" + "pmulhuw %xmm6,%xmm2 \n" + "packuswb %xmm2,%xmm2 \n" + "movd %xmm2,(%edi) \n" + "pextrw $0x2,%xmm2,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufab0,%xmm4 \n" + "movdqa _shufab1,%xmm5 \n" + "movdqa _shufab2,%xmm6 \n" + "movdqa _scaleab2,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm2 \n" + "pavgb (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm2,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa %xmm2,%xmm1 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusw %xmm1,%xmm0 \n" + "pshufb %xmm6,%xmm2 \n" + "paddusw %xmm2,%xmm0 \n" + "pmulhuw %xmm7,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "pextrw $0x2,%xmm0,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); +#endif // __PIC__ + +#define HAS_SCALEADDROWS_SSE2 +void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height); + asm( + DECLARE_FUNCTION(ScaleAddRows_SSE2) + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "mov 0x34(%esp),%ebx \n" + "pxor %xmm5,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm2 \n" + "lea (%esi,%edx,1),%eax \n" + "movhlps %xmm2,%xmm3 \n" + "lea -0x1(%ebx),%ebp \n" + "punpcklbw %xmm5,%xmm2 \n" + "punpcklbw %xmm5,%xmm3 \n" + +"2:" + "movdqa (%eax),%xmm0 \n" + "lea (%eax,%edx,1),%eax \n" + "movhlps %xmm0,%xmm1 \n" + "punpcklbw %xmm5,%xmm0 \n" + "punpcklbw %xmm5,%xmm1 \n" + "paddusw %xmm0,%xmm2 \n" + "paddusw %xmm1,%xmm3 \n" + "sub $0x1,%ebp \n" + "ja 2b \n" + + "movdqa %xmm2,(%edi) \n" + "movdqa %xmm3,0x10(%edi) \n" + "lea 0x20(%edi),%edi \n" + "lea 0x10(%esi),%esi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + DECLARE_FUNCTION(ScaleFilterRows_SSE2) + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "movd %eax,%xmm6 \n" + "punpcklwd %xmm6,%xmm6 \n" + "pshufd $0x0,%xmm6,%xmm6 \n" + "neg %eax \n" + "add $0x100,%eax \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "movdqa %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpckhbw %xmm7,%xmm1 \n" + "punpckhbw %xmm7,%xmm3 \n" + "pmullw %xmm5,%xmm0 \n" + "pmullw %xmm5,%xmm1 \n" + "pmullw %xmm6,%xmm2 \n" + "pmullw %xmm6,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "psrlw $0x8,%xmm0 \n" + "psrlw $0x8,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"2:" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" + + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"3:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" + + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + DECLARE_FUNCTION(ScaleFilterRows_SSSE3) + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "shr %eax \n" + "mov %al,%ah \n" + "neg %al \n" + "add $0x80,%al \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "punpcklbw %xmm2,%xmm0 \n" + "punpckhbw %xmm2,%xmm1 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "pmaddubsw %xmm5,%xmm1 \n" + "psrlw $0x7,%xmm0 \n" + "psrlw $0x7,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"2:" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"3:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" +); + +#elif defined(__x86_64__) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "lea (%3,%3,2),%%r10 \n" + "pxor %%xmm7,%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "movdqa 0x10(%0,%3,2),%%xmm3 \n" + "movdqa (%0,%%r10,1),%%xmm4 \n" + "movdqa 0x10(%0,%%r10,1),%%xmm5 \n" + "lea (%0,%3,4),%%r11 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa 0x0(%%r11),%%xmm2 \n" + "movdqa 0x10(%%r11),%%xmm3 \n" + "movdqa 0x0(%%r11,%3,1),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,1),%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "movdqa 0x0(%%r11,%3,2),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,2),%%xmm5 \n" + "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm4 \n" + "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psadbw %%xmm7,%%xmm0 \n" + "psadbw %%xmm7,%%xmm1 \n" + "pshufd $0xd8,%%xmm0,%%xmm0 \n" + "pshufd $0x8d,%%xmm1,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "psrlw $0x3,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "r10", "r11", "xmm6", "xmm7" +); +} + +#define HAS_SCALEROWDOWN34_SSSE3 +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%3),%%xmm3 \n" + "movdqa (%4),%%xmm4 \n" + "movdqa 
(%5),%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf0), // %3 + "r"(_shuf1), // %4 + "r"(_shuf2) // %5 + : "memory", "cc" +); +} + +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "cc", "xmm6", "xmm7", "xmm8" +); +} + +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3,1),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + 
"r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "cc", "xmm6", "xmm7", "xmm8" +); +} + +#define HAS_SCALEROWDOWN38_SSSE3 +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%3),%%xmm4 \n" + "movdqa (%4),%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf38a), // %3 + "r"(_shuf38b) // %4 + : "memory", "cc" +); +} + +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + "movdqa (%6),%%xmm6 \n" + "pxor %%xmm7,%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm1 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "pshufb %%xmm5,%%xmm3 \n" + "paddusw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,(%1) \n" + "pextrw $0x2,%%xmm2,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shufac0), // %4 + "r"(_shufac3), // %5 + "r"(_scaleac3) // %6 + : "memory", "cc", "rax", "xmm6", "xmm7" +); +} + +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + "movdqa (%6),%%xmm6 \n" + "movdqa (%7),%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm2 \n" + "pavgb (%0,%3,1),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusw %%xmm1,%%xmm0 \n" + "pshufb %%xmm6,%%xmm2 \n" + "paddusw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "pextrw $0x2,%%xmm0,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shufab0), // %4 + "r"(_shufab1), // %5 + "r"(_shufab2), // %6 + "r"(_scaleab2) // %7 + : "memory", "cc", "rax", "xmm6", "xmm7" +); +} + +#define HAS_SCALEADDROWS_SSE2 +static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" 
+"1:" + "movdqa (%0),%%xmm2 \n" + "lea (%0,%4,1),%%r10 \n" + "movhlps %%xmm2,%%xmm3 \n" + "lea -0x1(%3),%%r11 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + +"2:" + "movdqa (%%r10),%%xmm0 \n" + "lea (%%r10,%4,1),%%r10 \n" + "movhlps %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "paddusw %%xmm0,%%xmm2 \n" + "paddusw %%xmm1,%%xmm3 \n" + "sub $0x1,%%r11 \n" + "ja 2b \n" + + "movdqa %%xmm2,(%1) \n" + "movdqa %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width), // %2 + "+r"(src_height) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "r10", "r11" +); +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +static void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile ( + "1:" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "rax" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile ( + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "rax" + ); + return; + } else { + asm volatile ( + "mov %3,%%eax \n" + "movd %%eax,%%xmm6 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "neg %%eax \n" + "add $0x100,%%eax \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm7,%%xmm7 \n" + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "punpckhbw %%xmm7,%%xmm3 \n" + "pmullw %%xmm5,%%xmm0 \n" + "pmullw %%xmm5,%%xmm1 \n" + "pmullw %%xmm6,%%xmm2 \n" + "pmullw %%xmm6,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "rax", "xmm6", "xmm7" + ); + } + return; +} + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile ( + "1:" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "rax" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile ( + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "rax" + ); + return; + } else { + asm volatile ( + "mov %3,%%eax \n" + "shr %%eax \n" + "mov %%al,%%ah \n" + "neg %%al \n" + "add $0x80,%%al \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "rax" + ); + } + return; +} +#endif +#endif + +// CPU agnostic row functions +static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 2; + } +} + +static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + + src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; + src_ptr += 2; + } +} + +static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 4; + } +} + +static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + + src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + + src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + + src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + + 8) >> 4; + src_ptr += 4; + } +} + +// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. +// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. +// The following 2 lines cause error on Windows. 
+//static const int kMaxOutputWidth = 640; +//static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2; +#define kMaxOutputWidth 640 +#define kMaxRow12 1280 + +static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 8; + } +} + +// Note calling code checks width is less than max and if not +// uses ScaleRowDown8_C instead. +static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + ALIGN16(uint8 src_row[kMaxRow12 * 2]); + assert(dst_width <= kMaxOutputWidth); + ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); + ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, + src_row + kMaxOutputWidth, + dst_width * 2); + ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); +} + +static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + uint8* dend; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = dst + dst_width; + do { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } while (dst < dend); +} + +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + uint8* dend; + const uint8* s; + const uint8* t; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = d + dst_width; + s = src_ptr; + t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + uint8* dend; + const uint8* s; + const uint8* t; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = d + dst_width; + s = src_ptr; + t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +#if defined(HAS_SCALEFILTERROWS_SSE2) +// Filter row to 3/4 +static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + uint8* dend; + const uint8* s; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = dst_ptr + dst_width; + s = src_ptr; + do { + dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; + dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; + dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; + dst_ptr += 3; + s += 4; + } while (dst_ptr < dend); +} +#endif + +static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx) { + int x = 0; + int j; + for (j = 0; j < dst_width; ++j) { + int xi = x >> 16; + int xf1 = x & 0xffff; + int xf0 = 65536 - xf1; + + *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; + x += dx; + } +} + +//Not work on Windows +//static const int kMaxInputWidth = 2560; +#define 
kMaxInputWidth 2560 +#if defined(HAS_SCALEFILTERROWS_SSE2) +#define HAS_SCALEROWDOWN34_SSE2 +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + ALIGN16(uint8 row[kMaxInputWidth]); + assert((dst_width % 3 == 0) && (dst_width > 0)); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + ALIGN16(uint8 row[kMaxInputWidth]); + assert((dst_width % 3 == 0) && (dst_width > 0)); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} +#endif + +static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + + src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + + src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// C version 8x2 -> 8x1 +static void ScaleFilterRows_C(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction; + int y0_fraction; + const uint8* src_ptr1; + uint8* end; + assert(dst_width > 0); + y1_fraction = source_y_fraction; + y0_fraction = 256 - y1_fraction; + src_ptr1 = src_ptr + src_stride; + end = dst_ptr + dst_width; + do { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; + dst_ptr[5] 
= (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; + dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; + dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; + src_ptr += 8; + src_ptr1 += 8; + dst_ptr += 8; + } while (dst_ptr < end); + dst_ptr[0] = dst_ptr[-1]; +} + +void ScaleAddRows_C(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int x,y; + assert(src_width > 0); + assert(src_height > 0); + for (x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + int sum = 0; + for (y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + dst_ptr[x] = sum; + } +} + +/** + * Scale plane, 1/2 + * + * This is an optimized version for scaling down a plane to 1/2 of + * its original size. + * + */ +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 16) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; + } else +#endif + { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 1); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane, 1/4 + * + * This is an optimized version for scaling down a plane to 1/4 of + * its original size. + */ +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 4)); + assert(IS_ALIGNED(src_height, 4)); + +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 4)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; + } else +#endif + { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 2); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane, 1/8 + * + * This is an optimized version for scaling down a plane to 1/8 + * of its original size. 
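+ *
+ * Without filtering, every 8th pixel of every 8th row is sampled. With
+ * filtering, each output pixel is effectively the average of an 8x8 block
+ * of source pixels, computed as cascaded 4x4 and 2x2 averages
+ * (see ScaleRowDown8Int_C above).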
+ * + */ +static void ScalePlaneDown8(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 8)); + assert(IS_ALIGNED(src_height, 8)); + +#if defined(HAS_SCALEROWDOWN8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; + } else +#endif + { + ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? + ScaleRowDown8Int_C : ScaleRowDown8_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 3); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane down, 3/4 + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(dst_width % 3 == 0); +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; + } + } else +#endif + +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; + } + } else +#endif +#if defined(HAS_SCALEROWDOWN34_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_stride, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && + filtering) { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; + } else +#endif + { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; + } + } + { + int src_row = 0; + int y; + for (y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 1: + ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 2: + ScaleRowDown34_0(src_ptr + src_stride, -src_stride, + dst_ptr, dst_width); + break; + } + ++src_row; + src_ptr += src_stride; + dst_ptr += dst_stride; + if (src_row >= 3) { + src_ptr += src_stride; + src_row = 0; + } + } +} +} + +/** + * Scale plane, 3/8 + * + * This is an optimized version for scaling down a plane to 3/8 + * of its original size. 
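+ *
+ * Rows follow a repeating 3+3+2 pattern: two of every three output rows
+ * sum three source rows (ScaleRowDown38_3) and the third sums two
+ * (ScaleRowDown38_2), so 8 source rows produce 3 destination rows.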
+ * + * Reduces 16x3 to 6x1 + */ +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(dst_width % 3 == 0); +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; + } + } else +#endif + +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_stride, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; + } + } else +#endif + { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; + } + } + { + int src_row = 0; + int y; + for (y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + case 1: + ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + ++src_row; + break; + + case 2: + ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + src_row = 0; + break; + } + dst_ptr += dst_stride; + } +} +} + +__inline static uint32 SumBox(int iboxwidth, int iboxheight, + int src_stride, const uint8* src_ptr) { + int x, y; + uint32 sum; + assert(iboxwidth > 0); + assert(iboxheight > 0); + sum = 0u; + for (y = 0; y < iboxheight; ++y) { + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + src_ptr += src_stride; + } + return sum; +} + +static void ScalePlaneBoxRow(int dst_width, int boxheight, + int dx, int src_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + int boxwidth; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / + (boxwidth * boxheight); + } +} + +__inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { + uint32 sum; + int x; + assert(iboxwidth > 0); + sum = 0u; + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int scaletbl[2]; + int minboxwidth = (dx >> 16); + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + { + int *scaleptr = scaletbl - minboxwidth; + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + int boxwidth; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } + } +} + +static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + 
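// Sum one boxwidth-wide run of 16-bit column sums, then scale the total
+    // back to 8 bits with the precomputed 16.16 reciprocal of the box area.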
+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+    x += boxwidth;
+  }
+}
+
+/**
+ * Scale plane down to any dimensions, with interpolation
+ * (box filter).
+ *
+ * Uses the same 16.16 fixed-point stepping as ScalePlaneSimple,
+ * but instead of sampling a single source pixel per output pixel
+ * it averages the whole box of source pixels that each step spans.
+ */
+static void ScalePlaneBox(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr) {
+  int dx, dy;
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  dy = (src_height << 16) / dst_height;
+  dx = (src_width << 16) / dst_width;
+  if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
+      dst_height * 2 > src_height) {
+    uint8* dst = dst_ptr;
+    int y = 0;
+    int j;
+    for (j = 0; j < dst_height; ++j) {
+      int iy = y >> 16;
+      const uint8* const src = src_ptr + iy * src_stride;
+      int boxheight;
+      y += dy;
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
+      }
+      boxheight = (y >> 16) - iy;
+      ScalePlaneBoxRow(dst_width, boxheight,
+                       dx, src_stride,
+                       src, dst);
+
+      dst += dst_stride;
+    }
+  } else {
+    ALIGN16(uint16 row[kMaxInputWidth]);
+    void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
+                         uint16* dst_ptr, int src_width, int src_height);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
+                         const uint16* src_ptr, uint8* dst_ptr);
+#if defined(HAS_SCALEADDROWS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) &&
+        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
+        IS_ALIGNED(src_width, 16)) {
+      ScaleAddRows = ScaleAddRows_SSE2;
+    } else
+#endif
+    {
+      ScaleAddRows = ScaleAddRows_C;
+    }
+    if (dx & 0xffff) {
+      ScaleAddCols = ScaleAddCols2_C;
+    } else {
+      ScaleAddCols = ScaleAddCols1_C;
+    }
+
+    {
+      int y = 0;
+      int j;
+      for (j = 0; j < dst_height; ++j) {
+        int iy = y >> 16;
+        const uint8* const src = src_ptr + iy * src_stride;
+        int boxheight;
+        y += dy;
+        if (y > (src_height << 16)) {
+          y = (src_height << 16);
+        }
+        boxheight = (y >> 16) - iy;
+        ScaleAddRows(src, src_stride, row, src_width, boxheight);
+        ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
+        dst_ptr += dst_stride;
+      }
+    }
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions, with interpolation.
+ */
+static void ScalePlaneBilinearSimple(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride, int dst_stride,
+                                     const uint8* src_ptr, uint8* dst_ptr) {
+  int i, j;
+  uint8* dst = dst_ptr;
+  int dx = (src_width << 16) / dst_width;
+  int dy = (src_height << 16) / dst_height;
+  int maxx = ((src_width - 1) << 16) - 1;
+  int maxy = ((src_height - 1) << 16) - 1;
+  int y = (dst_height < src_height) ? 32768 :
+      (src_height << 16) / dst_height - 32768;
+  for (i = 0; i < dst_height; ++i) {
+    int cy = (y < 0) ? 0 : y;
+    int yi = cy >> 16;
+    int yf = cy & 0xffff;
+    const uint8* const src = src_ptr + yi * src_stride;
+    int x = (dst_width < src_width) ? 32768 :
+        (src_width << 16) / dst_width - 32768;
+    for (j = 0; j < dst_width; ++j) {
+      int cx = (x < 0) ?
0 : x; + int xi = cx >> 16; + int xf = cx & 0xffff; + int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; + int r1 = (src[xi + src_stride] * (65536 - xf) + + src[xi + src_stride + 1] * xf) >> 16; + *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; + x += dx; + if (x > maxx) + x = maxx; + } + dst += dst_stride - dst_width; + y += dy; + if (y > maxy) + y = maxy; + } +} + +/** + * Scale plane to/from any dimensions, with bilinear + * interpolation. + */ +static void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int dy; + int dx; + assert(dst_width > 0); + assert(dst_height > 0); + dy = (src_height << 16) / dst_height; + dx = (src_width << 16) / dst_width; + if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { + ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + + } else { + ALIGN16(uint8 row[kMaxInputWidth + 1]); + void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, + int dst_width, int source_y_fraction); + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx); +#if defined(HAS_SCALEFILTERROWS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleFilterRows = ScaleFilterRows_SSSE3; + } else +#endif +#if defined(HAS_SCALEFILTERROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleFilterRows = ScaleFilterRows_SSE2; + } else +#endif + { + ScaleFilterRows = ScaleFilterRows_C; + } + ScaleFilterCols = ScaleFilterCols_C; + + { + int y = 0; + int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. + int j; + for (j = 0; j < dst_height; ++j) { + int iy = y >> 16; + int fy = (y >> 8) & 255; + const uint8* const src = src_ptr + iy * src_stride; + ScaleFilterRows(row, src, src_stride, src_width, fy); + ScaleFilterCols(dst_ptr, row, dst_width, dx); + dst_ptr += dst_stride; + y += dy; + if (y > maxy) { + y = maxy; + } + } + } +} +} + +/** + * Scale plane to/from any dimensions, without interpolation. + * Fixed point math is used for performance: The upper 16 bits + * of x and dx is the integer part of the source position and + * the lower 16 bits are the fixed decimal part. + */ +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + uint8* dst = dst_ptr; + int dx = (src_width << 16) / dst_width; + int y; + for (y = 0; y < dst_height; ++y) { + const uint8* const src = src_ptr + (y * src_height / dst_height) * + src_stride; + // TODO(fbarchard): Round X coordinate by setting x=0x8000. + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + *dst++ = src[x >> 16]; + x += dx; + } + dst += dst_stride - dst_width; + } +} + +/** + * Scale plane to/from any dimensions. 
+ */ +static void ScalePlaneAnySize(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + // fall back to non-optimized version + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Scale plane down, any size + * + * This is an optimized version for scaling down a plane to any size. + * The current implementation is ~10 times faster compared to the + * reference implementation for e.g. XGA->LowResPAL + * + */ +static void ScalePlaneDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { + // between 1/2x and 1x use bilinear + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Copy plane, no scaling + * + * This simply copies the given plane without scaling. + * The current implementation is ~115 times faster + * compared to the reference implementation. + * + */ +static void CopyPlane(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + if (src_stride == src_width && dst_stride == dst_width) { + // All contiguous, so can use REALLY fast path. + memcpy(dst_ptr, src_ptr, src_width * src_height); + } else { + // Not all contiguous; must copy scanlines individually + const uint8* src = src_ptr; + uint8* dst = dst_ptr; + int i; + for (i = 0; i < src_height; ++i) { + memcpy(dst, src, src_width); + dst += dst_stride; + src += src_stride; + } + } +} + +static void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + FilterMode filtering, int use_ref) { + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + } else if (dst_width <= src_width && dst_height <= src_height) { + // Scale down. + if (use_ref) { + // For testing, allow the optimized versions to be disabled. + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + // 3/8 rounded up for odd sized chroma height. 
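/* The rounding in the test that follows: (src_height * 3 + 7) / 8 is the
 * integer form of ceil(src_height * 3 / 8). For example, src_height = 33
 * gives (99 + 7) / 8 = 13 = ceil(12.375), so a 3/8 scale still matches
 * when the chroma plane has an odd source height. */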
+ } else if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { + // optimized, 1/8 + ScalePlaneDown8(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else { + // Arbitrary downsample + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } + } else { + // Arbitrary scale up and/or down. + ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } +} + +/** + * Scale a plane. + * + * This function in turn calls a scaling function + * suitable for handling the desired resolutions. + * + */ + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. + if (src_height < 0) { + int halfheight; + src_height = -src_height; + halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + { + int src_halfwidth = (src_width + 1) >> 1; + int src_halfheight = (src_height + 1) >> 1; + int dst_halfwidth = (dst_width + 1) >> 1; + int dst_halfheight = (dst_height + 1) >> 1; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering, use_reference_impl_); + } + return 0; +} + +// Deprecated api +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + int interpolate) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. 
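/* Sketch of the idiom used below: a bottom-up image is handled by
 * repointing each plane at its last row and negating the stride, so the
 * rest of the code keeps walking "down" in memory:
 *
 *   src_y += (src_height - 1) * src_stride_y;
 *   src_stride_y = -src_stride_y;
 *
 * Chroma planes span (height + 1) / 2 rows, hence the halfheight term. */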
+ if (src_height < 0) { + int halfheight; + src_height = -src_height; + halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + { + int src_halfwidth = (src_width + 1) >> 1; + int src_halfheight = (src_height + 1) >> 1; + int dst_halfwidth = (dst_width + 1) >> 1; + int dst_halfheight = (dst_height + 1) >> 1; + FilterMode filtering = interpolate ? kFilterBox : kFilterNone; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering, use_reference_impl_); + } + return 0; +} + +// Deprecated api +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + int interpolate) { + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 || + dst_yoffset >= dst_height) { + return -1; + } + dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2. + { + int src_halfwidth = (src_width + 1) >> 1; + int src_halfheight = (src_height + 1) >> 1; + int dst_halfwidth = (dst_width + 1) >> 1; + int dst_halfheight = (dst_height + 1) >> 1; + int aheight = dst_height - dst_yoffset * 2; // actual output height + const uint8* const src_y = src; + const uint8* const src_u = src + src_width * src_height; + const uint8* const src_v = src + src_width * src_height + + src_halfwidth * src_halfheight; + uint8* dst_y = dst + dst_yoffset * dst_width; + uint8* dst_u = dst + dst_width * dst_height + + (dst_yoffset >> 1) * dst_halfwidth; + uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + + (dst_yoffset >> 1) * dst_halfwidth; + return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth, + src_width, src_height, dst_y, dst_u, dst_v, dst_width, + dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate); + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif @@ -82,6 +82,7 @@ The available initialization methods are: \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif + \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif diff --git a/usage_cx.dox b/usage_cx.dox index 980a03461..62f3e450b 100644 --- a/usage_cx.dox +++ b/usage_cx.dox @@ -1,6 +1,6 @@ /*! \page usage_encode Encode - The vpx_codec_encode() function is at the core of the decode loop. It + The vpx_codec_encode() function is at the core of the encode loop. It processes raw images passed by the application, producing packets of compressed data. The <code>deadline</code> parameter controls the amount of time in microseconds the encoder should spend working on the frame. 
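A minimal encode-loop sketch consistent with that description (the codec context `codec`, input image `img`, frame counter `frame_cnt`, and output file `outfile` are assumed to have been set up elsewhere; error handling is elided):

    vpx_codec_err_t res = vpx_codec_encode(&codec, img, frame_cnt, 1, 0,
                                           VPX_DL_GOOD_QUALITY);
    if (res == VPX_CODEC_OK) {
        vpx_codec_iter_t iter = NULL;
        const vpx_codec_cx_pkt_t *pkt;
        /* Drain all compressed-data packets produced for this frame. */
        while ((pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
            if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
                fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile);
        }
    }

Passing VPX_DL_REALTIME or VPX_DL_BEST_QUALITY instead adjusts that per-frame time budget toward speed or quality respectively.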
For @@ -10,5 +10,4 @@ \ref samples - */ diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index b5f194d3d..89a2be825 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -11,7 +11,6 @@ #include "vpx_config.h" #include "vpx_ports/arm.h" -#include "vp8/common/g_common.h" #include "vp8/common/pragmas.h" #include "vp8/common/subpixel.h" #include "vp8/common/loopfilter.h" @@ -46,7 +45,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6; rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6; rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6; @@ -64,6 +62,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6; rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6; rtcd->recon.intra4x4_predict = vp8_intra4x4_predict_armv6; + + rtcd->dequant.block = vp8_dequantize_b_v6; + rtcd->dequant.idct_add = vp8_dequant_idct_add_v6; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6; + rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6; + } #endif @@ -80,7 +84,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon; rtcd->idct.idct16 = vp8_short_idct4x4llm_neon; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon; rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon; @@ -99,6 +102,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) vp8_build_intra_predictors_mby_neon; rtcd->recon.build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_neon; + + rtcd->dequant.block = vp8_dequantize_b_neon; + rtcd->dequant.idct_add = vp8_dequant_idct_add_neon; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon; + rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon; + } #endif diff --git a/vp8/decoder/arm/armv6/dequant_idct_v6.asm b/vp8/common/arm/armv6/dequant_idct_v6.asm index 2510ad838..2510ad838 100644 --- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm +++ b/vp8/common/arm/armv6/dequant_idct_v6.asm diff --git a/vp8/decoder/arm/armv6/dequantize_v6.asm b/vp8/common/arm/armv6/dequantize_v6.asm index 72f7e0ee5..72f7e0ee5 100644 --- a/vp8/decoder/arm/armv6/dequantize_v6.asm +++ b/vp8/common/arm/armv6/dequantize_v6.asm diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/common/arm/armv6/idct_blk_v6.c index 686bb737f..9108929f5 100644 --- a/vp8/decoder/arm/armv6/idct_blk_v6.c +++ b/vp8/common/arm/armv6/idct_blk_v6.c @@ -10,50 +10,9 @@ #include "vpx_config.h" #include "vp8/common/idct.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" -void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq, - unsigned char *dst, int stride, - char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (eobs[0] > 1) - vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]); - else if (eobs[0] == 1) - vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride); - - if (eobs[1] > 1) - { - vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]); - } - else if (eobs[1] == 1) - vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride); - - if (eobs[2] > 1) - { - vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]); - } - else if (eobs[2] == 1) - vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride); - - if (eobs[3] > 1) - { - vp8_dequant_dc_idct_add_v6 
(q+48, dq, dst+12, stride, dc[3]); - } - else if (eobs[3] == 1) - vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride); - - q += 64; - dc += 4; - dst += 4*stride; - eobs += 4; - } -} - void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm index 463bff0f5..31ef09cad 100644 --- a/vp8/common/arm/armv6/iwalsh_v6.asm +++ b/vp8/common/arm/armv6/iwalsh_v6.asm @@ -9,7 +9,6 @@ ; EXPORT |vp8_short_inv_walsh4x4_v6| - EXPORT |vp8_short_inv_walsh4x4_1_v6| ARM REQUIRE8 @@ -17,19 +16,19 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_v6(short *input, short *output) +;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff) |vp8_short_inv_walsh4x4_v6| PROC - stmdb sp!, {r4 - r11, lr} + stmdb sp!, {r4 - r12, lr} - ldr r2, [r0], #4 ; [1 | 0] - ldr r3, [r0], #4 ; [3 | 2] - ldr r4, [r0], #4 ; [5 | 4] - ldr r5, [r0], #4 ; [7 | 6] - ldr r6, [r0], #4 ; [9 | 8] - ldr r7, [r0], #4 ; [11 | 10] - ldr r8, [r0], #4 ; [13 | 12] - ldr r9, [r0] ; [15 | 14] + ldr r2, [r0, #0] ; [1 | 0] + ldr r3, [r0, #4] ; [3 | 2] + ldr r4, [r0, #8] ; [5 | 4] + ldr r5, [r0, #12] ; [7 | 6] + ldr r6, [r0, #16] ; [9 | 8] + ldr r7, [r0, #20] ; [11 | 10] + ldr r8, [r0, #24] ; [13 | 12] + ldr r9, [r0, #28] ; [15 | 14] qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] @@ -69,24 +68,27 @@ qadd16 r4, r4, r10 ; [b2+3|c2+3] qadd16 r5, r5, r10 ; [a2+3|d2+3] - asr r12, r2, #3 ; [1 | x] - pkhtb r12, r12, r3, asr #19; [1 | 0] - lsl lr, r3, #16 ; [~3 | x] - lsl r2, r2, #16 ; [~2 | x] - asr lr, lr, #3 ; [3 | x] - pkhtb lr, lr, r2, asr #19 ; [3 | 2] - - asr r2, r4, #3 ; [5 | x] - pkhtb r2, r2, r5, asr #19 ; [5 | 4] - lsl r3, r5, #16 ; [~7 | x] - lsl r4, r4, #16 ; [~6 | x] - asr r3, r3, #3 ; [7 | x] - pkhtb r3, r3, r4, asr #19 ; [7 | 6] - - str r12, [r1], #4 - str lr, [r1], #4 - str r2, [r1], #4 - str r3, [r1], #4 + asr r12, r3, #19 ; [0] + strh r12, [r1], #32 + asr lr, r2, #19 ; [1] + strh lr, [r1], #32 + sxth r2, r2 + sxth r3, r3 + asr r2, r2, #3 ; [2] + strh r2, [r1], #32 + asr r3, r3, #3 ; [3] + strh r3, [r1], #32 + + asr r12, r5, #19 ; [4] + strh r12, [r1], #32 + asr lr, r4, #19 ; [5] + strh lr, [r1], #32 + sxth r4, r4 + sxth r5, r5 + asr r4, r4, #3 ; [6] + strh r4, [r1], #32 + asr r5, r5, #3 ; [7] + strh r5, [r1], #32 qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] @@ -103,50 +105,32 @@ qadd16 r8, r8, r10 ; [b2+3|c2+3] qadd16 r9, r9, r10 ; [a2+3|d2+3] - asr r2, r6, #3 ; [9 | x] - pkhtb r2, r2, r7, asr #19 ; [9 | 8] - lsl r3, r7, #16 ; [~11| x] - lsl r4, r6, #16 ; [~10| x] - asr r3, r3, #3 ; [11 | x] - pkhtb r3, r3, r4, asr #19 ; [11 | 10] - - asr r4, r8, #3 ; [13 | x] - pkhtb r4, r4, r9, asr #19 ; [13 | 12] - lsl r5, r9, #16 ; [~15| x] - lsl r6, r8, #16 ; [~14| x] - asr r5, r5, #3 ; [15 | x] - pkhtb r5, r5, r6, asr #19 ; [15 | 14] - - str r2, [r1], #4 - str r3, [r1], #4 - str r4, [r1], #4 - str r5, [r1] - - ldmia sp!, {r4 - r11, pc} + asr r12, r7, #19 ; [8] + strh r12, [r1], #32 + asr lr, r6, #19 ; [9] + strh lr, [r1], #32 + sxth r6, r6 + sxth r7, r7 + asr r6, r6, #3 ; [10] + strh r6, [r1], #32 + asr r7, r7, #3 ; [11] + strh r7, [r1], #32 + + asr r12, r9, #19 ; [12] + strh r12, [r1], #32 + asr lr, r8, #19 ; [13] + strh lr, [r1], #32 + sxth r8, r8 + sxth r9, r9 + asr r8, r8, #3 ; [14] + strh r8, [r1], #32 + asr r9, r9, #3 ; [15] + strh r9, [r1], #32 + + ldmia sp!, {r4 - r12, pc} ENDP ; 
|vp8_short_inv_walsh4x4_v6| -;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output) -|vp8_short_inv_walsh4x4_1_v6| PROC - - ldrsh r2, [r0] ; [0] - add r2, r2, #3 ; [0] + 3 - asr r2, r2, #3 ; a1 ([0]+3) >> 3 - lsl r2, r2, #16 ; [a1 | x] - orr r2, r2, r2, lsr #16 ; [a1 | a1] - - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_v6| - ; Constant Pool c0x00030003 DCD 0x00030003 END diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/common/arm/dequantize_arm.c index 2918e0512..20a8ac4fc 100644 --- a/vp8/decoder/arm/dequantize_arm.c +++ b/vp8/common/arm/dequantize_arm.c @@ -10,9 +10,8 @@ #include "vpx_config.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" #include "vp8/common/idct.h" -#include "vpx_mem/vpx_mem.h" #if HAVE_ARMV7 extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ); diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/common/arm/dequantize_arm.h index c020c8530..0b4d8fe89 100644 --- a/vp8/decoder/arm/dequantize_arm.h +++ b/vp8/common/arm/dequantize_arm.h @@ -15,8 +15,6 @@ #if HAVE_ARMV6 extern prototype_dequant_block(vp8_dequantize_b_v6); extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6); -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6); -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); @@ -24,19 +22,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_v6 -#undef vp8_dequant_idct_add +#undef vp8_dequant_idct_add #define vp8_dequant_idct_add vp8_dequant_idct_add_v6 -#undef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6 - -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6 - -#undef vp8_dequant_idct_add_y_block +#undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6 -#undef vp8_dequant_idct_add_uv_block +#undef vp8_dequant_idct_add_uv_block #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6 #endif #endif @@ -44,8 +36,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); #if HAVE_ARMV7 extern prototype_dequant_block(vp8_dequantize_b_neon); extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon); -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon); -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); @@ -54,19 +44,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_neon -#undef vp8_dequant_idct_add +#undef vp8_dequant_idct_add #define vp8_dequant_idct_add vp8_dequant_idct_add_neon -#undef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon - -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon - -#undef vp8_dequant_idct_add_y_block +#undef vp8_dequant_idct_add_y_block #define 
vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon -#undef vp8_dequant_idct_add_uv_block +#undef vp8_dequant_idct_add_uv_block #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon #endif diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h index c710c2eb0..68c0cad11 100644 --- a/vp8/common/arm/idct_arm.h +++ b/vp8/common/arm/idct_arm.h @@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6); #undef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6 -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6 - #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6 #endif @@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon); #undef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon - #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon #endif diff --git a/vp8/decoder/arm/neon/dequant_idct_neon.asm b/vp8/common/arm/neon/dequant_idct_neon.asm index 602cce676..602cce676 100644 --- a/vp8/decoder/arm/neon/dequant_idct_neon.asm +++ b/vp8/common/arm/neon/dequant_idct_neon.asm diff --git a/vp8/decoder/arm/neon/dequantizeb_neon.asm b/vp8/common/arm/neon/dequantizeb_neon.asm index c8e0c31f2..c8e0c31f2 100644 --- a/vp8/decoder/arm/neon/dequantizeb_neon.asm +++ b/vp8/common/arm/neon/dequantizeb_neon.asm diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/common/arm/neon/idct_blk_neon.c index 086293114..cc55843d5 100644 --- a/vp8/decoder/arm/neon/idct_blk_neon.c +++ b/vp8/common/arm/neon/idct_blk_neon.c @@ -10,51 +10,16 @@ #include "vpx_config.h" #include "vp8/common/idct.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" /* place these declarations here because we don't want to maintain them * outside of this scope */ -void idct_dequant_dc_full_2x_neon(short *input, short *dq, - unsigned char *dst, - int stride, short *dc); -void idct_dequant_dc_0_2x_neon(short *input, short *dq, - unsigned char *dst, - int stride, short *dc); void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *dst, int stride); void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *dst, int stride); -void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq, - unsigned char *dst, - int stride, char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (((short *)(eobs))[0]) - { - if (((short *)eobs)[0] & 0xfefe) - idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc); - else - idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc); - } - - if (((short *)(eobs))[1]) - { - if (((short *)eobs)[1] & 0xfefe) - idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2); - else - idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2); - } - q += 64; - dc += 4; - dst += 4*stride; - eobs += 4; - } -} void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, diff --git a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm index 6c29c5586..6c29c5586 100644 --- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm +++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm diff --git a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm index d5dce63f6..d5dce63f6 100644 --- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm +++ 
b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm index 01c79d937..e8ea2a619 100644 --- a/vp8/common/arm/neon/iwalsh_neon.asm +++ b/vp8/common/arm/neon/iwalsh_neon.asm @@ -8,7 +8,6 @@ ; be found in the AUTHORS file in the root of the source tree. ; EXPORT |vp8_short_inv_walsh4x4_neon| - EXPORT |vp8_short_inv_walsh4x4_1_neon| ARM REQUIRE8 @@ -16,7 +15,7 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_neon(short *input, short *output) +;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff) |vp8_short_inv_walsh4x4_neon| PROC ; read in all four lines of values: d0->d3 @@ -59,22 +58,30 @@ vshr.s16 q0, q0, #3 ;e/f >> 3 vshr.s16 q1, q1, #3 ;g/h >> 3 - vst4.i16 {d0,d1,d2,d3}, [r1@128] + mov r2, #64 + add r3, r1, #32 - bx lr - ENDP ; |vp8_short_inv_walsh4x4_neon| + vst1.i16 d0[0], [r1],r2 + vst1.i16 d1[0], [r3],r2 + vst1.i16 d2[0], [r1],r2 + vst1.i16 d3[0], [r3],r2 + + vst1.i16 d0[1], [r1],r2 + vst1.i16 d1[1], [r3],r2 + vst1.i16 d2[1], [r1],r2 + vst1.i16 d3[1], [r3],r2 + vst1.i16 d0[2], [r1],r2 + vst1.i16 d1[2], [r3],r2 + vst1.i16 d2[2], [r1],r2 + vst1.i16 d3[2], [r3],r2 + + vst1.i16 d0[3], [r1],r2 + vst1.i16 d1[3], [r3],r2 + vst1.i16 d2[3], [r1] + vst1.i16 d3[3], [r3] -;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) -|vp8_short_inv_walsh4x4_1_neon| PROC - ldrsh r2, [r0] ; load input[0] - add r3, r2, #3 ; add 3 - add r2, r1, #16 ; base for last 8 output - asr r0, r3, #3 ; right shift 3 - vdup.16 q0, r0 ; load and duplicate - vst1.16 {q0}, [r1@128] ; write back 8 - vst1.16 {q0}, [r2@128] ; write back last 8 bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_neon| + ENDP ; |vp8_short_inv_walsh4x4_neon| END diff --git a/vp8/common/bigend.h b/vp8/common/bigend.h deleted file mode 100644 index 6ac3f8b5a..000000000 --- a/vp8/common/bigend.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef _bigend_h -#define _bigend_h - -#if defined(__cplusplus) -extern "C" { -#endif - -#define invert2(x) ( (((x)>>8)&0x00ff) | (((x)<<8)&0xff00) ) -#define invert4(x) ( ((invert2(x)&0x0000ffff)<<16) | (invert2((x>>16))&0x0000ffff) ) - -#define high_byte(x) (unsigned char)x -#define mid2Byte(x) (unsigned char)(x >> 8) -#define mid1Byte(x) (unsigned char)(x >> 16) -#define low_byte(x) (unsigned char)(x >> 24) - -#define SWAPENDS 1 - -#if defined(__cplusplus) -} -#endif -#endif diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index a90c1c0b6..99b731c78 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -21,9 +21,6 @@ void vpx_log(const char *format, ...); #include "subpixel.h" #include "vpx_ports/mem.h" -#define TRUE 1 -#define FALSE 0 - /*#define DCPRED 1*/ #define DCPREDSIMTHRESH 0 #define DCPREDCNTTHRESH 3 @@ -170,6 +167,18 @@ typedef struct union b_mode_info bmi[16]; } MODE_INFO; +#if CONFIG_MULTI_RES_ENCODING +/* The information needed to be stored for higher-resolution encoder */ +typedef struct +{ + MB_PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame; + int_mv mv; + //union b_mode_info bmi[16]; + int dissim; // dissimilarity level of the macroblock +} LOWER_RES_INFO; +#endif + typedef struct { short *qcoeff; diff --git a/vp8/common/common.h b/vp8/common/common.h index 9a93da991..2cc1c544c 100644 --- a/vp8/common/common.h +++ b/vp8/common/common.h @@ -18,8 +18,6 @@ #include "vpx_mem/vpx_mem.h" -#include "common_types.h" - /* Only need this for fixed-size arrays, for structs just assign. */ #define vp8_copy( Dest, Src) { \ diff --git a/vp8/decoder/dequantize.c b/vp8/common/dequantize.c index 0861965eb..4a48a3192 100644 --- a/vp8/decoder/dequantize.c +++ b/vp8/common/dequantize.c @@ -42,22 +42,3 @@ void vp8_dequant_idct_add_c(short *input, short *dq, vpx_memset(input, 0, 32); } - -void vp8_dequant_dc_idct_add_c(short *input, short *dq, - unsigned char *dest, int stride, - int Dc) -{ - int i; - - input[0] = (short)Dc; - - for (i = 1; i < 16; i++) - { - input[i] = dq[i] * input[i]; - } - - vp8_short_idct4x4llm_c(input, dest, stride, dest, stride); - - vpx_memset(input, 0, 32); - -} diff --git a/vp8/decoder/dequantize.h b/vp8/common/dequantize.h index 019b7f6d1..f66cf2bac 100644 --- a/vp8/decoder/dequantize.h +++ b/vp8/common/dequantize.h @@ -21,17 +21,6 @@ unsigned char *output, \ int stride) -#define prototype_dequant_dc_idct_add(sym) \ - void sym(short *input, short *dq, \ - unsigned char *dst, \ - int stride, \ - int dc) - -#define prototype_dequant_dc_idct_add_y_block(sym) \ - void sym(short *q, short *dq, \ - unsigned char *dst, \ - int stride, char *eobs, short *dc) - #define prototype_dequant_idct_add_y_block(sym) \ void sym(short *q, short *dq, \ unsigned char *dst, \ @@ -60,16 +49,6 @@ extern prototype_dequant_block(vp8_dequant_block); #endif extern prototype_dequant_idct_add(vp8_dequant_idct_add); -#ifndef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c -#endif -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add); - -#ifndef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c -#endif -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block); - #ifndef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c #endif @@ -85,10 +64,6 @@ typedef prototype_dequant_block((*vp8_dequant_block_fn_t)); typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t)); -typedef 
prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t)); - -typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t)); - typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t)); typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t)); @@ -97,8 +72,6 @@ typedef struct { vp8_dequant_block_fn_t block; vp8_dequant_idct_add_fn_t idct_add; - vp8_dequant_dc_idct_add_fn_t dc_idct_add; - vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block; vp8_dequant_idct_add_y_block_fn_t idct_add_y_block; vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block; } vp8_dequant_rtcd_vtable_t; diff --git a/vp8/common/dma_desc.h b/vp8/common/dma_desc.h deleted file mode 100644 index b923da6e0..000000000 --- a/vp8/common/dma_desc.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef _dma_desc_h -#define _dma_desc_h - -#if defined(__cplusplus) -extern "C" { -#endif - - -#define NDSIZE_LG 0x00000900 // Next Descriptor Size -#define NDSIZE_SM 0x00000800 // Next Descriptor Size -#define NDSIZE_7 0x00000700 // Next Descriptor Size -#define NDSIZE_6 0x00000600 // Next Descriptor Size -#define NDSIZE_5 0x00000500 // Next Descriptor Size -#define NDSIZE_4 0x00000400 // Next Descriptor Size -#define NDSIZE_3 0x00000300 // Next Descriptor Size -#define NDSIZE_2 0x00000200 // Next Descriptor Size -#define NDSIZE_1 0x00000100 // Next Descriptor Size - -#define FLOW_STOP 0x0000 -#define FLOW_AUTO 0x1000 -#define FLOW_DESC_AR 0x4000 -#define FLOW_DESC_SM 0x6000 -#define FLOW_DESC_LG 0x7000 - - typedef struct - { - unsigned int ndp; - //unsigned short ndpl; - //unsigned short ndph; - unsigned int sa; - //unsigned short sal; - //unsigned short sah; - - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - unsigned short ycnt; - unsigned short ymod; - - } LARGE_DESC; - - typedef struct - { - unsigned short ndpl; - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - unsigned short ycnt; - unsigned short ymod; - } SMALL_DESC; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - unsigned short ycnt; - unsigned short ymod; - } ARRAY_DESC_7; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - unsigned short ycnt; - } ARRAY_DESC_6; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - unsigned short xmod; - } ARRAY_DESC_5; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - unsigned short xcnt; - } ARRAY_DESC_4; - - typedef struct - { - unsigned short sal; - unsigned short sah; - unsigned short dmacfg; - } ARRAY_DESC_3; - - typedef struct - { - unsigned short sal; - unsigned short sah; - } ARRAY_DESC_2; - - typedef struct - { - unsigned short sal; - } ARRAY_DESC_1; - -#if defined(__cplusplus) -} -#endif - -#endif //_dma_desc_h diff --git a/vp8/common/duck_io.h b/vp8/common/duck_io.h deleted file mode 
100644 index 43daa65bc..000000000 --- a/vp8/common/duck_io.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef _duck_io_h -#define _duck_io_h - -#if defined(__cplusplus) -extern "C" { -#endif - -#if defined (_WIN32) - typedef __int64 int64_t; -#elif defined(__MWERKS__) - typedef long long int64_t; -#elif defined(__APPLE__) || defined(__POWERPC) -#include <ppc/types.h> -#else - typedef long long int64_t; -#endif - - typedef struct - { - int64_t offset; // offset to start from - int blocking; // non-zero for blocking - } re_open_t; - - - typedef enum - { - SAL_ERR_MAX = -10, - SAL_ERROR = -11, // Default error - SAL_ERR_WSASTARTUP = -12, - SAL_ERR_SOCKET_CREATE = -13, - SAL_ERR_RESOLVING_HOSTNAME = -14, - SAL_ERR_SERVER_CONNECTION = -15, - SAL_ERR_SENDING_DATA = -16, - SAL_ERR_RECEIVING_DATA = -17, - SAL_ERR_404_FILE_NOT_FOUND = -18, - SAL_ERR_PARSING_HTTP_HEADER = -19, - SAL_ERR_PARSING_CONTENT_LEN = -20, - SAL_ERR_CONNECTION_TIMEOUT = -21, - SAL_ERR_FILE_OPEN_FAILED = -22, - SAL_ERR_MIN = -23 - } SAL_ERR; /* EMH 1-15-03 */ - - - typedef struct sal_err_map_temp - { - SAL_ERR code; - const char *decode; - - } sal_err_map_t; - - - static char *sal_err_text(SAL_ERR e) - { - int t; - const sal_err_map_t g_sal_err_map[] = - { - { SAL_ERR_WSASTARTUP, "Error with WSAStartup" }, - { SAL_ERR_SOCKET_CREATE, "Error creating socket" }, - { SAL_ERR_RESOLVING_HOSTNAME, "Error resolving hostname" }, - { SAL_ERR_SERVER_CONNECTION, "Error connecting to server" }, - { SAL_ERR_SENDING_DATA, "Error sending data" }, - { SAL_ERR_RECEIVING_DATA, "Error receiving data" }, - { SAL_ERR_404_FILE_NOT_FOUND, "Error file not found " }, - { SAL_ERR_PARSING_HTTP_HEADER, "Error parsing http header" }, - { SAL_ERR_PARSING_CONTENT_LEN, "Error parsing content length" }, - { SAL_ERR_CONNECTION_TIMEOUT, "Error Connection timed out" }, - { SAL_ERR_FILE_OPEN_FAILED, "Error opening file" } - }; - - for (t = 0; t < sizeof(g_sal_err_map) / sizeof(sal_err_map_t); t++) - { - if (e == g_sal_err_map[t].code) - return (char *) g_sal_err_map[t].decode; - } - - return 0; - } - - - - - - - - int duck_open(const char *fname, unsigned long user_data); - - void duck_close(int ghndl); - - int duck_read(int ghndl, unsigned char *buf, int nbytes); - - int64_t duck_seek(int g_hndl, int64_t offs, int origin); - - int duck_read_finished(int han, int flag); /* FWG 7-9-99 */ - - int duck_name(int handle, char name[], size_t max_len); /* EMH 9-23-03 */ - - int duck_read_blocking(int handle, unsigned char *buffer, int bytes); /* EMH 9-23-03 */ - - int64_t duck_available_data(int handle); /* EMH 10-23-03 */ - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h index 01909b937..a3443d765 100644 --- a/vp8/common/findnearmv.h +++ b/vp8/common/findnearmv.h @@ -60,10 +60,10 @@ static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge, int mb_to_bottom_edge) { unsigned int need_to_clamp; - need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0; - need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0; - need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 
1 : 0; - need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0; + need_to_clamp = (mv->as_mv.col < mb_to_left_edge); + need_to_clamp |= (mv->as_mv.col > mb_to_right_edge); + need_to_clamp |= (mv->as_mv.row < mb_to_top_edge); + need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge); return need_to_clamp; } diff --git a/vp8/common/g_common.h b/vp8/common/g_common.h deleted file mode 100644 index 5f523980b..000000000 --- a/vp8/common/g_common.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -extern void (*vp8_clear_system_state)(void); -extern void (*vp8_plane_add_noise)(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int DPitch, int q); -extern void (*de_interlace) -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int Width, - int Height, - int Stride -); diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index 9641d8c1e..01d76206d 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -10,7 +10,6 @@ #include "vpx_config.h" -#include "vp8/common/g_common.h" #include "vp8/common/subpixel.h" #include "vp8/common/loopfilter.h" #include "vp8/common/recon.h" @@ -70,6 +69,14 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) #if CONFIG_RUNTIME_CPU_DETECT VP8_COMMON_RTCD *rtcd = &ctx->rtcd; + + rtcd->dequant.block = vp8_dequantize_b_c; + rtcd->dequant.idct_add = vp8_dequant_idct_add_c; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c; + rtcd->dequant.idct_add_uv_block = + vp8_dequant_idct_add_uv_block_c; + + rtcd->idct.idct16 = vp8_short_idct4x4llm_c; rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c; rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c; diff --git a/vp8/common/idct.h b/vp8/common/idct.h index 411a1b472..7371f85ff 100644 --- a/vp8/common/idct.h +++ b/vp8/common/idct.h @@ -37,6 +37,10 @@ #define vp8_idct_idct16 vp8_short_idct4x4llm_c #endif extern prototype_idct(vp8_idct_idct16); +/* add this prototype to prevent compiler warning about implicit + * declaration of vp8_short_idct4x4llm_c function in dequantize.c + * when building, for example, neon optimized version */ +extern prototype_idct(vp8_short_idct4x4llm_c); #ifndef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c diff --git a/vp8/decoder/idct_blk.c b/vp8/common/idct_blk.c index 1c16b92a9..249fad4ea 100644 --- a/vp8/decoder/idct_blk.c +++ b/vp8/common/idct_blk.c @@ -12,39 +12,12 @@ #include "vp8/common/idct.h" #include "dequantize.h" -void vp8_dequant_dc_idct_add_c(short *input, short *dq, - unsigned char *dest, int stride, - int Dc); void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred, int pred_stride, unsigned char *dst_ptr, int dst_stride); -void vp8_dequant_dc_idct_add_y_block_c - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs, short *dc) -{ - int i, j; - - for (i = 0; i < 4; i++) - { - for (j = 0; j < 4; j++) - { - if (*eobs++ > 1) - vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]); - else - vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, 
stride); - - q += 16; - dst += 4; - dc ++; - } - - dst += 4*stride - 16; - } -} - void vp8_dequant_idct_add_y_block_c (short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c index 49496abef..47af52f04 100644 --- a/vp8/common/idctllm.c +++ b/vp8/common/idctllm.c @@ -137,8 +137,9 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, } -void vp8_short_inv_walsh4x4_c(short *input, short *output) +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff) { + short output[16]; int i; int a1, b1, c1, d1; int a2, b2, c2, d2; @@ -183,22 +184,21 @@ void vp8_short_inv_walsh4x4_c(short *input, short *output) ip += 4; op += 4; } + + for(i = 0; i < 16; i++) + { + mb_dqcoeff[i * 16] = output[i]; + } } -void vp8_short_inv_walsh4x4_1_c(short *input, short *output) +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff) { int i; int a1; - short *op = output; a1 = ((input[0] + 3) >> 3); - - for (i = 0; i < 4; i++) + for(i = 0; i < 16; i++) { - op[0] = a1; - op[1] = a1; - op[2] = a1; - op[3] = a1; - op += 4; + mb_dqcoeff[i * 16] = a1; } } diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c deleted file mode 100644 index 478cb329f..000000000 --- a/vp8/common/invtrans.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "invtrans.h" - - -void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, - int pitch) -{ - if (*b->eob > 1) - { - IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch, - *(b->base_dst) + b->dst, b->dst_stride); - } - else - { - IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch, - *(b->base_dst) + b->dst, b->dst_stride); - } - -} - -static void recon_dcblock(MACROBLOCKD *x) -{ - BLOCKD *b = &x->block[24]; - int i; - - for (i = 0; i < 16; i++) - { - x->block[i].dqcoeff[0] = b->diff[i]; - } - -} - -void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) -{ - int i; - - if(x->mode_info_context->mbmi.mode != SPLITMV) - { - /* do 2nd order transform on the dc block */ - IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff); - - recon_dcblock(x); - } - - for (i = 0; i < 16; i++) - { - vp8_inverse_transform_b(rtcd, &x->block[i], 16); - } - -} -void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) -{ - int i; - - for (i = 16; i < 24; i++) - { - vp8_inverse_transform_b(rtcd, &x->block[i], 8); - } - -} diff --git a/vp8/common/invtrans.h b/vp8/common/invtrans.h index d14573b91..2bcbeeccf 100644 --- a/vp8/common/invtrans.h +++ b/vp8/common/invtrans.h @@ -15,9 +15,66 @@ #include "vpx_config.h" #include "idct.h" #include "blockd.h" -extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch); -extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); -extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); -extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); +#include "onyxc_int.h" +#if CONFIG_MULTITHREAD +#include "vpx_mem/vpx_mem.h" +#endif + 
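/* Context for the two helpers that follow: the inverse WHT variants now
 * scatter their 16 results directly into the DC slot of each luma block
 * (elements 0, 16, 32, ..., 240 of the macroblock's coefficient array)
 * instead of filling a contiguous 16-element buffer. A block whose eob
 * was recorded as 0 can therefore acquire a non-zero DC, which is why
 * eob_adjust() bumps such eobs before the block IDCTs run. */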
+static void eob_adjust(char *eobs, short *diff) +{ + /* eob adjust.... the idct can only skip if both the dc and eob are zero */ + int js; + for(js = 0; js < 16; js++) + { + if((eobs[js] == 0) && (diff[0] != 0)) + eobs[js]++; + diff+=16; + } +} + +static void vp8_inverse_transform_mby(MACROBLOCKD *xd, + const VP8_COMMON_RTCD *rtcd) +{ + short *DQC = xd->block[0].dequant; + /* save the dc dequant constant in case it is overridden */ + short dc_dequant_temp = DQC[0]; + +#if CONFIG_MULTITHREAD + DECLARE_ALIGNED(16, short, local_dequant[16]); +#endif + + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + IDCT_INVOKE(&rtcd->idct, iwalsh16) + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + else + { + IDCT_INVOKE(&rtcd->idct, iwalsh1) + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + eob_adjust(xd->eobs, xd->qcoeff); + +#if CONFIG_MULTITHREAD + DQC = local_dequant; + + vpx_memcpy(DQC, xd->block[0].dequant, + sizeof(local_dequant)); +#endif + + /* override the dc dequant constant */ + DQC[0] = 1; + } + DEQUANT_INVOKE (&rtcd->dequant, idct_add_y_block) + (xd->qcoeff, DQC, + xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); + + /* restore the dc dequant constant */ + DQC[0] = dc_dequant_temp; +} #endif diff --git a/vp8/common/littlend.h b/vp8/common/littlend.h deleted file mode 100644 index 99df1164c..000000000 --- a/vp8/common/littlend.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef _littlend_h -#define _littlend_h - -#if defined(__cplusplus) -extern "C" { -#endif - -#define invert2(x) (x) -#define invert4(x) (x) - -#define low_byte(x) (unsigned char)x -#define mid1Byte(x) (unsigned char)(x >> 8) -#define mid2Byte(x) (unsigned char)(x >> 16) -#define high_byte(x) (unsigned char)(x >> 24) - -#define SWAPENDS 0 - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 28cbaed98..d17a32b82 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -17,13 +17,14 @@ extern "C" { #endif +#include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #include "vpx_scale/yv12config.h" -#include "type_aliases.h" #include "ppflags.h" - typedef int *VP8_PTR; + + struct VP8_COMP; /* Create/destroy static data structures. 
*/ @@ -104,7 +105,7 @@ extern "C" int Version; // 4 versions of bitstream defined 0 best quality/slowest decode, 3 lowest quality/fastest decode int Width; // width of data passed to the compressor int Height; // height of data passed to the compressor - double frame_rate; // set to passed in framerate + struct vpx_rational timebase; int target_bandwidth; // bandwidth to be used in kilobits per second int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 @@ -207,32 +208,45 @@ extern "C" unsigned int periodicity; unsigned int layer_id[MAX_PERIODICITY]; +#if CONFIG_MULTI_RES_ENCODING + /* Number of total resolutions encoded */ + unsigned int mr_total_resolutions; + + /* Current encoder ID */ + unsigned int mr_encoder_id; + + /* Down-sampling factor */ + vpx_rational_t mr_down_sampling_factor; + + /* Memory location to store low-resolution encoder's mode info */ + void* mr_low_res_mode_info; +#endif } VP8_CONFIG; void vp8_initialize(); - VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf); - void vp8_remove_compressor(VP8_PTR *comp); + struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf); + void vp8_remove_compressor(struct VP8_COMP* *comp); - void vp8_init_config(VP8_PTR onyx, VP8_CONFIG *oxcf); - void vp8_change_config(VP8_PTR onyx, VP8_CONFIG *oxcf); + void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf); + void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf); // receive a frames worth of data caller can assume that a copy of this frame is made // and not just a copy of the pointer.. - int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); - int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); - int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); - - int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags); - int vp8_update_reference(VP8_PTR comp, int ref_frame_flags); - int vp8_get_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); - int vp8_set_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); - int vp8_update_entropy(VP8_PTR comp, int update); - int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); - int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols); - int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); - int vp8_get_quantizer(VP8_PTR c); + int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); + int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); + int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); + + int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags); + int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags); + int vp8_get_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + int vp8_set_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + int 
vp8_update_entropy(struct VP8_COMP* comp, int update); + int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); + int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols); + int vp8_set_internal_size(struct VP8_COMP* comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); + int vp8_get_quantizer(struct VP8_COMP* c); #ifdef __cplusplus } diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index 936fa9f23..f733ff774 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -22,6 +22,7 @@ #if CONFIG_POSTPROC #include "postproc.h" #endif +#include "dequantize.h" /*#ifdef PACKET_TESTING*/ #include "header.h" @@ -73,6 +74,7 @@ typedef enum typedef struct VP8_COMMON_RTCD { #if CONFIG_RUNTIME_CPU_DETECT + vp8_dequant_rtcd_vtable_t dequant; vp8_idct_rtcd_vtable_t idct; vp8_recon_rtcd_vtable_t recon; vp8_subpix_rtcd_vtable_t subpix; diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h index 43fa00bd3..35a8b6e55 100644 --- a/vp8/common/onyxd.h +++ b/vp8/common/onyxd.h @@ -18,13 +18,13 @@ extern "C" { #endif -#include "type_aliases.h" #include "vpx_scale/yv12config.h" #include "ppflags.h" #include "vpx_ports/mem.h" #include "vpx/vpx_codec.h" - typedef void *VP8D_PTR; + struct VP8D_COMP; + typedef struct { int Width; @@ -49,19 +49,19 @@ extern "C" void vp8dx_initialize(void); - void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x); + void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x); - int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst); + int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst); - int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, int64_t time_stamp); - int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); + int vp8dx_receive_compressed_data(struct VP8D_COMP* comp, unsigned long size, const unsigned char *dest, int64_t time_stamp); + int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); - vpx_codec_err_t vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); - vpx_codec_err_t vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); + vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); - VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf); + struct VP8D_COMP* vp8dx_create_decompressor(VP8D_CONFIG *oxcf); - void vp8dx_remove_decompressor(VP8D_PTR comp); + void vp8dx_remove_decompressor(struct VP8D_COMP* comp); #ifdef __cplusplus } diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c index 1f5d79068..7046a63e8 100644 --- a/vp8/common/ppc/systemdependent.c +++ b/vp8/common/ppc/systemdependent.c @@ -9,7 +9,6 @@ */ -#include "g_common.h" #include "subpixel.h" #include "loopfilter.h" #include "recon.h" diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c index 24c09a353..6c7af41d4 100644 --- a/vp8/common/reconinter.c +++ b/vp8/common/reconinter.c @@ -334,11 +334,12 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x) /*encoder only*/ -void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x) 
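/* The replacement below threads the destination through the call instead
 * of writing into the fixed x->predictor buffer; a caller would now look
 * something like this (hypothetical, not part of this patch):
 *
 *   vp8_build_inter16x16_predictors_mby(xd, xd->dst.y_buffer,
 *                                       xd->dst.y_stride);
 */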
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, + unsigned char *dst_y, + int dst_ystride) { unsigned char *ptr_base; unsigned char *ptr; - unsigned char *pred_ptr = x->predictor; int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; int pre_stride = x->block[0].pre_stride; @@ -348,11 +349,13 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x) if ((mv_row | mv_col) & 7) { - x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16); + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, + dst_y, dst_ystride); } else { - RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16); + RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y, + dst_ystride); } } @@ -596,69 +599,3 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *xd) build_inter4x4_predictors_mb(xd); } } -/* encoder only*/ -static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x) -{ - int i; - - if (x->mode_info_context->mbmi.partitioning < 3) - { - x->block[ 0].bmi = x->mode_info_context->bmi[ 0]; - x->block[ 2].bmi = x->mode_info_context->bmi[ 2]; - x->block[ 8].bmi = x->mode_info_context->bmi[ 8]; - x->block[10].bmi = x->mode_info_context->bmi[10]; - - build_inter_predictors4b(x, &x->block[ 0], x->block[ 0].predictor, 16); - build_inter_predictors4b(x, &x->block[ 2], x->block[ 2].predictor, 16); - build_inter_predictors4b(x, &x->block[ 8], x->block[ 8].predictor, 16); - build_inter_predictors4b(x, &x->block[10], x->block[10].predictor, 16); - } - else - { - for (i = 0; i < 16; i += 2) - { - BLOCKD *d0 = &x->block[i]; - BLOCKD *d1 = &x->block[i+1]; - - x->block[i+0].bmi = x->mode_info_context->bmi[i+0]; - x->block[i+1].bmi = x->mode_info_context->bmi[i+1]; - - if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, d0->predictor, 16); - else - { - build_inter_predictors_b(d0, d0->predictor, 16, x->subpixel_predict); - build_inter_predictors_b(d1, d1->predictor, 16, x->subpixel_predict); - } - - } - - } - - for (i = 16; i < 24; i += 2) - { - BLOCKD *d0 = &x->block[i]; - BLOCKD *d1 = &x->block[i+1]; - - if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, d0->predictor, 8); - else - { - build_inter_predictors_b(d0, d0->predictor, 8, x->subpixel_predict); - build_inter_predictors_b(d1, d1->predictor, 8, x->subpixel_predict); - } - } -} -void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd) -{ - if (xd->mode_info_context->mbmi.mode != SPLITMV) - { - vp8_build_inter16x16_predictors_mb(xd, xd->predictor, &xd->predictor[256], - &xd->predictor[320], 16, 8); - } - else - { - build_4x4uvmvs(xd); - build_inter4x4_predictors_mb_e(xd); - } -} diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h index 86f9d5ae3..f57ff73c5 100644 --- a/vp8/common/reconinter.h +++ b/vp8/common/reconinter.h @@ -21,11 +21,13 @@ extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, int dst_uvstride); -extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x); -extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf); +extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, + unsigned char *dst_y, + int dst_ystride); +extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, + vp8_subpix_fn_t sppf); extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); -extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd); #endif diff --git 
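The reconinter.c/reconinter.h hunks around this point change vp8_build_inter16x16_predictors_mby() to take an explicit destination pointer and stride instead of always writing into the fixed 16-wide x->predictor scratch buffer, so a caller that wants the prediction in the frame itself can skip the intermediate copy. A hedged sketch of the two call shapes this enables:

    /* decoder-style call: predict straight into the frame buffer */
    vp8_build_inter16x16_predictors_mby(xd, xd->dst.y_buffer,
                                        xd->dst.y_stride);
    /* legacy-style call: the old scratch buffer, stride 16 */
    vp8_build_inter16x16_predictors_mby(xd, xd->predictor, 16);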
a/vp8/common/type_aliases.h b/vp8/common/type_aliases.h deleted file mode 100644 index 22b531a76..000000000 --- a/vp8/common/type_aliases.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -/**************************************************************************** -* -* Module Title : type_aliases.h -* -* Description : Standard type aliases -* -****************************************************************************/ -#ifndef __INC_TYPE_ALIASES_H -#define __INC_TYPE_ALIASES_H - -/**************************************************************************** -* Macros -****************************************************************************/ -#define EXPORT -#define IMPORT extern /* Used to declare imported data & routines */ -#define PRIVATE static /* Used to declare & define module-local data */ -#define LOCAL static /* Used to define all persistent routine-local data */ -#define STD_IN_PATH 0 /* Standard input path */ -#define STD_OUT_PATH 1 /* Standard output path */ -#define STD_ERR_PATH 2 /* Standard error path */ -#define STD_IN_FILE stdin /* Standard input file pointer */ -#define STD_OUT_FILE stdout /* Standard output file pointer */ -#define STD_ERR_FILE stderr /* Standard error file pointer */ -#define max_int 0x7FFFFFFF - -#define __export -#define _export - -#define CCONV - -#ifndef NULL -#ifdef __cplusplus -#define NULL 0 -#else -#define NULL ((void *)0) -#endif -#endif - -#ifndef FALSE -#define FALSE 0 -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - -/**************************************************************************** -* Typedefs -****************************************************************************/ -#ifndef TYPE_INT8 -#define TYPE_INT8 -typedef signed char INT8; -#endif - -#ifndef TYPE_INT16 -/*#define TYPE_INT16*/ -typedef signed short INT16; -#endif - -#ifndef TYPE_INT32 -/*#define TYPE_INT32*/ -typedef signed int INT32; -#endif - -#ifndef TYPE_UINT8 -/*#define TYPE_UINT8*/ -typedef unsigned char UINT8; -#endif - -#ifndef TYPE_UINT32 -/*#define TYPE_UINT32*/ -typedef unsigned int UINT32; -#endif - -#ifndef TYPE_UINT16 -/*#define TYPE_UINT16*/ -typedef unsigned short UINT16; -#endif - -#ifndef TYPE_BOOL -/*#define TYPE_BOOL*/ -typedef int BOOL; -#endif - -typedef unsigned char BOOLEAN; - -#ifdef _MSC_VER -typedef __int64 INT64; -#else - -#ifndef TYPE_INT64 -#ifdef _TMS320C6X -/* for now we only have 40bits */ -typedef long INT64; -#else -typedef long long INT64; -#endif -#endif - -#endif - -/* Floating point */ -typedef double FLOAT64; -typedef float FLOAT32; - -#endif diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/common/x86/dequantize_mmx.asm index 648bde4c5..de9eba89f 100644 --- a/vp8/decoder/x86/dequantize_mmx.asm +++ b/vp8/common/x86/dequantize_mmx.asm @@ -246,207 +246,6 @@ sym(vp8_dequant_idct_add_mmx): pop rbp ret - -;void dequant_dc_idct_add_mmx( -;short *input, 0 -;short *dq, 1 -;unsigned char *dest, 2 -;int stride, 3 -;int Dc) 4 -global sym(vp8_dequant_dc_idct_add_mmx) -sym(vp8_dequant_dc_idct_add_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - ; end prolog - - mov rax, arg(0) ;input - mov rdx, arg(1) ;dq - - 
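With type_aliases.h deleted in full above, its homegrown aliases map onto the C99 fixed-width types this tree already uses (onyxd.h's int64_t timestamps, for example). Assuming conventional ILP32/LP64 targets, the correspondence is roughly:

    #include <stdint.h>
    /* equivalents of the deleted aliases, for readers of older code;
     * not something this change itself adds anywhere */
    typedef int8_t   INT8;     /* was: signed char          */
    typedef int16_t  INT16;    /* was: signed short         */
    typedef int32_t  INT32;    /* was: signed int           */
    typedef uint8_t  UINT8;    /* was: unsigned char        */
    typedef uint16_t UINT16;   /* was: unsigned short       */
    typedef uint32_t UINT32;   /* was: unsigned int         */
    typedef int64_t  INT64;    /* was: long long / __int64  */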
movq mm0, [rax ] - pmullw mm0, [rdx] - - movq mm1, [rax +8] - pmullw mm1, [rdx +8] - - movq mm2, [rax+16] - pmullw mm2, [rdx+16] - - movq mm3, [rax+24] - pmullw mm3, [rdx+24] - - mov rdx, arg(2) ;pred - pxor mm7, mm7 - - - movq [rax], mm7 - movq [rax+8], mm7 - - movq [rax+16],mm7 - movq [rax+24],mm7 - - ; move lower word of Dc to lower word of mm0 - psrlq mm0, 16 - movzx rcx, word ptr arg(4) ;Dc - psllq mm0, 16 - movq mm7, rcx - por mm0, mm7 - - movsxd rax, dword ptr arg(3) ;stride - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - movq mm3, mm5 ; 33 23 13 03 - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - paddw mm0, [GLOBAL(fours)] - - paddw mm2, [GLOBAL(fours)] - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - psraw mm2, 3 - - psraw mm0, 3 - psraw mm4, 3 - - psraw mm6, 3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - pxor mm7, mm7 - - movd mm4, [rdx] - punpcklbw mm4, mm7 - paddsw mm0, mm4 - packuswb mm0, mm7 - movd [rdx], mm0 - - movd mm4, [rdx+rax] - punpcklbw mm4, mm7 - paddsw mm1, mm4 - packuswb mm1, mm7 - movd [rdx+rax], mm1 - - movd mm4, [rdx+2*rax] - punpcklbw mm4, mm7 - paddsw mm2, mm4 - packuswb mm2, mm7 - movd [rdx+rax*2], mm2 - - add rdx, rax - - movd mm4, [rdx+2*rax] - punpcklbw mm4, mm7 - paddsw mm5, mm4 - packuswb mm5, mm7 - movd [rdx+rax*2], mm5 - - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - SECTION_RODATA align 16 x_s1sqr2: diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/common/x86/dequantize_x86.h index dc68daab3..49bcb7f19 100644 --- a/vp8/decoder/x86/dequantize_x86.h +++ b/vp8/common/x86/dequantize_x86.h @@ -22,8 +22,6 @@ #if HAVE_MMX extern prototype_dequant_block(vp8_dequantize_b_mmx); extern 
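The MMX routine deleted above fused dequantization, a 4x4 inverse DCT, and the add-to-prediction, taking the DC coefficient as a separate argument. The dedicated DC variants can go because the reworked inverse WHT later in this diff deposits each DC directly into its coefficient block, leaving the plain idct_add path to do the rest. The dequantization step that the pmullw pairs implement, in scalar form:

    /* per-coefficient dequantize, as in the deleted kernel */
    for (i = 0; i < 16; i++)
        dqcoeff[i] = qcoeff[i] * dequant[i];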
prototype_dequant_idct_add(vp8_dequant_idct_add_mmx); -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx); -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); @@ -34,12 +32,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); #undef vp8_dequant_idct_add #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx -#undef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx - -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx - #undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx @@ -50,14 +42,10 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); #endif #if HAVE_SSE2 -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2 - #undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c new file mode 100644 index 000000000..ebab814f4 --- /dev/null +++ b/vp8/common/x86/filter_x86.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = +{ + { 128, 128, 128, 128, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 112, 112, 112, 112 } +}; + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = +{ + { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } +}; diff --git a/vp8/common/x86/filter_x86.h b/vp8/common/x86/filter_x86.h new file mode 100644 index 000000000..efcc4dc2a --- /dev/null +++ b/vp8/common/x86/filter_x86.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. 
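The tables in the new filter_x86.c are the scalar bilinear coefficients duplicated across SIMD lanes: for eighth-pel offset o the two taps are 128 - 16*o and 16*o, which always sum to the filter weight of 128. A sketch of a generator that would produce the 8-wide table (the file keeps the values written out so they can sit in const storage):

    short table8[8][16];
    for (int o = 0; o < 8; o++)
        for (int lane = 0; lane < 8; lane++) {
            table8[o][lane]     = (short)(128 - 16 * o); /* tap 0 per lane */
            table8[o][8 + lane] = (short)(16 * o);       /* tap 1 per lane */
        }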
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef FILTER_X86_H +#define FILTER_X86_H + +/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with + * duplicated values */ +extern const short vp8_bilinear_filters_x86_4[8][8]; /* duplicated 4x */ +extern const short vp8_bilinear_filters_x86_8[8][16]; /* duplicated 8x */ + +#endif /* FILTER_X86_H */ diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/common/x86/idct_blk_mmx.c index 37de5b9fd..49cebd6f5 100644 --- a/vp8/decoder/x86/idct_blk_mmx.c +++ b/vp8/common/x86/idct_blk_mmx.c @@ -10,41 +10,16 @@ #include "vpx_config.h" #include "vp8/common/idct.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" -void vp8_dequant_dc_idct_add_y_block_mmx - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (eobs[0] > 1) - vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]); - else if (eobs[0] == 1) - vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride); - - if (eobs[1] > 1) - vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]); - else if (eobs[1] == 1) - vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride); +extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); - if (eobs[2] > 1) - vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]); - else if (eobs[2] == 1) - vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride); - - if (eobs[3] > 1) - vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]); - else if (eobs[3] == 1) - vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride); - - q += 64; - dc += 4; - dst += 4*stride; - eobs += 4; - } +void vp8_dequantize_b_mmx(BLOCKD *d) +{ + short *sq = (short *) d->qcoeff; + short *dq = (short *) d->dqcoeff; + short *q = (short *) d->dequant; + vp8_dequantize_b_impl_mmx(sq, dq, q); } void vp8_dequant_idct_add_y_block_mmx diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/common/x86/idct_blk_sse2.c index 0495b0610..44e440c0c 100644 --- a/vp8/decoder/x86/idct_blk_sse2.c +++ b/vp8/common/x86/idct_blk_sse2.c @@ -10,14 +10,7 @@ #include "vpx_config.h" #include "vp8/common/idct.h" -#include "vp8/decoder/dequantize.h" - -void vp8_idct_dequant_dc_0_2x_sse2 - (short *q, short *dq, - unsigned char *dst, int dst_stride, short *dc); -void vp8_idct_dequant_dc_full_2x_sse2 - (short *q, short *dq, - unsigned char *dst, int dst_stride, short *dc); +#include "vp8/common/dequantize.h" void vp8_idct_dequant_0_2x_sse2 (short *q, short *dq , @@ -26,36 +19,6 @@ void vp8_idct_dequant_full_2x_sse2 (short *q, short *dq , unsigned char *dst, int dst_stride); -void vp8_dequant_dc_idct_add_y_block_sse2 - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (((short *)(eobs))[0]) - { - if (((short *)(eobs))[0] & 0xfefe) - vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc); - else - vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc); - } - - if (((short *)(eobs))[1]) - { - if (((short *)(eobs))[1] & 0xfefe) - vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2); - else - vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2); - } - q += 64; - dc += 4; - dst 
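The loop removed from idct_blk_mmx.c above shows the per-block dispatch that this change keeps using elsewhere (decodframe.c later in the diff): an end-of-block count above one means real AC coefficients and a full dequant+IDCT, while exactly one means a DC-only block whose inverse transform is flat and needs only a cheap scalar add.

    /* eob-driven dispatch, in scalar form */
    if (eobs[i] > 1)
        dequant_idct_add(q, dq, dst, stride);        /* full 4x4 path */
    else if (eobs[i] == 1)
        dc_only_idct_add(q[0] * dq[0], dst, stride,
                         dst, stride);               /* flat DC add   */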
+= stride*4; - eobs += 4; - } -} - void vp8_dequant_idct_add_y_block_sse2 (short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h index f9e3a794d..06e3ea4b5 100644 --- a/vp8/common/x86/idct_x86.h +++ b/vp8/common/x86/idct_x86.h @@ -24,7 +24,6 @@ extern prototype_idct(vp8_short_idct4x4llm_mmx); extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx); extern prototype_second_order(vp8_short_inv_walsh4x4_mmx); -extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_idct_idct16 @@ -36,9 +35,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx); #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx - #endif #endif diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm index 10b5274dc..6582687e1 100644 --- a/vp8/common/x86/iwalsh_mmx.asm +++ b/vp8/common/x86/iwalsh_mmx.asm @@ -11,162 +11,129 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output) -global sym(vp8_short_inv_walsh4x4_1_mmx) -sym(vp8_short_inv_walsh4x4_1_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rax, 3 - - mov rdi, arg(1) - add rax, [rsi] ;input[0] + 3 - - movd mm0, eax - - punpcklwd mm0, mm0 ;x x val val - - punpckldq mm0, mm0 ;val val val val - - psraw mm0, 3 ;(input[0] + 3) >> 3 - - movq [rdi + 0], mm0 - movq [rdi + 8], mm0 - movq [rdi + 16], mm0 - movq [rdi + 24], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - ;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) global sym(vp8_short_inv_walsh4x4_mmx) sym(vp8_short_inv_walsh4x4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi ; end prolog - mov rax, 3 - mov rsi, arg(0) - mov rdi, arg(1) - shl rax, 16 - - movq mm0, [rsi + 0] ;ip[0] - movq mm1, [rsi + 8] ;ip[4] - or rax, 3 ;00030003h - - movq mm2, [rsi + 16] ;ip[8] - movq mm3, [rsi + 24] ;ip[12] - - movq mm7, rax - movq mm4, mm0 + mov rdx, arg(0) + mov rax, 30003h - punpcklwd mm7, mm7 ;0003000300030003h - movq mm5, mm1 + movq mm0, [rdx + 0] ;ip[0] + movq mm1, [rdx + 8] ;ip[4] + movd mm7, rax - paddw mm4, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl + movq mm2, [rdx + 16] ;ip[8] + movq mm3, [rdx + 24] ;ip[12] + punpcklwd mm7, mm7 ;0003000300030003h + mov rdx, arg(1) - movq mm6, mm4 ;temp al + movq mm4, mm0 + movq mm5, mm1 - paddw mm4, mm5 ;al + bl - psubw mm6, mm5 ;al - bl + paddw mm4, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm1, mm2 ;ip[4] - ip[8] aka c1 + movq mm6, mm4 ;temp al + paddw mm4, mm5 ;al + bl + psubw mm6, mm5 ;al - bl - movq mm5, mm0 ;temp dl + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - paddw mm0, mm1 ;dl + cl - psubw mm5, mm1 ;dl - cl + movq mm5, mm0 ;temp dl + paddw mm0, mm1 ;dl + cl + psubw mm5, mm1 ;dl - cl ; 03 02 01 00 ; 13 12 11 10 ; 23 22 21 20 ; 33 32 31 30 - movq mm3, mm4 ; 03 02 01 00 - punpcklwd mm4, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 + movq mm3, mm4 ; 03 02 01 00 + punpcklwd mm4, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 - movq mm1, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm1, mm5 ; 33 23 32 22 + movq mm1, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm1, 
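The iwalsh_mmx.asm rewrite in progress here vectorizes the 4x4 inverse Walsh-Hadamard transform applied to the second-order (Y2) block. Following the a1/b1/c1/d1 names in the register comments, one butterfly pass looks like the sketch below; it runs once across rows and once across columns, with the 30003h constant supplying the +3 rounding bias ahead of the final shift:

    a1 = ip[0] + ip[12];    b1 = ip[4] + ip[8];
    c1 = ip[4] - ip[8];     d1 = ip[0] - ip[12];
    op[0]  = a1 + b1;       op[4]  = d1 + c1;
    op[8]  = a1 - b1;       op[12] = d1 - c1;
    /* second pass additionally does: op[n] = (op[n] + 3) >> 3 */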
mm5 ; 33 23 32 22 - movq mm0, mm4 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 + movq mm0, mm4 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] - punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] + punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] ;~~~~~~~~~~~~~~~~~~~~~ - movq mm1, mm0 - movq mm5, mm4 - - paddw mm1, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm1 ;temp al - - paddw mm1, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm4, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm4 ;dl + cl - psubw mm5, mm4 ;dl - cl + movq mm1, mm0 + movq mm5, mm4 + paddw mm1, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm1 ;temp al + paddw mm1, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + paddw mm1, mm7 + paddw mm6, mm7 + psraw mm1, 3 + psraw mm6, 3 + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm4, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + paddw mm0, mm4 ;dl + cl + psubw mm5, mm4 ;dl - cl + paddw mm0, mm7 + paddw mm5, mm7 + psraw mm0, 3 + psraw mm5, 3 ;~~~~~~~~~~~~~~~~~~~~~ - movq mm3, mm1 ; 03 02 01 00 - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm4, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm4, mm5 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12] - - paddw mm0, mm7 - paddw mm1, mm7 - paddw mm2, mm7 - paddw mm3, mm7 - - psraw mm0, 3 - psraw mm1, 3 - psraw mm2, 3 - psraw mm3, 3 - movq [rdi + 0], mm0 - movq [rdi + 8], mm1 - movq [rdi + 16], mm2 - movq [rdi + 24], mm3 + movd eax, mm1 + movd ecx, mm0 + psrlq mm0, 32 + psrlq mm1, 32 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*1], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*5], cx + movd eax, mm1 + movd ecx, mm0 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*9], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*13], cx + + movd eax, mm6 + movd ecx, mm5 + psrlq mm5, 32 + psrlq mm6, 32 + mov word ptr[rdx+32*2], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*6], ax + mov word ptr[rdx+32*7], cx + movd eax, mm6 + movd ecx, mm5 + mov word ptr[rdx+32*10], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*14], ax + mov word ptr[rdx+32*15], cx ; begin epilog - pop rdi - pop rsi UNSHADOW_ARGS pop rbp ret diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm index 1da4fd8da..51cb5e21c 100644 --- a/vp8/common/x86/iwalsh_sse2.asm +++ b/vp8/common/x86/iwalsh_sse2.asm @@ -17,103 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 - SAVE_XMM 6 - push rsi - push rdi ; end prolog - mov rsi, arg(0) - mov rdi, arg(1) - mov rax, 3 + mov rcx, arg(0) + mov rdx, arg(1) + mov rax, 30003h - movdqa xmm0, [rsi + 0] ;ip[4] ip[0] - movdqa xmm1, [rsi + 16] ;ip[12] ip[8] + movdqa xmm0, [rcx + 0] ;ip[4] ip[0] + movdqa xmm1, [rcx + 16] ;ip[12] ip[8] - shl rax, 16 - or rax, 3 ;00030003h - pshufd xmm2, xmm1, 4eh ;ip[8] 
ip[12] - movdqa xmm3, xmm0 ;ip[4] ip[0] + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] - paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - movdqa xmm4, xmm0 + movdqa xmm4, xmm0 punpcklqdq xmm0, xmm3 ;d1 a1 punpckhqdq xmm4, xmm3 ;c1 b1 - movd xmm6, eax - movdqa xmm1, xmm4 ;c1 b1 - paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] -;;;temp output -;; movdqu [rdi + 0], xmm4 -;; movdqu [rdi + 16], xmm3 - -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; 13 12 11 10 03 02 01 00 ; ; 33 32 31 30 23 22 21 20 ; - movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm4 ;ip[4] ip[0] + movd xmm0, eax + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] - pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 + pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03 - paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - movdqa xmm5, xmm4 + movdqa xmm5, xmm4 punpcklqdq xmm4, xmm3 ;d1 a1 punpckhqdq xmm5, xmm3 ;c1 b1 - movdqa xmm1, xmm5 ;c1 b1 - paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - paddw xmm5, xmm6 - paddw xmm1, xmm6 - - psraw xmm5, 3 - psraw xmm1, 3 - - movdqa [rdi + 0], xmm5 - movdqa [rdi + 16], xmm1 + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + paddw xmm5, xmm0 + paddw xmm4, xmm0 + psraw xmm5, 3 + psraw xmm4, 3 + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*2], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*6], cx + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*10], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*14], cx + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*1], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word 
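Both the MMX and SSE2 versions now finish with the word-store runs seen around this point: result n is written at byte offset 32*n from the output pointer rather than into a contiguous 4x4 block. Since each coefficient block is 16 shorts (32 bytes), that lands every WHT output in the DC slot of one of the 16 luma blocks, which is what makes the separate dequant_dc_* kernels deleted elsewhere in this diff unnecessary. In scalar terms:

    short *blocks = output;             /* first block's coeff array       */
    for (n = 0; n < 16; n++)
        blocks[16 * n] = wht_out[n];    /* byte offset 32*n: DC of block n */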
ptr[rdx+32*5], ax + mov word ptr[rdx+32*7], cx + movd eax, xmm5 + movd ecx, xmm4 + mov word ptr[rdx+32*9], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*13], ax + mov word ptr[rdx+32*15], cx ; begin epilog - pop rdi - pop rsi - RESTORE_XMM UNSHADOW_ARGS pop rbp ret - -SECTION_RODATA -align 16 -x_s1sqr2: - times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 4 dw 0x4E7B -align 16 -fours: - times 4 dw 0x0004 diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index 86927d9f1..2ad010adb 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -1385,52 +1385,54 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): SHADOW_ARGS_TO_STACK 3 SAVE_XMM 7 GET_GOT rbx - push rsi - push rdi ; end prolog - mov rsi, arg(0) ;src_ptr + mov rcx, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - mov rdx, arg(2) ;blimit - movdqa xmm3, XMMWORD PTR [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax + lea rdx, [rcx + rax] neg rax ; calculate mask - movdqa xmm1, [rsi+2*rax] ; p1 - movdqa xmm0, [rdi] ; q1 + movdqa xmm0, [rdx] ; q1 + mov rdx, arg(2) ;blimit + movdqa xmm1, [rcx+2*rax] ; p1 + movdqa xmm2, xmm1 movdqa xmm7, xmm0 - movdqa xmm4, xmm0 + psubusb xmm0, xmm1 ; q1-=p1 - psubusb xmm1, xmm4 ; p1-=q1 + psubusb xmm1, xmm7 ; p1-=q1 por xmm1, xmm0 ; abs(p1-q1) pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw xmm1, 1 ; abs(p1-q1)/2 - movdqa xmm5, [rsi+rax] ; p0 - movdqa xmm4, [rsi] ; q0 + movdqa xmm3, XMMWORD PTR [rdx] + + movdqa xmm5, [rcx+rax] ; p0 + movdqa xmm4, [rcx] ; q0 movdqa xmm0, xmm4 ; q0 movdqa xmm6, xmm5 ; p0 psubusb xmm5, xmm4 ; p0-=q0 psubusb xmm4, xmm6 ; q0-=p0 por xmm5, xmm4 ; abs(p0 - q0) + + movdqa xmm4, [GLOBAL(t80)] + paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm3, xmm3 pcmpeqb xmm5, xmm3 + ; start work on filters - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + pxor xmm2, xmm4 ; p1 offset to convert to signed values + pxor xmm7, xmm4 ; q1 offset to convert to signed values psubsb xmm2, xmm7 ; p1 - q1 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values + pxor xmm6, xmm4 ; offset to convert to signed values + pxor xmm0, xmm4 ; offset to convert to signed values movdqa xmm3, xmm0 ; q0 psubsb xmm0, xmm6 ; q0 - p0 paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) @@ -1438,42 +1440,36 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) pand xmm5, xmm2 ; mask filter values we don't care about - ; do + 4 side - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - movdqa xmm1, xmm5 ; get a copy of filters - psraw xmm1, 11 ; arithmetic shift right 11 - psllw xmm1, 8 ; shift left 8 to put it back - - por xmm0, xmm1 ; put the two together to get result + paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + movdqa xmm0, xmm5 + psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - psubsb xmm3, xmm0 ; q0-= q0 add - pxor xmm3, [GLOBAL(t80)] ; unoffset - movdqa [rsi], xmm3 ; write back + movdqa xmm1, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] - ; now do +3 side - psubsb xmm5, 
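The simple-loopfilter hunk continuing below reschedules instructions and reassigns registers without changing the edge test itself; per the register comments, the per-pixel mask is:

    /* filter only where the step across the edge is small enough */
    int filter_me = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit);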
[GLOBAL(t1s)] ; +3 instead of +4 + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm6, xmm5 ; p0+= p0 add + pxor xmm3, xmm4 ; unoffset + movdqa [rcx], xmm3 ; write back - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset - movdqa [rsi+rax], xmm6 ; write back + pxor xmm6, xmm4 ; unoffset + movdqa [rcx+rax], xmm6 ; write back ; begin epilog - pop rdi - pop rsi RESTORE_GOT RESTORE_XMM UNSHADOW_ARGS @@ -1536,9 +1532,6 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - movdqa t0, xmm0 ; save to t0 - movdqa t1, xmm2 ; save to t1 - lea rsi, [rsi + rax*8] lea rdi, [rsi + rax] lea rdx, [rsi + rax*4] @@ -1551,26 +1544,24 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 - movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 + movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0 movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 - movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 + movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0 movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 - punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 - punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 + punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 + punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 - punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 + punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 - movdqa xmm1, xmm4 - punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + movdqa xmm7, xmm4 + punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 movdqa xmm6, xmm4 - punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 movdqa xmm1, xmm0 movdqa xmm3, xmm2 @@ -1579,6 +1570,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + mov rdx, arg(2) ;blimit + ; calculate mask movdqa xmm6, xmm0 ; p1 movdqa xmm7, xmm3 ; q1 @@ -1588,6 +1581,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw xmm6, 1 ; abs(p1-q1)/2 + movdqa xmm7, [rdx] + movdqa xmm5, xmm1 ; p0 movdqa 
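The te0/t1f sequence above replaces the old psllw/psraw/psrlw word-lane gymnastics. SSE2 has no per-byte arithmetic shift, so the new code emulates a signed >>3 by shifting logically and patching the three sign bits back in; a scalar equivalent:

    #include <stdint.h>
    static int8_t sar3(int8_t v)
    {
        uint8_t u = (uint8_t)v >> 3;  /* psrlw, then t1f clears stray bits */
        if (v < 0)                    /* pcmpgtb saves the sign            */
            u |= 0xe0;                /* te0 restores the top 3 bits       */
        return (int8_t)u;
    }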
xmm4, xmm2 ; q0 psubusb xmm5, xmm2 ; p0-=q0 @@ -1596,8 +1591,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;blimit - movdqa xmm7, XMMWORD PTR [rdx] + movdqa xmm4, [GLOBAL(t80)] psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm7, xmm7 @@ -1607,59 +1601,48 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): movdqa t0, xmm0 movdqa t1, xmm3 - pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - + pxor xmm0, xmm4 ; p1 offset to convert to signed values + pxor xmm3, xmm4 ; q1 offset to convert to signed values psubsb xmm0, xmm3 ; p1 - q1 - movdqa xmm6, xmm1 ; p0 - - movdqa xmm7, xmm2 ; q0 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm7 ; offseted ; q0 - - psubsb xmm7, xmm6 ; q0 - p0 - paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) + movdqa xmm6, xmm1 ; p0 +; movdqa xmm7, xmm2 ; q0 - paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) + pxor xmm6, xmm4 ; offset to convert to signed values + pxor xmm2, xmm4 ; offset to convert to signed values + movdqa xmm3, xmm2 ; offseted ; q0 + psubsb xmm2, xmm6 ; q0 - p0 + paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0) pand xmm5, xmm0 ; mask filter values we don't care about - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - - movdqa xmm7, xmm5 ; get a copy of filters - psraw xmm7, 11 ; arithmetic shift right 11 - - psllw xmm7, 8 ; shift left 8 to put it back - por xmm0, xmm7 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0sz add - pxor xmm3, [GLOBAL(t80)] ; unoffset q0 - - ; now do +3 side + movdqa xmm0, xmm5 psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 + movdqa xmm1, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm6, xmm5 ; p0+= p0 add - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset p0 + pxor xmm3, xmm4 ; unoffset q0 + pxor xmm6, xmm4 ; unoffset p0 movdqa xmm0, t0 ; p1 movdqa xmm4, t1 ; q1 @@ -1763,3 +1746,9 @@ s9: align 16 s63: times 8 dw 0x003f +align 16 +te0: + times 16 db 0xe0 +align 16 +t1f: + times 16 db 0x1f diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm index e68d950ad..5528fd0e6 100644 --- a/vp8/common/x86/subpixel_mmx.asm +++ b/vp8/common/x86/subpixel_mmx.asm @@ -10,6 +10,7 @@ %include "vpx_ports/x86_abi_support.asm" +extern sym(vp8_bilinear_filters_x86_8) %define BLOCK_HEIGHT_WIDTH 4 @@ -222,14 +223,14 @@ 
sym(vp8_bilinear_predict8x8_mmx): push rdi ; end prolog - ;const short *HFilter = bilinear_filters_mmx[xoffset]; - ;const short *VFilter = bilinear_filters_mmx[yoffset]; + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; movsxd rax, dword ptr arg(2) ;xoffset mov rdi, arg(4) ;dst_ptr ; shl rax, 5 ; offset * 32 - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] add rax, rcx ; HFilter mov rsi, arg(0) ;src_ptr ; @@ -379,13 +380,13 @@ sym(vp8_bilinear_predict8x4_mmx): push rdi ; end prolog - ;const short *HFilter = bilinear_filters_mmx[xoffset]; - ;const short *VFilter = bilinear_filters_mmx[yoffset]; + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; movsxd rax, dword ptr arg(2) ;xoffset mov rdi, arg(4) ;dst_ptr ; - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] shl rax, 5 mov rsi, arg(0) ;src_ptr ; @@ -534,13 +535,13 @@ sym(vp8_bilinear_predict4x4_mmx): push rdi ; end prolog - ;const short *HFilter = bilinear_filters_mmx[xoffset]; - ;const short *VFilter = bilinear_filters_mmx[yoffset]; + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; movsxd rax, dword ptr arg(2) ;xoffset mov rdi, arg(4) ;dst_ptr ; - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] shl rax, 5 add rax, rcx ; HFilter @@ -699,29 +700,3 @@ sym(vp8_six_tap_mmx): times 8 dw 0 -align 16 -global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx)) -sym(vp8_bilinear_filters_mmx): - times 8 dw 128 - times 8 dw 0 - - times 8 dw 112 - times 8 dw 16 - - times 8 dw 96 - times 8 dw 32 - - times 8 dw 80 - times 8 dw 48 - - times 8 dw 64 - times 8 dw 64 - - times 8 dw 48 - times 8 dw 80 - - times 8 dw 32 - times 8 dw 96 - - times 8 dw 16 - times 8 dw 112 diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm index b62b5c68d..cb550af59 100644 --- a/vp8/common/x86/subpixel_sse2.asm +++ b/vp8/common/x86/subpixel_sse2.asm @@ -10,6 +10,7 @@ %include "vpx_ports/x86_abi_support.asm" +extern sym(vp8_bilinear_filters_x86_8) %define BLOCK_HEIGHT_WIDTH 4 %define VP8_FILTER_WEIGHT 128 @@ -961,7 +962,7 @@ sym(vp8_unpack_block1d16_h6_sse2): ; unsigned char *dst_ptr, ; int dst_pitch ;) -extern sym(vp8_bilinear_filters_mmx) +extern sym(vp8_bilinear_filters_x86_8) global sym(vp8_bilinear_predict16x16_sse2) sym(vp8_bilinear_predict16x16_sse2): push rbp @@ -973,10 +974,10 @@ sym(vp8_bilinear_predict16x16_sse2): push rdi ; end prolog - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] movsxd rax, dword ptr arg(2) ;xoffset cmp rax, 0 ;skip first_pass filter if xoffset=0 @@ -1230,7 +1231,6 @@ sym(vp8_bilinear_predict16x16_sse2): ; unsigned char *dst_ptr, ; int dst_pitch ;) -extern sym(vp8_bilinear_filters_mmx) global sym(vp8_bilinear_predict8x8_sse2) sym(vp8_bilinear_predict8x8_sse2): push rbp @@ -1245,9 +1245,9 @@ sym(vp8_bilinear_predict8x8_sse2): ALIGN_STACK 16, rax sub rsp, 144 ; reserve 144 bytes - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = 
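The renames in subpixel_mmx.asm point the kernels at the shared tables from the new filter_x86.c, and the asm-local copy of the data below gets deleted. With VP8_FILTER_WEIGHT defined as 128 in the surrounding context, each tap pair is applied as in this sketch (rounding term of weight/2 and shift of 7, per the codebase's usual convention; hf is the selected HFilter row):

    /* horizontal first pass of the bilinear predictor */
    pred[x] = (src[x] * hf[0] + src[x + 1] * hf[1] + 64) >> 7;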
bilinear_filters_mmx[yoffset] - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))] + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] mov rsi, arg(0) ;src_ptr movsxd rdx, dword ptr arg(1) ;src_pixels_per_line diff --git a/vp8/common/x86/subpixel_x86.h b/vp8/common/x86/subpixel_x86.h index 75991cc4f..01ec9e210 100644 --- a/vp8/common/x86/subpixel_x86.h +++ b/vp8/common/x86/subpixel_x86.h @@ -12,6 +12,8 @@ #ifndef SUBPIXEL_X86_H #define SUBPIXEL_X86_H +#include "filter_x86.h" + /* Note: * * This platform is commonly built for runtime CPU detection. If you modify diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index bce7bc38e..a623c69b4 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -12,9 +12,9 @@ #include "vpx_config.h" #include "vpx_ports/mem.h" #include "vp8/common/subpixel.h" +#include "filter_x86.h" extern const short vp8_six_tap_mmx[8][6*8]; -extern const short vp8_bilinear_filters_mmx[8][2*8]; extern void vp8_filter_block1d_h6_mmx ( diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index eb36d899d..e1e1b7987 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -11,7 +11,6 @@ #include "vpx_config.h" #include "vpx_ports/x86.h" -#include "vp8/common/g_common.h" #include "vp8/common/subpixel.h" #include "vp8/common/loopfilter.h" #include "vp8/common/recon.h" @@ -37,12 +36,14 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) if (flags & HAS_MMX) { + rtcd->dequant.block = vp8_dequantize_b_mmx; + rtcd->dequant.idct_add = vp8_dequant_idct_add_mmx; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx; + rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx; + rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx; rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx; - - rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx; rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx; @@ -89,6 +90,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->recon.build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2; + rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; + rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_sse2; rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_sse2; diff --git a/vp8/decoder/arm/arm_dsystemdependent.c b/vp8/decoder/arm/arm_dsystemdependent.c index 1b0091cdb..bf0a3481a 100644 --- a/vp8/decoder/arm/arm_dsystemdependent.c +++ b/vp8/decoder/arm/arm_dsystemdependent.c @@ -11,9 +11,6 @@ #include "vpx_config.h" #include "vpx_ports/arm.h" -#include "vp8/common/blockd.h" -#include "vp8/common/pragmas.h" -#include "vp8/decoder/dequantize.h" #include "vp8/decoder/onyxd_int.h" void vp8_arch_arm_decode_init(VP8D_COMP *pbi) @@ -30,25 +27,12 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi) #if HAVE_ARMV6 if (flags & HAS_MEDIA) { - pbi->dequant.block = vp8_dequantize_b_v6; - pbi->dequant.idct_add = vp8_dequant_idct_add_v6; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_v6; - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6; } #endif #if HAVE_ARMV7 if (flags & 
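x86_systemdependent.c above illustrates the runtime-CPU-detection idiom the relocated dequant functions now plug into: probe the CPU once, then fill the RTCD vtable, with later (stronger) ISA checks overriding earlier choices. Condensed from the hunk, assuming the SSE2 assignments sit under the tree's matching HAS_SSE2 check:

    if (flags & HAS_MMX) {
        rtcd->dequant.block    = vp8_dequantize_b_mmx;
        rtcd->dequant.idct_add = vp8_dequant_idct_add_mmx;
    }
    if (flags & HAS_SSE2)   /* overrides the MMX picks where faster */
        rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;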
HAS_NEON) { - pbi->dequant.block = vp8_dequantize_b_neon; - pbi->dequant.idct_add = vp8_dequant_idct_add_neon; - /*This is not used: NEON always dequants two blocks at once. - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_neon;*/ - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon; } #endif #endif diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm deleted file mode 100644 index 19f94e089..000000000 --- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm +++ /dev/null @@ -1,213 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_dequant_dc_idct_add_v6| - - AREA |.text|, CODE, READONLY - -;void vp8_dequant_dc_idct_v6(short *input, short *dq, -; unsigned char *dest, int stride, int Dc) -; r0 = input -; r1 = dq -; r2 = dst -; r3 = stride -; sp + 36 = Dc - - -|vp8_dequant_dc_idct_add_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r6, [sp, #36] - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r3, [sp] - - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - mov r12, #3 - -vp8_dequant_dc_add_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne vp8_dequant_dc_add_loop - - sub r0, r0, #32 - mov r1, r0 - -; short_idct4x4llm_v6_dual - ldr r3, cospi8sqrt2minus1 - ldr r4, sinpi8sqrt2 - ldr r6, [r0, #8] - mov r5, #2 -vp8_dequant_dc_idct_loop1_v6 - ldr r12, [r0, #24] - ldr r14, [r0, #16] - smulwt r9, r3, r6 - smulwb r7, r3, r6 - smulwt r10, r4, r6 - smulwb r8, r4, r6 - pkhbt r7, r7, r9, lsl #16 - smulwt r11, r3, r12 - pkhbt r8, r8, r10, lsl #16 - uadd16 r6, r6, r7 - smulwt r7, r4, r12 - smulwb r9, r3, r12 - smulwb r10, r4, r12 - subs r5, r5, #1 - pkhbt r9, r9, r11, lsl #16 - ldr r11, [r0], #4 - pkhbt r10, r10, r7, lsl #16 - uadd16 r7, r12, r9 - usub16 r7, r8, r7 - uadd16 r6, r6, r10 - uadd16 r10, r11, r14 - usub16 r8, r11, r14 - uadd16 r9, r10, r6 - usub16 r10, r10, r6 - uadd16 r6, r8, r7 - usub16 r7, r8, r7 - str r6, [r1, #8] - ldrne r6, [r0, #8] - str r7, [r1, #16] - str r10, [r1, #24] - str r9, [r1], #4 - bne vp8_dequant_dc_idct_loop1_v6 - - mov r5, #2 - sub r0, r1, #8 -vp8_dequant_dc_idct_loop2_v6 - ldr r6, [r0], #4 - ldr r7, [r0], #4 - ldr r8, [r0], #4 - ldr r9, [r0], #4 - smulwt r1, r3, r6 - smulwt r12, r4, r6 - smulwt lr, r3, r8 - smulwt r10, r4, r8 - pkhbt r11, r8, r6, lsl #16 - pkhbt r1, lr, r1, lsl #16 - pkhbt r12, r10, r12, lsl #16 - pkhtb r6, r6, r8, asr #16 - uadd16 r6, r1, r6 - pkhbt lr, r9, r7, lsl #16 - uadd16 r10, r11, lr - usub16 lr, r11, lr - pkhtb r8, r7, r9, asr #16 - subs r5, r5, #1 - smulwt r1, r3, r8 - smulwb r7, r3, r8 - smulwt r11, r4, r8 - smulwb r9, r4, r8 - pkhbt r1, r7, r1, lsl #16 - uadd16 r8, r1, r8 - pkhbt 
r11, r9, r11, lsl #16 - usub16 r1, r12, r8 - uadd16 r8, r11, r6 - ldr r9, c0x00040004 - ldr r12, [sp] ; get stride from stack - uadd16 r6, r10, r8 - usub16 r7, r10, r8 - uadd16 r7, r7, r9 - uadd16 r6, r6, r9 - uadd16 r10, r14, r1 - usub16 r1, r14, r1 - uadd16 r10, r10, r9 - uadd16 r1, r1, r9 - ldr r11, [r2] ; load input from dst - mov r8, r7, asr #3 - pkhtb r9, r8, r10, asr #19 - mov r8, r1, asr #3 - pkhtb r8, r8, r6, asr #19 - uxtb16 lr, r11, ror #8 - qadd16 r9, r9, lr - uxtb16 lr, r11 - qadd16 r8, r8, lr - usat16 r9, #8, r9 - usat16 r8, #8, r8 - orr r9, r8, r9, lsl #8 - ldr r11, [r2, r12] ; load input from dst - mov r7, r7, lsl #16 - mov r1, r1, lsl #16 - mov r10, r10, lsl #16 - mov r6, r6, lsl #16 - mov r7, r7, asr #3 - pkhtb r7, r7, r10, asr #19 - mov r1, r1, asr #3 - pkhtb r1, r1, r6, asr #19 - uxtb16 r8, r11, ror #8 - qadd16 r7, r7, r8 - uxtb16 r8, r11 - qadd16 r1, r1, r8 - usat16 r7, #8, r7 - usat16 r1, #8, r1 - orr r1, r1, r7, lsl #8 - str r9, [r2], r12 ; store output to dst - str r1, [r2], r12 ; store output to dst - bne vp8_dequant_dc_idct_loop2_v6 - -; vpx_memset - sub r0, r0, #32 - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, #28] - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_dequant_dc_idct_add_v6| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x00004E7B -sinpi8sqrt2 DCD 0x00008A8C -c0x00040004 DCD 0x00040004 - - END diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm deleted file mode 100644 index bf8d7ddcd..000000000 --- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm +++ /dev/null @@ -1,75 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
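The NEON file being deleted here handled the "DC coefficient only" case for two 4x4 blocks per call. Its body, continuing below, leans on the fact that a DC-only inverse DCT is flat: every pixel of the block receives the same rounded value, then saturates to a byte (clip_255 below is an illustrative helper, not a library function):

    int v = (dc + 4) >> 3;              /* add r1, #4 ; asr r1, #3 */
    for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++)
            dst[r * stride + c] =
                clip_255(dst[r * stride + c] + v);   /* vaddw + vqmovun */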
-; - - - EXPORT |idct_dequant_dc_0_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void idct_dequant_dc_0_2x_neon(short *q, short *dq, -; unsigned char *dst, int stride); -; r0 *q, -; r1 *dq, -; r2 *dst -; r3 stride -; sp *dc -|idct_dequant_dc_0_2x_neon| PROC - - ; no q- or dq-coeffs, so r0 and r1 are free to use - ldr r1, [sp] ; *dc - add r12, r2, #4 - ldr r0, [r1] - - vld1.32 {d2[0]}, [r2], r3 ; lo - vld1.32 {d8[0]}, [r12], r3 ; hi - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d8[1]}, [r12], r3 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d10[0]}, [r12], r3 - vld1.32 {d4[1]}, [r2], r3 - vld1.32 {d10[1]}, [r12] - - sxth r1, r0 ; lo *dc - add r1, r1, #4 - asr r1, r1, #3 - vdup.16 q0, r1 - sxth r0, r0, ror #16 ; hi *dc - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q3, r0 - - vaddw.u8 q1, q0, d2 ; lo - vaddw.u8 q2, q0, d4 - vaddw.u8 q4, q3, d8 ; hi - vaddw.u8 q5, q3, d10 - - vqmovun.s16 d2, q1 ; lo - vqmovun.s16 d4, q2 - vqmovun.s16 d8, q4 ; hi - vqmovun.s16 d10, q5 - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r0, r2, #4 - - vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d8[0]}, [r0], r3 ; hi - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d8[1]}, [r0], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d10[0]}, [r0], r3 - vst1.32 {d4[1]}, [r2] - vst1.32 {d10[1]}, [r0] - - bx lr - - ENDP ;|idct_dequant_dc_0_2x_neon| - END diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm deleted file mode 100644 index eea41f68c..000000000 --- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm +++ /dev/null @@ -1,208 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
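The second deleted NEON file, whose body follows, carries a comment block worth keeping in mind: vqdmulh only takes signed operands and doubles the product before taking the high half, which is why sinpi8sqrt2 is stored pre-shifted (0x4546 rather than 0x8a8c). Per 16-bit lane, and ignoring the saturation corner case, the instruction computes:

    static int16_t vqdmulh_lane(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b * 2) >> 16);
    }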
-; - - - EXPORT |idct_dequant_dc_full_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_dc_full_2x_neon(short *q, short *dq, -; unsigned char *dst, int stride, short *dc); -; r0 *q, -; r1 *dq, -; r2 *dst -; r3 stride -; sp *dc -|idct_dequant_dc_full_2x_neon| PROC - push {r4} - - vld1.16 {q0, q1}, [r1] ; dq (same l/r) - vld1.16 {q2, q3}, [r0] ; l q - add r0, r0, #32 - vld1.16 {q4, q5}, [r0] ; r q - add r12, r2, #4 - - ; interleave the predictors - vld1.32 {d28[0]}, [r2], r3 ; l pre - vld1.32 {d28[1]}, [r12], r3 ; r pre - vld1.32 {d29[0]}, [r2], r3 - vld1.32 {d29[1]}, [r12], r3 - vld1.32 {d30[0]}, [r2], r3 - vld1.32 {d30[1]}, [r12], r3 - vld1.32 {d31[0]}, [r2], r3 - ldr r1, [sp, #4] ; *dc - vld1.32 {d31[1]}, [r12] - - adr r4, cospi8sqrt2minus1 ; pointer to the first constant - - ldrh r12, [r1], #2 ; lo *dc - ldrh r1, [r1] ; hi *dc - - ; dequant: q[i] = q[i] * dq[i] - vmul.i16 q2, q2, q0 - vmul.i16 q3, q3, q1 - vmul.i16 q4, q4, q0 - vmul.i16 q5, q5, q1 - - ; move dc up to neon and overwrite first element - vmov.16 d4[0], r12 - vmov.16 d8[0], r1 - - vld1.16 {d0}, [r4] - - ; q2: l0r0 q3: l8r8 - ; q4: l4r4 q5: l12r12 - vswp d5, d8 - vswp d7, d10 - - ; _CONSTANTS_ * 4,12 >> 16 - ; q6: 4 * sinpi : c1/temp1 - ; q7: 12 * sinpi : d1/temp2 - ; q8: 4 * cospi - ; q9: 12 * cospi - vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q7, q5, d0[2] - vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q9, q5, d0[0] - - vqadd.s16 q10, q2, q3 ; a1 = 0 + 8 - vqsub.s16 q11, q2, q3 ; b1 = 0 - 8 - - ; vqdmulh only accepts signed values. this was a problem because - ; our constant had the high bit set, and was treated as a negative value. - ; vqdmulh also doubles the value before it shifts by 16. we need to - ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0, - ; so we can shift the constant without losing precision. this avoids - ; shift again afterward, but also avoids the sign issue. win win! 
- ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we - ; pre-shift it - vshr.s16 q8, q8, #1 - vshr.s16 q9, q9, #1 - - ; q4: 4 + 4 * cospi : d1/temp1 - ; q5: 12 + 12 * cospi : c1/temp2 - vqadd.s16 q4, q4, q8 - vqadd.s16 q5, q5, q9 - - ; c1 = temp1 - temp2 - ; d1 = temp1 + temp2 - vqsub.s16 q2, q6, q5 - vqadd.s16 q3, q4, q7 - - ; [0]: a1+d1 - ; [1]: b1+c1 - ; [2]: b1-c1 - ; [3]: a1-d1 - vqadd.s16 q4, q10, q3 - vqadd.s16 q5, q11, q2 - vqsub.s16 q6, q11, q2 - vqsub.s16 q7, q10, q3 - - ; rotate - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - ; idct loop 2 - ; q4: l 0, 4, 8,12 r 0, 4, 8,12 - ; q5: l 1, 5, 9,13 r 1, 5, 9,13 - ; q6: l 2, 6,10,14 r 2, 6,10,14 - ; q7: l 3, 7,11,15 r 3, 7,11,15 - - ; q8: 1 * sinpi : c1/temp1 - ; q9: 3 * sinpi : d1/temp2 - ; q10: 1 * cospi - ; q11: 3 * cospi - vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q9, q7, d0[2] - vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q11, q7, d0[0] - - vqadd.s16 q2, q4, q6 ; a1 = 0 + 2 - vqsub.s16 q3, q4, q6 ; b1 = 0 - 2 - - ; see note on shifting above - vshr.s16 q10, q10, #1 - vshr.s16 q11, q11, #1 - - ; q10: 1 + 1 * cospi : d1/temp1 - ; q11: 3 + 3 * cospi : c1/temp2 - vqadd.s16 q10, q5, q10 - vqadd.s16 q11, q7, q11 - - ; q8: c1 = temp1 - temp2 - ; q9: d1 = temp1 + temp2 - vqsub.s16 q8, q8, q11 - vqadd.s16 q9, q10, q9 - - ; a1+d1 - ; b1+c1 - ; b1-c1 - ; a1-d1 - vqadd.s16 q4, q2, q9 - vqadd.s16 q5, q3, q8 - vqsub.s16 q6, q3, q8 - vqsub.s16 q7, q2, q9 - - ; +4 >> 3 (rounding) - vrshr.s16 q4, q4, #3 ; lo - vrshr.s16 q5, q5, #3 - vrshr.s16 q6, q6, #3 ; hi - vrshr.s16 q7, q7, #3 - - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; adding pre - ; input is still packed. pre was read interleaved - vaddw.u8 q4, q4, d28 - vaddw.u8 q5, q5, d29 - vaddw.u8 q6, q6, d30 - vaddw.u8 q7, q7, d31 - - vmov.i16 q14, #0 - vmov q15, q14 - vst1.16 {q14, q15}, [r0] ; write over high input - sub r0, r0, #32 - vst1.16 {q14, q15}, [r0] ; write over low input - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r1, r2, #4 ; hi - - ;saturate and narrow - vqmovun.s16 d0, q4 ; lo - vqmovun.s16 d1, q5 - vqmovun.s16 d2, q6 ; hi - vqmovun.s16 d3, q7 - - vst1.32 {d0[0]}, [r2], r3 ; lo - vst1.32 {d0[1]}, [r1], r3 ; hi - vst1.32 {d1[0]}, [r2], r3 - vst1.32 {d1[1]}, [r1], r3 - vst1.32 {d2[0]}, [r2], r3 - vst1.32 {d2[1]}, [r1], r3 - vst1.32 {d3[0]}, [r2] - vst1.32 {d3[1]}, [r1] - - pop {r4} - bx lr - - ENDP ; |idct_dequant_dc_full_2x_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b -; because the lowest bit in 0x8a8c is 0, we can pre-shift this -sinpi8sqrt2 DCD 0x4546 - - END diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index e501b9ec7..11d0e38f5 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -15,7 +15,7 @@ #include "vp8/common/reconintra4x4.h" #include "vp8/common/recon.h" #include "vp8/common/reconinter.h" -#include "dequantize.h" +#include "vp8/common/dequantize.h" #include "detokenize.h" #include "vp8/common/invtrans.h" #include "vp8/common/alloccommon.h" @@ -32,7 +32,7 @@ #endif #include "vpx_mem/vpx_mem.h" #include "vp8/common/idct.h" -#include "dequantize.h" + #include "vp8/common/threading.h" #include "decoderthreading.h" #include "dboolhuff.h" @@ -109,32 +109,12 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) #define RTCD_VTABLE(x) NULL #endif -/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it - * to dst buffer, we can write the result directly to dst buffer. 
This eliminates unnecessary copy. - */ -static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) -{ - if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) - { - RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd); - RECON_INVOKE(&pbi->common.rtcd.recon, - build_intra_predictors_mby_s)(xd); - } - else - { - vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, - xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.y_stride, xd->dst.uv_stride); - } -} - static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, unsigned int mb_idx) { - int eobtotal = 0; - int throw_residual = 0; MB_PREDICTION_MODE mode; int i; + int corruption_detected = 0; if (xd->mode_info_context->mbmi.mb_skip_coeff) { @@ -142,27 +122,51 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, } else if (!vp8dx_bool_error(xd->current_bc)) { + int eobtotal; eobtotal = vp8_decode_mb_tokens(pbi, xd); + + /* Special case: Force the loopfilter to skip when eobtotal is zero */ + xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0); } + mode = xd->mode_info_context->mbmi.mode; + + if (xd->segmentation_enabled) + mb_init_dequantizer(pbi, xd); - mode = xd->mode_info_context->mbmi.mode; +#if CONFIG_ERROR_CONCEALMENT - if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV && - !vp8dx_bool_error(xd->current_bc)) + if(pbi->ec_active) { - /* Special case: Force the loopfilter to skip when eobtotal and - * mb_skip_coeff are zero. - * */ - xd->mode_info_context->mbmi.mb_skip_coeff = 1; + int throw_residual; + /* When we have independent partitions we can apply residual even + * though other partitions within the frame are corrupt. + */ + throw_residual = (!pbi->independent_partitions && + pbi->frame_corrupt_residual); + throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc)); - skip_recon_mb(pbi, xd); - return; + if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual)) + { + /* MB with corrupt residuals or corrupt mode/motion vectors. + * Better to use the predictor as reconstruction. 
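Condensing the corruption signals that feed this branch (same logic as the code above, folded into one expression for readability; the whole check runs only when pbi->ec_active):

/* A macroblock's residual is untrustworthy when any of these hold: */
int discard_residual =
      (!pbi->independent_partitions &&
       pbi->frame_corrupt_residual)          /* earlier partition bad   */
   || vp8dx_bool_error(xd->current_bc)       /* bool decoder ran past
                                                the end of its buffer   */
   || (mb_idx >= pbi->mvs_corrupt_from_mb);  /* modes/MVs corrupt from
                                                this MB onwards         */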
+ */ + pbi->frame_corrupt_residual = 1; + vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); + vp8_conceal_corrupt_mb(xd); + + + corruption_detected = 1; + + /* force idct to be skipped for B_PRED and use the + * prediction only for reconstruction + * */ + vpx_memset(xd->eobs, 0, 25); + } } +#endif - if (xd->segmentation_enabled) - mb_init_dequantizer(pbi, xd); /* do prediction */ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) @@ -173,113 +177,117 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, { RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mby_s)(xd); - } else { + } + else + { + /* clear out residual eob info */ + if(xd->mode_info_context->mbmi.mb_skip_coeff) + vpx_memset(xd->eobs, 0, 25); + vp8_intra_prediction_down_copy(xd); + + for (i = 0; i < 16; i++) + { + BLOCKD *b = &xd->block[i]; + int b_mode = xd->mode_info_context->bmi[i].as_mode; + + RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict) + ( *(b->base_dst) + b->dst, b->dst_stride, b_mode, + *(b->base_dst) + b->dst, b->dst_stride ); + + if (xd->eobs[i]) + { + if (xd->eobs[i] > 1) + { + DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add) + (b->qcoeff, b->dequant, + *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) + (b->qcoeff[0] * b->dequant[0], + *(b->base_dst) + b->dst, b->dst_stride, + *(b->base_dst) + b->dst, b->dst_stride); + ((int *)b->qcoeff)[0] = 0; + } + } + } } } else { vp8_build_inter_predictors_mb(xd); } - /* When we have independent partitions we can apply residual even - * though other partitions within the frame are corrupt. - */ - throw_residual = (!pbi->independent_partitions && - pbi->frame_corrupt_residual); - throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc)); + #if CONFIG_ERROR_CONCEALMENT - if (pbi->ec_active && - (mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual)) + if (corruption_detected) { - /* MB with corrupt residuals or corrupt mode/motion vectors. - * Better to use the predictor as reconstruction. 
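One step in the dequant/IDCT rework below is easy to misread: after the second-order inverse Walsh-Hadamard transform, the DC coefficient of each 4x4 luma block already holds a fully dequantized value, so the per-block dequantize must pass DC through unscaled. Hence the temporary DQC[0] = 1 override (sketch of the invariant):

/* iwalsh writes dequantized DCs into qcoeff slot 0 of every 4x4 block,
 * so inside idct_add_y_block:
 *   dqcoeff[0] = qcoeff[0] * DQC[0];   -- DQC[0] == 1, DC passes through
 *   dqcoeff[k] = qcoeff[k] * DQC[k];   -- k = 1..15, AC scaled normally
 * DQC[0] is restored afterwards because the dequant table is shared. */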
- */ - pbi->frame_corrupt_residual = 1; - vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); - vp8_conceal_corrupt_mb(xd); return; } #endif - /* dequantization and idct */ - if (mode == B_PRED) + if(!xd->mode_info_context->mbmi.mb_skip_coeff) { - for (i = 0; i < 16; i++) + /* dequantization and idct */ + if (mode != B_PRED) { - BLOCKD *b = &xd->block[i]; - int b_mode = xd->mode_info_context->bmi[i].as_mode; + short *DQC = xd->block[0].dequant; - RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict) - ( *(b->base_dst) + b->dst, b->dst_stride, b_mode, - *(b->base_dst) + b->dst, b->dst_stride ); + /* save the dc dequant constant in case it is overridden */ + short dc_dequant_temp = DQC[0]; - if (xd->eobs[i] ) + if (mode != SPLITMV) { - if (xd->eobs[i] > 1) + BLOCKD *b = &xd->block[24]; + + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) { - DEQUANT_INVOKE(&pbi->dequant, idct_add) - (b->qcoeff, b->dequant, - *(b->base_dst) + b->dst, b->dst_stride); + DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b); + + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], + xd->qcoeff); + ((int *)b->qcoeff)[0] = 0; + ((int *)b->qcoeff)[1] = 0; + ((int *)b->qcoeff)[2] = 0; + ((int *)b->qcoeff)[3] = 0; + ((int *)b->qcoeff)[4] = 0; + ((int *)b->qcoeff)[5] = 0; + ((int *)b->qcoeff)[6] = 0; + ((int *)b->qcoeff)[7] = 0; } else { - IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) - (b->qcoeff[0] * b->dequant[0], - *(b->base_dst) + b->dst, b->dst_stride, - *(b->base_dst) + b->dst, b->dst_stride); + b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], + xd->qcoeff); ((int *)b->qcoeff)[0] = 0; } + + /* override the dc dequant constant in order to preserve the + * dc components + */ + DQC[0] = 1; } - } - } - else if (mode == SPLITMV) - { - DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs); - } - else - { - BLOCKD *b = &xd->block[24]; + DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block) + (xd->qcoeff, xd->block[0].dequant, + xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); - /* do 2nd order transform on the dc block */ - if (xd->eobs[24] > 1) - { - DEQUANT_INVOKE(&pbi->dequant, block)(b); - - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - } - else - { - b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; + /* restore the dc dequant constant */ + DQC[0] = dc_dequant_temp; } - DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs, xd->block[24].diff); + DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block) + (xd->qcoeff+16*16, xd->block[16].dequant, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs+16); } - - DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block) - (xd->qcoeff+16*16, xd->block[16].dequant, - xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.uv_stride, xd->eobs+16); } - static int get_delta_q(vp8_reader *bc, int prev, int *q_update) { int ret_val = 0; @@ -476,7 +484,8 @@ static void setup_token_decoder(VP8D_COMP *pbi, const unsigned char* token_part_sizes) { vp8_reader *bool_decoder = &pbi->bc2; - int 
fragment_idx, partition_idx; + unsigned int partition_idx; + int fragment_idx; int num_token_partitions; const unsigned char *first_fragment_end = pbi->fragments[0] + pbi->fragment_sizes[0]; @@ -934,16 +943,38 @@ int vp8_decode_frame(VP8D_COMP *pbi) if (!pc->refresh_golden_frame) pc->copy_buffer_to_gf = vp8_read_literal(bc, 2); +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't copy to the golden if the bit is missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) + pc->copy_buffer_to_gf = 0; +#endif + pc->copy_buffer_to_arf = 0; if (!pc->refresh_alt_ref_frame) pc->copy_buffer_to_arf = vp8_read_literal(bc, 2); +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't copy to the alt-ref if the bit is missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) + pc->copy_buffer_to_arf = 0; +#endif + + pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc); pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc); } pc->refresh_entropy_probs = vp8_read_bit(bc); +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't refresh the probabilities if the bit is + * missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) + pc->refresh_entropy_probs = 0; +#endif if (pc->refresh_entropy_probs == 0) { vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc)); diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c index 1d4568593..ba94c58bb 100644 --- a/vp8/decoder/detokenize.c +++ b/vp8/decoder/detokenize.c @@ -15,7 +15,7 @@ #include "vpx_ports/mem.h" #include "detokenize.h" -#define BOOL_DATA UINT8 +#define BOOL_DATA unsigned char #define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) = @@ -157,10 +157,10 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); DECODE_AND_APPLYSIGN(val) \ Prob = coef_probs + (ENTROPY_NODES*2); \ if(c < 15){\ - qcoeff_ptr [ scan[c] ] = (INT16) v; \ + qcoeff_ptr [ scan[c] ] = (int16_t) v; \ ++c; \ goto DO_WHILE; }\ - qcoeff_ptr [ 15 ] = (INT16) v; \ + qcoeff_ptr [ 15 ] = (int16_t) v; \ goto BLOCK_FINISHED; @@ -172,7 +172,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); {\ range = range-split;\ value = value-bigsplit;\ - val += ((UINT16)1<<bits_count);\ + val += ((uint16_t)1<<bits_count);\ }\ else\ {\ @@ -340,12 +340,12 @@ ONE_CONTEXT_NODE_0_: if (c < 15) { - qcoeff_ptr [ scan[c] ] = (INT16) v; + qcoeff_ptr [ scan[c] ] = (int16_t) v; ++c; goto DO_WHILE; } - qcoeff_ptr [ 15 ] = (INT16) v; + qcoeff_ptr [ 15 ] = (int16_t) v; BLOCK_FINISHED: eobs[i] = c; eobtotal += c; diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c index 86fa191d3..b77d743f7 100644 --- a/vp8/decoder/error_concealment.c +++ b/vp8/decoder/error_concealment.c @@ -491,33 +491,6 @@ static void find_neighboring_blocks(MODE_INFO *mi, assert(i == 20); } -/* Calculates which reference frame type is dominating among the neighbors */ -static MV_REFERENCE_FRAME dominant_ref_frame(EC_BLOCK *neighbors) -{ - /* Default to referring to "skip" */ - MV_REFERENCE_FRAME dom_ref_frame = LAST_FRAME; - int max_ref_frame_cnt = 0; - int ref_frame_cnt[MAX_REF_FRAMES] = {0}; - int i; - /* Count neighboring reference frames */ - for (i = 0; i < NUM_NEIGHBORS; ++i) - { - if (neighbors[i].ref_frame < MAX_REF_FRAMES && - neighbors[i].ref_frame != INTRA_FRAME) - ++ref_frame_cnt[neighbors[i].ref_frame]; - } - /* Find maximum */ - for (i = 0; i < MAX_REF_FRAMES; ++i) - { - if (ref_frame_cnt[i] > max_ref_frame_cnt) - { - 
dom_ref_frame = i; - max_ref_frame_cnt = ref_frame_cnt[i]; - } - } - return dom_ref_frame; -} - /* Interpolates all motion vectors for a macroblock from the neighboring blocks' * motion vectors. */ @@ -591,7 +564,6 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, { /* Find relevant neighboring blocks */ EC_BLOCK neighbors[NUM_NEIGHBORS]; - MV_REFERENCE_FRAME dom_ref_frame; int i; /* Initialize the array. MAX_REF_FRAMES is interpreted as "doesn't exist" */ for (i = 0; i < NUM_NEIGHBORS; ++i) @@ -604,13 +576,11 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, mb_row, mb_col, mb_rows, mb_cols, mb->mode_info_stride); - /* Determine the dominant block type */ - dom_ref_frame = dominant_ref_frame(neighbors); - /* Interpolate MVs for the missing blocks - * from the dominating MVs */ - interpolate_mvs(mb, neighbors, dom_ref_frame); + /* Interpolate MVs for the missing blocks from the surrounding + * blocks which refer to the last frame. */ + interpolate_mvs(mb, neighbors, LAST_FRAME); - mb->mode_info_context->mbmi.ref_frame = dom_ref_frame; + mb->mode_info_context->mbmi.ref_frame = LAST_FRAME; mb->mode_info_context->mbmi.mode = SPLITMV; mb->mode_info_context->mbmi.uv_mode = DC_PRED; mb->mode_info_context->mbmi.partitioning = 3; diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c index 9c42bc62d..8a84e566a 100644 --- a/vp8/decoder/generic/dsystemdependent.c +++ b/vp8/decoder/generic/dsystemdependent.c @@ -10,7 +10,7 @@ #include "vpx_config.h" -#include "vp8/decoder/dequantize.h" +#include "vp8/common/dequantize.h" #include "vp8/decoder/onyxd_int.h" extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi); @@ -20,13 +20,7 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi) { /* Pure C: */ #if CONFIG_RUNTIME_CPU_DETECT - pbi->mb.rtcd = &pbi->common.rtcd; - pbi->dequant.block = vp8_dequantize_b_c; - pbi->dequant.idct_add = vp8_dequant_idct_add_c; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c; - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + pbi->mb.rtcd = &pbi->common.rtcd; #endif #if ARCH_X86 || ARCH_X86_64 diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 077954948..80648d39f 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -20,7 +20,6 @@ #include "vpx_scale/yv12extend.h" #include "vp8/common/loopfilter.h" #include "vp8/common/swapyv12buffer.h" -#include "vp8/common/g_common.h" #include "vp8/common/threading.h" #include "decoderthreading.h" #include <stdio.h> @@ -57,7 +56,7 @@ void vp8dx_initialize() } -VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) +struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf) { VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP)); @@ -117,14 +116,12 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) */ pbi->independent_partitions = 0; - return (VP8D_PTR) pbi; + return pbi; } -void vp8dx_remove_decompressor(VP8D_PTR ptr) +void vp8dx_remove_decompressor(VP8D_COMP *pbi) { - VP8D_COMP *pbi = (VP8D_COMP *) ptr; - if (!pbi) return; @@ -142,9 +139,8 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr) } -vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { - VP8D_COMP *pbi = (VP8D_COMP *) ptr; VP8_COMMON *cm = &pbi->common; int ref_fb_idx; @@ -174,9 +170,8 @@ 
vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, Y } -vpx_codec_err_t vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { - VP8D_COMP *pbi = (VP8D_COMP *) ptr; VP8_COMMON *cm = &pbi->common; int *ref_fb_ptr = NULL; int free_fb; @@ -301,19 +296,18 @@ static int swap_frame_buffers (VP8_COMMON *cm) return err; } -int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, int64_t time_stamp) +int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsigned char *source, int64_t time_stamp) { #if HAVE_ARMV7 int64_t dx_store_reg[8]; #endif - VP8D_COMP *pbi = (VP8D_COMP *) ptr; VP8_COMMON *cm = &pbi->common; int retcode = 0; /*if(pbi->ready_for_new_data == 0) return -1;*/ - if (ptr == 0) + if (pbi == 0) { return -1; } @@ -359,28 +353,38 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign pbi->fragment_sizes[0] = 0; } - if (pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0) + if (!pbi->ec_active && + pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0) { - /* This is used to signal that we are missing frames. - * We do not know if the missing frame(s) was supposed to update - * any of the reference buffers, but we act conservative and - * mark only the last buffer as corrupted. - */ - cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; - /* If error concealment is disabled we won't signal missing frames * to the decoder. */ - if (!pbi->ec_active) + if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1) { - /* Signal that we have no frame to show. */ - cm->show_frame = 0; + /* The last reference shares buffer with another reference + * buffer. Move it to its own buffer before setting it as + * corrupt, otherwise we will make multiple buffers corrupt. + */ + const int prev_idx = cm->lst_fb_idx; + cm->fb_idx_ref_cnt[prev_idx]--; + cm->lst_fb_idx = get_free_fb(cm); + vp8_yv12_copy_frame_ptr(&cm->yv12_fb[prev_idx], + &cm->yv12_fb[cm->lst_fb_idx]); + } + /* This is used to signal that we are missing frames. + * We do not know if the missing frame(s) was supposed to update + * any of the reference buffers, but we act conservative and + * mark only the last buffer as corrupted. + */ + cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; - pbi->num_fragments = 0; + /* Signal that we have no frame to show. */ + cm->show_frame = 0; - /* Nothing more to do. */ - return 0; - } + pbi->num_fragments = 0; + + /* Nothing more to do. 
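The buffer juggling in this hunk is copy-on-write in miniature: frame buffers are reference counted, and the last/golden/alt-ref indices may all point at one physical buffer, so marking a shared buffer corrupt would poison every alias. Annotated (same code as above, comments added):

/* fb_idx_ref_cnt[i] counts how many references point at buffer i */
if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1)
{
    const int prev_idx = cm->lst_fb_idx;
    cm->fb_idx_ref_cnt[prev_idx]--;         /* detach our reference     */
    cm->lst_fb_idx = get_free_fb(cm);       /* private copy, refcount 1 */
    vp8_yv12_copy_frame_ptr(&cm->yv12_fb[prev_idx],
                            &cm->yv12_fb[cm->lst_fb_idx]);
}
cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;  /* only our copy is marked  */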
*/ + return 0; } #if HAVE_ARMV7 @@ -565,10 +569,9 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign pbi->common.error.setjmp = 0; return retcode; } -int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags) +int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags) { int ret = -1; - VP8D_COMP *pbi = (VP8D_COMP *) ptr; if (pbi->ready_for_new_data == 1) return ret; diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index 519a7f2b9..cb2593b2c 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -16,7 +16,8 @@ #include "treereader.h" #include "vp8/common/onyxc_int.h" #include "vp8/common/threading.h" -#include "dequantize.h" + + #if CONFIG_ERROR_CONCEALMENT #include "ec_types.h" #endif @@ -43,7 +44,7 @@ typedef struct } DATARATE; -typedef struct VP8Decompressor +typedef struct VP8D_COMP { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -93,11 +94,6 @@ typedef struct VP8Decompressor DATARATE dr[16]; -#if CONFIG_RUNTIME_CPU_DETECT - vp8_dequant_rtcd_vtable_t dequant; -#endif - - vp8_prob prob_intra; vp8_prob prob_last; vp8_prob prob_gf; diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index eba5830d5..947b3a1c6 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -175,36 +175,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m #endif /* dequantization and idct */ - if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV) - { - BLOCKD *b = &xd->block[24]; - DEQUANT_INVOKE(&pbi->dequant, block)(b); - - /* do 2nd order transform on the dc block */ - if (xd->eobs[24] > 1) - { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - } - else - { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - } - - DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs, xd->block[24].diff); - } - else if (xd->mode_info_context->mbmi.mode == B_PRED) + if (xd->mode_info_context->mbmi.mode == B_PRED) { for (i = 0; i < 16; i++) { @@ -214,37 +185,81 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst, b->dst_stride, mb_row, mb_col, i); - if (xd->eobs[i] > 1) + if (xd->eobs[i] ) { - DEQUANT_INVOKE(&pbi->dequant, idct_add) - (b->qcoeff, b->dequant, - *(b->base_dst) + b->dst, b->dst_stride); + if (xd->eobs[i] > 1) + { + DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add) + (b->qcoeff, b->dequant, + *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) + (b->qcoeff[0] * b->dequant[0], + *(b->base_dst) + b->dst, b->dst_stride, + *(b->base_dst) + b->dst, b->dst_stride); + ((int *)b->qcoeff)[0] = 0; + } + } + } + } + else + { + short *DQC = xd->block[0].dequant; + + DECLARE_ALIGNED(16, short, local_dequant[16]); + + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + BLOCKD *b = &xd->block[24]; + + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + 
DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b); + + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], + xd->qcoeff); + ((int *)b->qcoeff)[0] = 0; + ((int *)b->qcoeff)[1] = 0; + ((int *)b->qcoeff)[2] = 0; + ((int *)b->qcoeff)[3] = 0; + ((int *)b->qcoeff)[4] = 0; + ((int *)b->qcoeff)[5] = 0; + ((int *)b->qcoeff)[6] = 0; + ((int *)b->qcoeff)[7] = 0; } else { - IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) - (b->qcoeff[0] * b->dequant[0], - *(b->base_dst) + b->dst, b->dst_stride, - *(b->base_dst) + b->dst, b->dst_stride); + b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff); ((int *)b->qcoeff)[0] = 0; } + + /* make a local copy of the dequant constants */ + vpx_memcpy(local_dequant, xd->block[0].dequant, + sizeof(local_dequant)); + + /* override the dc dequant constant */ + local_dequant[0] = 1; + + /* use the new dequant constants */ + DQC = local_dequant; } - } - else - { - DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, + + DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block) + (xd->qcoeff, DQC, xd->dst.y_buffer, xd->dst.y_stride, xd->eobs); } - DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block) + DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block) (xd->qcoeff+16*16, xd->block[16].dequant, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.uv_stride, xd->eobs+16); } - static THREAD_FUNCTION thread_decoding_proc(void *p_data) { int ithread = ((DECODETHREAD_DATA *)p_data)->ithread; diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c index 443150483..27bf5ddbd 100644 --- a/vp8/decoder/x86/x86_dsystemdependent.c +++ b/vp8/decoder/x86/x86_dsystemdependent.c @@ -13,50 +13,7 @@ #include "vpx_ports/x86.h" #include "vp8/decoder/onyxd_int.h" - -#if HAVE_MMX -void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); - -void vp8_dequantize_b_mmx(BLOCKD *d) -{ - short *sq = (short *) d->qcoeff; - short *dq = (short *) d->dqcoeff; - short *q = (short *) d->dequant; - vp8_dequantize_b_impl_mmx(sq, dq, q); -} -#endif - void vp8_arch_x86_decode_init(VP8D_COMP *pbi) { -#if CONFIG_RUNTIME_CPU_DETECT - int flags = x86_simd_caps(); - - /* Note: - * - * This platform can be built without runtime CPU detection as well. If - * you modify any of the function mappings present in this file, be sure - * to also update them in static mapings (<arch>/filename_<arch>.h) - */ - /* Override default functions with fastest ones for this CPU. 
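For context, the pattern being deleted here (and re-homed under vp8/common now that dequantization is shared code) is libvpx's usual runtime CPU dispatch. A minimal sketch with a simplified vtable; the real one lives in the common rtcd structures:

/* Fill a vtable with portable defaults, then override with SIMD
 * versions the running CPU supports. */
typedef void (*idct_add_fn)(short *q, short *dq,
                            unsigned char *dst, int stride);

typedef struct { idct_add_fn idct_add; } dequant_vtable_t;

static void dequant_rtcd_init(dequant_vtable_t *t, int caps)
{
    t->idct_add = vp8_dequant_idct_add_c;         /* C default    */
#if HAVE_MMX
    if (caps & HAS_MMX)
        t->idct_add = vp8_dequant_idct_add_mmx;   /* MMX override */
#endif
}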
*/ -#if HAVE_MMX - if (flags & HAS_MMX) - { - pbi->dequant.block = vp8_dequantize_b_mmx; - pbi->dequant.idct_add = vp8_dequant_idct_add_mmx; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx; - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx; - } -#endif -#if HAVE_SSE2 - if (flags & HAS_SSE2) - { - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; - pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; - } -#endif -#endif } diff --git a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm index 30513f912..5b7e8f66f 100644 --- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm +++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm @@ -13,6 +13,7 @@ EXPORT |vp8_encode_bool| EXPORT |vp8_stop_encode| EXPORT |vp8_encode_value| + IMPORT |vp8_validate_buffer_arm| INCLUDE asm_enc_offsets.asm @@ -22,6 +23,20 @@ AREA |.text|, CODE, READONLY + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + ; r0 BOOL_CODER *br ; r1 unsigned char *source ; r2 unsigned char *source_end @@ -43,7 +58,7 @@ ; r1 int bit ; r2 int probability |vp8_encode_bool| PROC - push {r4-r9, lr} + push {r4-r10, lr} mov r4, r2 @@ -106,6 +121,9 @@ token_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r1, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r9, r1 ; validate_buffer at pos + strb r7, [r9, r4] ; w->buffer[w->pos++] token_count_lt_zero @@ -114,7 +132,7 @@ token_count_lt_zero str r2, [r0, #vp8_writer_lowvalue] str r5, [r0, #vp8_writer_range] str r3, [r0, #vp8_writer_count] - pop {r4-r9, pc} + pop {r4-r10, pc} ENDP ; r0 BOOL_CODER *br @@ -179,6 +197,9 @@ token_high_bit_not_set_se bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r1, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r9, r1 ; validate_buffer at pos + strb r7, [r9, r4] ; w->buffer[w->pos++] token_count_lt_zero_se @@ -198,7 +219,7 @@ token_count_lt_zero_se ; r1 int data ; r2 int bits |vp8_encode_value| PROC - push {r4-r11, lr} + push {r4-r12, lr} mov r10, r2 @@ -270,6 +291,9 @@ token_high_bit_not_set_ev bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r9, r11 ; validate_buffer at pos + strb r7, [r9, r4] ; w->buffer[w->pos++] token_count_lt_zero_ev @@ -281,7 +305,7 @@ token_count_lt_zero_ev str r2, [r0, #vp8_writer_lowvalue] str r5, [r0, #vp8_writer_range] str r3, [r0, #vp8_writer_count] - pop {r4-r11, pc} + pop {r4-r12, pc} ENDP END diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm index 933717c63..a1cd46704 100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm @@ -10,6 +10,7 @@ EXPORT |vp8cx_pack_tokens_armv5| + IMPORT |vp8_validate_buffer_arm| INCLUDE asm_enc_offsets.asm @@ -19,6 +20,22 @@ AREA |.text|, CODE, READONLY + + ; macro for validating write buffer position + ; needs vp8_writer in r0 + 
; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + + ; r0 vp8_writer *w ; r1 const TOKENEXTRA *p ; r2 int xcount @@ -26,11 +43,11 @@ ; s0 vp8_extra_bits ; s1 vp8_coef_tree |vp8cx_pack_tokens_armv5| PROC - push {r4-r11, lr} + push {r4-r12, lr} + sub sp, sp, #16 ; Add size of xcount * sizeof (TOKENEXTRA) to get stop ; sizeof (TOKENEXTRA) is 8 - sub sp, sp, #12 add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA) str r2, [sp, #0] str r3, [sp, #8] ; save vp8_coef_encodings @@ -57,7 +74,7 @@ while_p_lt_stop subne r8, r8, #1 ; --n rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #52] ; vp8_coef_tree + ldr r10, [sp, #60] ; vp8_coef_tree ; v is kept in r12 during the token pack loop lsl r12, r6, r4 ; r12 = v << 32 - n @@ -128,12 +145,15 @@ token_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++] ; r10 is used earlier in the loop, but r10 is used as ; temp variable here. So after r10 is used, reload ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #52] ; vp8_coef_tree + ldr r10, [sp, #60] ; vp8_coef_tree token_count_lt_zero lsl r2, r2, r6 ; lowvalue <<= shift @@ -142,7 +162,7 @@ token_count_lt_zero bne token_loop ldrb r6, [r1, #tokenextra_token] ; t - ldr r7, [sp, #48] ; vp8_extra_bits + ldr r7, [sp, #56] ; vp8_extra_bits ; Add t * sizeof (vp8_extra_bit_struct) to get the desired ; element. Here vp8_extra_bit_struct == 16 add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t @@ -223,6 +243,9 @@ extra_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) ldr r10, [sp, #4] ; b->tree extra_count_lt_zero @@ -271,7 +294,10 @@ end_high_bit_not_set lsr r6, r2, #24 ; lowvalue >> 24 add r12, r4, #1 ; w->pos++ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] + str r12, [r0, #vp8_writer_pos] + + VALIDATE_POS r7, r12 ; validate_buffer at pos + strb r6, [r7, r4] end_count_zero skip_extra_bits @@ -284,8 +310,8 @@ check_p_lt_stop str r2, [r0, #vp8_writer_lowvalue] str r5, [r0, #vp8_writer_range] str r3, [r0, #vp8_writer_count] - add sp, sp, #12 - pop {r4-r11, pc} + add sp, sp, #16 + pop {r4-r12, pc} ENDP END diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm index 82bf71f35..1fa5e6c22 100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm @@ -10,6 +10,7 @@ EXPORT |vp8cx_pack_mb_row_tokens_armv5| + IMPORT |vp8_validate_buffer_arm| INCLUDE asm_enc_offsets.asm @@ -19,6 +20,21 @@ AREA |.text|, CODE, READONLY + + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + ; r0 VP8_COMP *cpi ; r1 vp8_writer *w ; r2 vp8_coef_encodings @@ -26,7 +42,7 @@ ; s0 
vp8_coef_tree |vp8cx_pack_mb_row_tokens_armv5| PROC - push {r4-r11, lr} + push {r4-r12, lr} sub sp, sp, #24 ; Compute address of cpi->common.mb_rows @@ -79,7 +95,7 @@ while_p_lt_stop subne r8, r8, #1 ; --n rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #60] ; vp8_coef_tree + ldr r10, [sp, #64] ; vp8_coef_tree ; v is kept in r12 during the token pack loop lsl r12, r6, r4 ; r12 = v << 32 - n @@ -150,12 +166,15 @@ token_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++] ; r10 is used earlier in the loop, but r10 is used as ; temp variable here. So after r10 is used, reload ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #60] ; vp8_coef_tree + ldr r10, [sp, #64] ; vp8_coef_tree token_count_lt_zero lsl r2, r2, r6 ; lowvalue <<= shift @@ -245,6 +264,9 @@ extra_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) ldr r10, [sp, #4] ; b->tree extra_count_lt_zero @@ -293,7 +315,10 @@ end_high_bit_not_set lsr r6, r2, #24 ; lowvalue >> 24 add r12, r4, #1 ; w->pos++ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] + str r12, [r0, #vp8_writer_pos] + + VALIDATE_POS r7, r12 ; validate_buffer at pos + strb r6, [r7, r4] end_count_zero skip_extra_bits @@ -314,7 +339,7 @@ check_p_lt_stop str r5, [r0, #vp8_writer_range] str r3, [r0, #vp8_writer_count] add sp, sp, #24 - pop {r4-r11, pc} + pop {r4-r12, pc} ENDP _VP8_COMP_common_ diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm index c061b2fab..3a183aa2f 100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm @@ -10,6 +10,7 @@ EXPORT |vp8cx_pack_tokens_into_partitions_armv5| + IMPORT |vp8_validate_buffer_arm| INCLUDE asm_enc_offsets.asm @@ -19,17 +20,31 @@ AREA |.text|, CODE, READONLY + ; macro for validating write buffer position + ; needs vp8_writer in r0 + ; start shall not be in r1 + MACRO + VALIDATE_POS $start, $pos + push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call + ldr r2, [r0, #vp8_writer_buffer_end] + ldr r3, [r0, #vp8_writer_error] + mov r1, $pos + mov r0, $start + bl vp8_validate_buffer_arm + pop {r0-r3, r12, lr} + MEND + ; r0 VP8_COMP *cpi ; r1 unsigned char *cx_data -; r2 int num_part -; r3 *size +; r2 const unsigned char *cx_data_end +; r3 int num_part ; s0 vp8_coef_encodings ; s1 vp8_extra_bits, -; s2 const vp8_tree_index *, +; s2 const vp8_tree_index * |vp8cx_pack_tokens_into_partitions_armv5| PROC - push {r4-r11, lr} - sub sp, sp, #44 + push {r4-r12, lr} + sub sp, sp, #40 ; Compute address of cpi->common.mb_rows ldr r4, _VP8_COMP_common_ @@ -39,31 +54,26 @@ ldr r5, [r4, r6] ; load up mb_rows str r5, [sp, #36] ; save mb_rows - str r1, [sp, #24] ; save cx_data - str r2, [sp, #20] ; save num_part - str r3, [sp, #8] ; save *size - - ; *size = 3*(num_part -1 ); - sub r2, r2, #1 ; num_part - 1 - add r2, r2, r2, lsl #1 ; 3*(num_part - 1) - str r2, [r3] - - add r2, r2, r1 ; cx_data + *size - str r2, [sp, #40] ; ptr + str r1, [sp, #24] ; save ptr = cx_data + str r3, [sp, #20] ; save num_part + str r2, [sp, #8] ; save cx_data_end ldr r4, _VP8_COMP_tplist_ add r4, r0, r4 ldr r7, [r4, #0] ; dereference 
cpi->tp_list str r7, [sp, #32] ; store start of cpi->tp_list - ldr r11, _VP8_COMP_bc2_ ; load up vp8_writer out of cpi + ldr r11, _VP8_COMP_bc_ ; load up vp8_writer out of cpi add r0, r0, r11 mov r11, #0 str r11, [sp, #28] ; i numparts_loop - ldr r10, [sp, #40] ; ptr + ldr r2, _vp8_writer_sz_ ; load up sizeof(vp8_writer) + add r0, r2 ; bc[i + 1] + + ldr r10, [sp, #24] ; ptr ldr r5, [sp, #36] ; move mb_rows to the counting section subs r5, r5, r11 ; move start point with each partition ; mb_rows starts at i @@ -72,6 +82,10 @@ numparts_loop ; Reset all of the VP8 Writer data for each partition that ; is processed. ; start_encode + + ldr r3, [sp, #8] + str r3, [r0, #vp8_writer_buffer_end] + mov r2, #0 ; vp8_writer_lowvalue mov r5, #255 ; vp8_writer_range mvn r3, #23 ; vp8_writer_count @@ -182,6 +196,9 @@ token_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++] ; r10 is used earlier in the loop, but r10 is used as @@ -277,6 +294,9 @@ extra_high_bit_not_set bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) ldr r10, [sp, #4] ; b->tree extra_count_lt_zero @@ -320,12 +340,15 @@ end_high_bit_not_set bne end_count_zero ldr r4, [r0, #vp8_writer_pos] - mvn r3, #7 + mvn r3, #7 ; count = -8 ldr r7, [r0, #vp8_writer_buffer] lsr r6, r2, #24 ; lowvalue >> 24 add r12, r4, #1 ; w->pos++ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] + str r12, [r0, #vp8_writer_pos] + + VALIDATE_POS r7, r12 ; validate_buffer at pos + strb r6, [r7, r4] end_count_zero skip_extra_bits @@ -401,6 +424,9 @@ token_high_bit_not_set_se bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff str r11, [r0, #vp8_writer_pos] sub r3, r3, #8 ; count -= 8 + + VALIDATE_POS r10, r11 ; validate_buffer at pos + strb r7, [r10, r4] ; w->buffer[w->pos++] token_count_lt_zero_se @@ -409,33 +435,10 @@ token_count_lt_zero_se subs r12, r12, #1 bne stop_encode_loop - ldr r10, [sp, #8] ; *size - ldr r11, [r10] ldr r4, [r0, #vp8_writer_pos] ; w->pos - add r11, r11, r4 ; *size += w->pos - str r11, [r10] - - ldr r9, [sp, #20] ; num_parts - sub r9, r9, #1 - ldr r10, [sp, #28] ; i - cmp r10, r9 ; if(i<(num_part - 1)) - bge skip_write_partition - - ldr r12, [sp, #40] ; ptr + ldr r12, [sp, #24] ; ptr add r12, r12, r4 ; ptr += w->pos - str r12, [sp, #40] - - ldr r9, [sp, #24] ; cx_data - mov r8, r4, asr #8 - strb r4, [r9, #0] - strb r8, [r9, #1] - mov r4, r4, asr #16 - strb r4, [r9, #2] - - add r9, r9, #3 ; cx_data += 3 - str r9, [sp, #24] - -skip_write_partition + str r12, [sp, #24] ldr r11, [sp, #28] ; i ldr r10, [sp, #20] ; num_parts @@ -451,9 +454,8 @@ skip_write_partition cmp r10, r11 bgt numparts_loop - - add sp, sp, #44 - pop {r4-r11, pc} + add sp, sp, #40 + pop {r4-r12, pc} ENDP _VP8_COMP_common_ @@ -462,7 +464,9 @@ _VP8_COMMON_MBrows_ DCD vp8_common_mb_rows _VP8_COMP_tplist_ DCD vp8_comp_tplist -_VP8_COMP_bc2_ - DCD vp8_comp_bc2 +_VP8_COMP_bc_ + DCD vp8_comp_bc +_vp8_writer_sz_ + DCD vp8_writer_sz END diff --git a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm index 0ca74387b..f329f8f73 100644 --- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm +++ b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm @@ -72,22 +72,23 @@ loop_block ; r0 short *diff ; r1 unsigned char 
*usrc ; r2 unsigned char *vsrc -; r3 unsigned char *pred -; stack int stride +; r3 int src_stride +; sp unsigned char *upred +; sp unsigned char *vpred +; sp int pred_stride |vp8_subtract_mbuv_armv6| PROC - stmfd sp!, {r4-r12, lr} + stmfd sp!, {r4-r11} add r0, r0, #512 ; set *diff point to Cb - add r3, r3, #256 ; set *pred point to Cb - mov r4, #8 ; loop count - ldr r5, [sp, #40] ; stride + ldr r5, [sp, #32] ; upred + ldr r12, [sp, #40] ; pred_stride ; Subtract U block loop_u - ldr r6, [r1] ; src (A) - ldr r7, [r3], #4 ; pred (A) + ldr r6, [r1] ; usrc (A) + ldr r7, [r5] ; upred (A) uxtb16 r8, r6 ; [s2 | s0] (A) uxtb16 r9, r7 ; [p2 | p0] (A) @@ -97,8 +98,8 @@ loop_u usub16 r6, r8, r9 ; [d2 | d0] (A) usub16 r7, r10, r11 ; [d3 | d1] (A) - ldr r10, [r1, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) + ldr r10, [r1, #4] ; usrc (B) + ldr r11, [r5, #4] ; upred (B) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) @@ -114,7 +115,8 @@ loop_u usub16 r6, r8, r9 ; [d2 | d0] (B) usub16 r7, r10, r11 ; [d3 | d1] (B) - add r1, r1, r5 ; update usrc pointer + add r1, r1, r3 ; update usrc pointer + add r5, r5, r12 ; update upred pointer pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) @@ -125,12 +127,13 @@ loop_u bne loop_u + ldr r5, [sp, #36] ; vpred mov r4, #8 ; loop count ; Subtract V block loop_v - ldr r6, [r2] ; src (A) - ldr r7, [r3], #4 ; pred (A) + ldr r6, [r2] ; vsrc (A) + ldr r7, [r5] ; vpred (A) uxtb16 r8, r6 ; [s2 | s0] (A) uxtb16 r9, r7 ; [p2 | p0] (A) @@ -140,8 +143,8 @@ loop_v usub16 r6, r8, r9 ; [d2 | d0] (A) usub16 r7, r10, r11 ; [d3 | d1] (A) - ldr r10, [r2, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) + ldr r10, [r2, #4] ; vsrc (B) + ldr r11, [r5, #4] ; vpred (B) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) @@ -157,7 +160,8 @@ loop_v usub16 r6, r8, r9 ; [d2 | d0] (B) usub16 r7, r10, r11 ; [d3 | d1] (B) - add r2, r2, r5 ; update vsrc pointer + add r2, r2, r3 ; update vsrc pointer + add r5, r5, r12 ; update vpred pointer pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) @@ -168,23 +172,25 @@ loop_v bne loop_v - ldmfd sp!, {r4-r12, pc} + ldmfd sp!, {r4-r11} + bx lr ENDP ; r0 short *diff ; r1 unsigned char *src -; r2 unsigned char *pred -; r3 int stride +; r2 int src_stride +; r3 unsigned char *pred +; sp int pred_stride |vp8_subtract_mby_armv6| PROC stmfd sp!, {r4-r11} - + ldr r12, [sp, #32] ; pred_stride mov r4, #16 loop ldr r6, [r1] ; src (A) - ldr r7, [r2], #4 ; pred (A) + ldr r7, [r3] ; pred (A) uxtb16 r8, r6 ; [s2 | s0] (A) uxtb16 r9, r7 ; [p2 | p0] (A) @@ -195,7 +201,7 @@ loop usub16 r7, r10, r11 ; [d3 | d1] (A) ldr r10, [r1, #4] ; src (B) - ldr r11, [r2], #4 ; pred (B) + ldr r11, [r3, #4] ; pred (B) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) @@ -212,7 +218,7 @@ loop usub16 r7, r10, r11 ; [d3 | d1] (B) ldr r10, [r1, #8] ; src (C) - ldr r11, [r2], #4 ; pred (C) + ldr r11, [r3, #8] ; pred (C) pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) @@ -229,10 +235,10 @@ loop usub16 r7, r10, r11 ; [d3 | d1] (C) ldr r10, [r1, #12] ; src (D) - ldr r11, [r2], #4 ; pred (D) + ldr r11, [r3, #12] ; pred (D) - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) + pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) + pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) str r8, [r0], #4 ; diff (C) uxtb16 r8, r10 ; [s2 | s0] (D) @@ -245,7 +251,8 @@ loop usub16 r6, r8, r9 ; [d2 | d0] 
(D) usub16 r7, r10, r11 ; [d3 | d1] (D) - add r1, r1, r3 ; update src pointer + add r1, r1, r2 ; update src pointer + add r3, r3, r12 ; update pred pointer pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D) pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D) @@ -257,7 +264,7 @@ loop bne loop ldmfd sp!, {r4-r11} - mov pc, lr + bx lr ENDP diff --git a/vp8/encoder/arm/boolhuff_arm.c b/vp8/encoder/arm/boolhuff_arm.c index 9089663ca..17a941bfc 100644 --- a/vp8/encoder/arm/boolhuff_arm.c +++ b/vp8/encoder/arm/boolhuff_arm.c @@ -10,7 +10,7 @@ #include "vp8/encoder/boolhuff.h" -#include "vp8/common/blockd.h" +#include "vpx/internal/vpx_codec_internal.h" const unsigned int vp8_prob_cost[256] = { @@ -32,3 +32,10 @@ const unsigned int vp8_prob_cost[256] = 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1 }; +int vp8_validate_buffer_arm(const unsigned char *start, + size_t len, + const unsigned char *end, + struct vpx_internal_error_info *error) +{ + return validate_buffer(start, len, end, error); +} diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm index 68c295062..91a328c29 100644 --- a/vp8/encoder/arm/neon/subtract_neon.asm +++ b/vp8/encoder/arm/neon/subtract_neon.asm @@ -61,19 +61,24 @@ ;========================================== -;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride) +;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride +; unsigned char *pred, int pred_stride) |vp8_subtract_mby_neon| PROC + push {r4-r7} mov r12, #4 + ldr r4, [sp, #16] ; pred_stride + mov r6, #32 ; "diff" stride x2 + add r5, r0, #16 ; second diff pointer subtract_mby_loop - vld1.8 {q0}, [r1], r3 ;load src - vld1.8 {q1}, [r2]! ;load pred - vld1.8 {q2}, [r1], r3 - vld1.8 {q3}, [r2]! - vld1.8 {q4}, [r1], r3 - vld1.8 {q5}, [r2]! - vld1.8 {q6}, [r1], r3 - vld1.8 {q7}, [r2]! + vld1.8 {q0}, [r1], r2 ;load src + vld1.8 {q1}, [r3], r4 ;load pred + vld1.8 {q2}, [r1], r2 + vld1.8 {q3}, [r3], r4 + vld1.8 {q4}, [r1], r2 + vld1.8 {q5}, [r3], r4 + vld1.8 {q6}, [r1], r2 + vld1.8 {q7}, [r3], r4 vsubl.u8 q8, d0, d2 vsubl.u8 q9, d1, d3 @@ -84,46 +89,53 @@ subtract_mby_loop vsubl.u8 q14, d12, d14 vsubl.u8 q15, d13, d15 - vst1.16 {q8}, [r0]! ;store diff - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! + vst1.16 {q8}, [r0], r6 ;store diff + vst1.16 {q9}, [r5], r6 + vst1.16 {q10}, [r0], r6 + vst1.16 {q11}, [r5], r6 + vst1.16 {q12}, [r0], r6 + vst1.16 {q13}, [r5], r6 + vst1.16 {q14}, [r0], r6 + vst1.16 {q15}, [r5], r6 subs r12, r12, #1 bne subtract_mby_loop + pop {r4-r7} bx lr ENDP ;================================= -;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) + |vp8_subtract_mbuv_neon| PROC - ldr r12, [sp] + push {r4-r7} + ldr r4, [sp, #16] ; upred + ldr r5, [sp, #20] ; vpred + ldr r6, [sp, #24] ; pred_stride + add r0, r0, #512 ; short *udiff = diff + 256; + mov r12, #32 ; "diff" stride x2 + add r7, r0, #16 ; second diff pointer ;u - add r0, r0, #512 ; short *udiff = diff + 256; - add r3, r3, #256 ; unsigned char *upred = pred + 256; - - vld1.8 {d0}, [r1], r12 ;load src - vld1.8 {d1}, [r3]! ;load pred - vld1.8 {d2}, [r1], r12 - vld1.8 {d3}, [r3]! - vld1.8 {d4}, [r1], r12 - vld1.8 {d5}, [r3]! 
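Back to boolhuff_arm.c above for a moment: vp8_validate_buffer_arm exists only so the hand-written assembly can reach the inline validate_buffer() helper through a plain linker symbol. The helper itself is not shown in this diff; a plausible shape, stated as an assumption:

/* Assumed body of the inline helper wrapped above (not in this diff).
 * vpx_internal_error() longjmps to the codec's error handler when one
 * is armed, so a failed check aborts the encode. */
static int validate_buffer(const unsigned char *start, size_t len,
                           const unsigned char *end,
                           struct vpx_internal_error_info *error)
{
    if (start + len > start && start + len <= end)
        return 1;                    /* the write fits in the buffer */

    vpx_internal_error(error, VPX_CODEC_CORRUPT_FRAME,
                       "Truncated packet or corrupt partition");
    return 0;
}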
- vld1.8 {d6}, [r1], r12 - vld1.8 {d7}, [r3]! - vld1.8 {d8}, [r1], r12 - vld1.8 {d9}, [r3]! - vld1.8 {d10}, [r1], r12 - vld1.8 {d11}, [r3]! - vld1.8 {d12}, [r1], r12 - vld1.8 {d13}, [r3]! - vld1.8 {d14}, [r1], r12 - vld1.8 {d15}, [r3]! + vld1.8 {d0}, [r1], r3 ;load usrc + vld1.8 {d1}, [r4], r6 ;load upred + vld1.8 {d2}, [r1], r3 + vld1.8 {d3}, [r4], r6 + vld1.8 {d4}, [r1], r3 + vld1.8 {d5}, [r4], r6 + vld1.8 {d6}, [r1], r3 + vld1.8 {d7}, [r4], r6 + vld1.8 {d8}, [r1], r3 + vld1.8 {d9}, [r4], r6 + vld1.8 {d10}, [r1], r3 + vld1.8 {d11}, [r4], r6 + vld1.8 {d12}, [r1], r3 + vld1.8 {d13}, [r4], r6 + vld1.8 {d14}, [r1], r3 + vld1.8 {d15}, [r4], r6 vsubl.u8 q8, d0, d1 vsubl.u8 q9, d2, d3 @@ -134,32 +146,32 @@ subtract_mby_loop vsubl.u8 q14, d12, d13 vsubl.u8 q15, d14, d15 - vst1.16 {q8}, [r0]! ;store diff - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! + vst1.16 {q8}, [r0], r12 ;store diff + vst1.16 {q9}, [r7], r12 + vst1.16 {q10}, [r0], r12 + vst1.16 {q11}, [r7], r12 + vst1.16 {q12}, [r0], r12 + vst1.16 {q13}, [r7], r12 + vst1.16 {q14}, [r0], r12 + vst1.16 {q15}, [r7], r12 ;v - vld1.8 {d0}, [r2], r12 ;load src - vld1.8 {d1}, [r3]! ;load pred - vld1.8 {d2}, [r2], r12 - vld1.8 {d3}, [r3]! - vld1.8 {d4}, [r2], r12 - vld1.8 {d5}, [r3]! - vld1.8 {d6}, [r2], r12 - vld1.8 {d7}, [r3]! - vld1.8 {d8}, [r2], r12 - vld1.8 {d9}, [r3]! - vld1.8 {d10}, [r2], r12 - vld1.8 {d11}, [r3]! - vld1.8 {d12}, [r2], r12 - vld1.8 {d13}, [r3]! - vld1.8 {d14}, [r2], r12 - vld1.8 {d15}, [r3]! + vld1.8 {d0}, [r2], r3 ;load vsrc + vld1.8 {d1}, [r5], r6 ;load vpred + vld1.8 {d2}, [r2], r3 + vld1.8 {d3}, [r5], r6 + vld1.8 {d4}, [r2], r3 + vld1.8 {d5}, [r5], r6 + vld1.8 {d6}, [r2], r3 + vld1.8 {d7}, [r5], r6 + vld1.8 {d8}, [r2], r3 + vld1.8 {d9}, [r5], r6 + vld1.8 {d10}, [r2], r3 + vld1.8 {d11}, [r5], r6 + vld1.8 {d12}, [r2], r3 + vld1.8 {d13}, [r5], r6 + vld1.8 {d14}, [r2], r3 + vld1.8 {d15}, [r5], r6 vsubl.u8 q8, d0, d1 vsubl.u8 q9, d2, d3 @@ -170,16 +182,18 @@ subtract_mby_loop vsubl.u8 q14, d12, d13 vsubl.u8 q15, d14, d15 - vst1.16 {q8}, [r0]! ;store diff - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! 
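Either way the arithmetic is unchanged; only the addressing differs, since the new signatures carry separate source and prediction strides. A C reference for the luma counterpart under the new signature (a sketch mirroring the generic C implementation):

/* diff is a dense 16x16 block of shorts; src and pred are strided. */
void subtract_mby_ref(short *diff,
                      const unsigned char *src, int src_stride,
                      const unsigned char *pred, int pred_stride)
{
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
            diff[c] = (short)(src[c] - pred[c]);   /* may be negative */

        diff += 16;
        src  += src_stride;
        pred += pred_stride;
    }
}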
+ vst1.16 {q8}, [r0], r12 ;store diff + vst1.16 {q9}, [r7], r12 + vst1.16 {q10}, [r0], r12 + vst1.16 {q11}, [r7], r12 + vst1.16 {q12}, [r0], r12 + vst1.16 {q13}, [r7], r12 + vst1.16 {q14}, [r0], r12 + vst1.16 {q15}, [r7], r12 + pop {r4-r7} bx lr + ENDP END diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c index e77be9f73..7fc7473ac 100644 --- a/vp8/encoder/arm/variance_arm.c +++ b/vp8/encoder/arm/variance_arm.c @@ -11,9 +11,9 @@ #include "vpx_config.h" #include "vp8/encoder/variance.h" #include "vp8/common/filter.h" -#include "vp8/common/arm/bilinearfilter_arm.h" #if HAVE_ARMV6 +#include "vp8/common/arm/bilinearfilter_arm.h" unsigned int vp8_sub_pixel_variance8x8_armv6 ( diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c index d05dab47c..2e9ca7232 100644 --- a/vp8/encoder/asm_enc_offsets.c +++ b/vp8/encoder/asm_enc_offsets.c @@ -50,6 +50,7 @@ DEFINE(vp8_writer_count, offsetof(vp8_writer, count)); DEFINE(vp8_writer_pos, offsetof(vp8_writer, pos)); DEFINE(vp8_writer_buffer, offsetof(vp8_writer, buffer)); DEFINE(vp8_writer_buffer_end, offsetof(vp8_writer, buffer_end)); +DEFINE(vp8_writer_error, offsetof(vp8_writer, error)); DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token)); DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra)); @@ -69,7 +70,8 @@ DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, b DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist)); DEFINE(vp8_comp_common, offsetof(VP8_COMP, common)); -DEFINE(vp8_comp_bc2, offsetof(VP8_COMP, bc2)); +DEFINE(vp8_comp_bc , offsetof(VP8_COMP, bc)); +DEFINE(vp8_writer_sz , sizeof(vp8_writer)); DEFINE(tokenlist_start, offsetof(TOKENLIST, start)); DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop)); diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 748b60778..669bfad9a 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -109,7 +109,7 @@ static void update_mbintra_mode_probs(VP8_COMP *cpi) { VP8_COMMON *const x = & cpi->common; - vp8_writer *const w = & cpi->bc; + vp8_writer *const w = cpi->bc; { vp8_prob Pnew [VP8_YMODES-1]; @@ -221,6 +221,11 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) w->buffer[x] += 1; } + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -281,6 +286,11 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) w->buffer[x] += 1; } + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -329,6 +339,12 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) if (!++count) { count = -8; + + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> 24); lowvalue &= 0xffffff; } @@ -358,20 +374,21 @@ static void write_partition_size(unsigned char *cx_data, int size) } -static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, unsigned char * cx_data_end, int num_part, int *size) +static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, + unsigned char * cx_data_end, + int num_part) { int i; unsigned char *ptr = cx_data; unsigned char *ptr_end = cx_data_end; unsigned int shift; - vp8_writer *w = &cpi->bc2; - *size = 3 * (num_part - 1); - cpi->partition_sz[0] += *size; - ptr = cx_data + (*size); + vp8_writer *w; + ptr = cx_data; for (i = 0; i < 
num_part; i++) { + w = cpi->bc + i + 1; vp8_start_encode(w, ptr, ptr_end); { unsigned int split; @@ -581,17 +598,7 @@ static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, } vp8_stop_encode(w); - *size += w->pos; - - /* The first partition size is set earlier */ - cpi->partition_sz[i + 1] = w->pos; - - if (i < (num_part - 1)) - { - write_partition_size(cx_data, w->pos); - cx_data += 3; - ptr += w->pos; - } + ptr += w->pos; } } @@ -664,6 +671,11 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w) w->buffer[x] += 1; } + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -724,6 +736,11 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w) w->buffer[x] += 1; } + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -770,6 +787,12 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w) if (!++count) { count = -8; + + validate_buffer(w->buffer + w->pos, + 1, + w->buffer_end, + w->error); + w->buffer[w->pos++] = (lowvalue >> 24); lowvalue &= 0xffffff; } @@ -855,44 +878,46 @@ static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACRO } } } +void vp8_convert_rfct_to_prob(VP8_COMP *const cpi) +{ + const int *const rfct = cpi->count_mb_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + + // Calculate the probabilities used to code the ref frame based on useage + if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter))) + cpi->prob_intra_coded = 1; + + cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; + if (!cpi->prob_last_coded) + cpi->prob_last_coded = 1; + + cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; + + if (!cpi->prob_gf_coded) + cpi->prob_gf_coded = 1; + +} static void pack_inter_mode_mvs(VP8_COMP *const cpi) { VP8_COMMON *const pc = & cpi->common; - vp8_writer *const w = & cpi->bc; + vp8_writer *const w = cpi->bc; const MV_CONTEXT *mvc = pc->fc.mvc; - const int *const rfct = cpi->count_mb_ref_frame_usage; - const int rf_intra = rfct[INTRA_FRAME]; - const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; MODE_INFO *m = pc->mi, *ms; const int mis = pc->mode_info_stride; int mb_row = -1; - int prob_last_coded; - int prob_gf_coded; int prob_skip_false = 0; ms = pc->mi - 1; cpi->mb.partition_info = cpi->mb.pi; - // Calculate the probabilities to be used to code the reference frame based on actual useage this frame - if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter))) - cpi->prob_intra_coded = 1; - - prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - - if (!prob_last_coded) - prob_last_coded = 1; - - prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) - ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - - if (!prob_gf_coded) - prob_gf_coded = 1; - + vp8_convert_rfct_to_prob(cpi); #ifdef ENTROPY_STATS active_section = 1; @@ -913,8 +938,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) } vp8_write_literal(w, cpi->prob_intra_coded, 8); - vp8_write_literal(w, prob_last_coded, 8); - vp8_write_literal(w, prob_gf_coded, 8); + vp8_write_literal(w, cpi->prob_last_coded, 8); + vp8_write_literal(w, cpi->prob_gf_coded, 8); update_mbintra_mode_probs(cpi); @@ -976,11 +1001,11 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) vp8_write(w, 1, cpi->prob_intra_coded); if (rf == LAST_FRAME) - vp8_write(w, 0, prob_last_coded); + vp8_write(w, 0, cpi->prob_last_coded); else { - vp8_write(w, 1, prob_last_coded); - vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, prob_gf_coded); + vp8_write(w, 1, cpi->prob_last_coded); + vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, cpi->prob_gf_coded); } { @@ -1075,7 +1100,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) static void write_kfmodes(VP8_COMP *cpi) { - vp8_writer *const bc = & cpi->bc; + vp8_writer *const bc = cpi->bc; const VP8_COMMON *const c = & cpi->common; /* const */ MODE_INFO *m = c->mi; @@ -1181,7 +1206,7 @@ static void sum_probs_over_prev_coef_context( { for (j=0; j < PREV_COEF_CONTEXTS; ++j) { - const int tmp = out[i]; + const unsigned int tmp = out[i]; out[i] += probs[j][i]; /* check for wrap */ if (out[i] < tmp) @@ -1332,6 +1357,24 @@ static int default_coef_context_savings(VP8_COMP *cpi) return savings; } +void vp8_calc_ref_frame_costs(int *ref_frame_cost, + int prob_intra, + int prob_last, + int prob_garf + ) +{ + ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(prob_intra); + ref_frame_cost[LAST_FRAME] = vp8_cost_one(prob_intra) + + vp8_cost_zero(prob_last); + ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(prob_intra) + + vp8_cost_one(prob_last) + + vp8_cost_zero(prob_garf); + ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(prob_intra) + + vp8_cost_one(prob_last) + + vp8_cost_one(prob_garf); + +} + int vp8_estimate_entropy_savings(VP8_COMP *cpi) { int savings = 0; @@ -1339,7 +1382,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) const int *const rfct = cpi->count_mb_ref_frame_usage; const int rf_intra = rfct[INTRA_FRAME]; const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; - int new_intra, new_last, gf_last, oldtotal, newtotal; + int new_intra, new_last, new_garf, oldtotal, newtotal; int ref_frame_cost[MAX_REF_FRAMES]; vp8_clear_system_state(); //__asm emms; @@ -1351,19 +1394,11 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - gf_last = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + new_garf = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - // new costs - ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(new_intra); - ref_frame_cost[LAST_FRAME] = vp8_cost_one(new_intra) - + vp8_cost_zero(new_last); - ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(new_intra) - + vp8_cost_one(new_last) - + vp8_cost_zero(gf_last); - ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(new_intra) - + vp8_cost_one(new_last) - + vp8_cost_one(gf_last); + + vp8_calc_ref_frame_costs(ref_frame_cost,new_intra,new_last,new_garf); newtotal = rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] + @@ -1373,15 +1408,8 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) // old costs - ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded); - ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(cpi->prob_last_coded); - ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(cpi->prob_last_coded) - + vp8_cost_zero(cpi->prob_gf_coded); - ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(cpi->prob_last_coded) - + vp8_cost_one(cpi->prob_gf_coded); + vp8_calc_ref_frame_costs(ref_frame_cost,cpi->prob_intra_coded, + cpi->prob_last_coded,cpi->prob_gf_coded); oldtotal = rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] + @@ -1405,7 +1433,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) static void update_coef_probs(VP8_COMP *cpi) { int i = 0; - vp8_writer *const w = & cpi->bc; + vp8_writer *const w = cpi->bc; int savings = 0; vp8_clear_system_state(); //__asm emms; @@ -1551,7 +1579,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest int i, j; VP8_HEADER oh; VP8_COMMON *const pc = & cpi->common; - vp8_writer *const bc = & cpi->bc; + vp8_writer *const bc = cpi->bc; MACROBLOCKD *const xd = & cpi->mb.e_mbd; int extra_bytes_packed = 0; @@ -1566,6 +1594,8 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest mb_feature_data_bits = vp8_mb_feature_data_bits; + bc[0].error = &pc->error; + validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error); cx_data += 3; @@ -1614,20 +1644,20 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest // Signal whether or not Segmentation is enabled - vp8_write_bit(bc, (xd->segmentation_enabled) ? 1 : 0); + vp8_write_bit(bc, xd->segmentation_enabled); // Indicate which features are enabled if (xd->segmentation_enabled) { // Signal whether or not the segmentation map is being updated. - vp8_write_bit(bc, (xd->update_mb_segmentation_map) ? 1 : 0); - vp8_write_bit(bc, (xd->update_mb_segmentation_data) ? 1 : 0); + vp8_write_bit(bc, xd->update_mb_segmentation_map); + vp8_write_bit(bc, xd->update_mb_segmentation_data); if (xd->update_mb_segmentation_data) { signed char Data; - vp8_write_bit(bc, (xd->mb_segement_abs_delta) ? 1 : 0); + vp8_write_bit(bc, xd->mb_segement_abs_delta); // For each segmentation feature (Quant and loop filter level) for (i = 0; i < MB_LVL_MAX; i++) @@ -1684,7 +1714,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest vp8_write_literal(bc, pc->sharpness_level, 3); // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled). - vp8_write_bit(bc, (xd->mode_ref_lf_delta_enabled) ? 
1 : 0); + vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled); if (xd->mode_ref_lf_delta_enabled) { @@ -1844,7 +1874,9 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest vp8_stop_encode(bc); - oh.first_partition_length_in_bytes = cpi->bc.pos; + cx_data += bc->pos; + + oh.first_partition_length_in_bytes = cpi->bc->pos; /* update frame tag */ { @@ -1858,34 +1890,58 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest dest[2] = v >> 16; } - *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc.pos; + *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos; + cpi->partition_sz[0] = *size; if (pc->multi_token_partition != ONE_PARTITION) { - int num_part; - int asize; - num_part = 1 << pc->multi_token_partition; + int num_part = 1 << pc->multi_token_partition; + + /* partition size table at the end of first partition */ + cpi->partition_sz[0] += 3 * (num_part - 1); + *size += 3 * (num_part - 1); + + validate_buffer(cx_data, 3 * (num_part - 1), cx_data_end, + &pc->error); + + for(i = 1; i < num_part + 1; i++) + { + cpi->bc[i].error = &pc->error; + } - pack_tokens_into_partitions(cpi, cx_data + bc->pos, cx_data_end, num_part, &asize); + pack_tokens_into_partitions(cpi, cx_data + 3 * (num_part - 1), + cx_data_end, num_part); - *size += asize; + for(i = 1; i < num_part; i++) + { + cpi->partition_sz[i] = cpi->bc[i].pos; + write_partition_size(cx_data, cpi->partition_sz[i]); + cx_data += 3; + *size += cpi->partition_sz[i]; /* add to total */ + } + + /* add last partition to total size */ + cpi->partition_sz[i] = cpi->bc[i].pos; + *size += cpi->partition_sz[i]; } else { - vp8_start_encode(&cpi->bc2, cx_data + bc->pos, cx_data_end); + bc[1].error = &pc->error; + + vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end); #if CONFIG_MULTITHREAD if (cpi->b_multi_threaded) - pack_mb_row_tokens(cpi, &cpi->bc2); + pack_mb_row_tokens(cpi, &cpi->bc[1]); else #endif - pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count); + pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count); - vp8_stop_encode(&cpi->bc2); + vp8_stop_encode(&cpi->bc[1]); - *size += cpi->bc2.pos; - cpi->partition_sz[1] = cpi->bc2.pos; + *size += cpi->bc[1].pos; + cpi->partition_sz[1] = cpi->bc[1].pos; } } diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h index 8a875a5bd..9007cede0 100644 --- a/vp8/encoder/bitstream.h +++ b/vp8/encoder/bitstream.h @@ -17,23 +17,27 @@ void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount, vp8_token *, vp8_extra_bit_struct *, const vp8_tree_index *); -void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *, unsigned char *, int , int *, - vp8_token *, - vp8_extra_bit_struct *, - const vp8_tree_index *); +void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *, + unsigned char * cx_data, + const unsigned char *cx_data_end, + int num_parts, + vp8_token *, + vp8_extra_bit_struct *, + const vp8_tree_index *); void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w, vp8_token *, vp8_extra_bit_struct *, const vp8_tree_index *); # define pack_tokens(a,b,c) \ vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree) -# define pack_tokens_into_partitions(a,b,unused,c,d) \ +# define pack_tokens_into_partitions(a,b,c,d) \ vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree) # define pack_mb_row_tokens(a,b) \ vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree) #else -# define pack_tokens(a,b,c) pack_tokens_c(a,b,c) -# define 
pack_tokens_into_partitions(a,b,c,d,e) pack_tokens_into_partitions_c(a,b,c,d,e) +# define pack_tokens(a,b,c) pack_tokens_c(a,b,c) +# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d) # define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b) #endif + #endif diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index 5e5a60db7..0a74ca46d 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -45,10 +45,6 @@ typedef struct unsigned char **base_src; int src; int src_stride; - -// MV enc_mv; - int force_empty; - } BLOCK; typedef struct @@ -107,7 +103,6 @@ typedef struct int mv_row_min; int mv_row_max; - int vector_range; // Used to monitor limiting range of recent vectors to guide search. int skip; int encode_breakout; diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 8ec9e27c9..6a9ba291d 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -39,7 +39,12 @@ #define IF_RTCD(x) NULL #endif extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ; - +extern void vp8_calc_ref_frame_costs(int *ref_frame_cost, + int prob_intra, + int prob_last, + int prob_garf + ); +extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi); extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); extern void vp8_auto_select_speed(VP8_COMP *cpi); extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, @@ -49,8 +54,8 @@ extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, int count); void vp8_build_block_offsets(MACROBLOCK *x); void vp8_setup_block_ptrs(MACROBLOCK *x); -int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset); -int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); +int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset, int mb_row, int mb_col); +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col); static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x ); #ifdef MODE_STATS @@ -475,14 +480,14 @@ void encode_mb_row(VP8_COMP *cpi, if (cm->frame_type == KEY_FRAME) { - *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp); + *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp, mb_row, mb_col); #ifdef MODE_STATS y_modes[xd->mbmi.mode] ++; #endif } else { - *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset); + *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col); #ifdef MODE_STATS inter_y_modes[xd->mbmi.mode] ++; @@ -590,8 +595,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) // Activity map pointer x->mb_activity_ptr = cpi->mb_activity_map; - x->vector_range = 32; - x->act_zbin_adj = 0; x->partition_info = x->pi; @@ -636,55 +639,23 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) vpx_memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols); - xd->ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded); - // Special case treatment when GF and ARF are not sensible options for reference if (cpi->ref_frame_flags == VP8_LAST_FLAG) - { - xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(255); - xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(255) - + vp8_cost_zero(128); - xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(255) - + vp8_cost_one(128); - } + 
vp8_calc_ref_frame_costs(xd->ref_frame_cost, + cpi->prob_intra_coded,255,128); else if ((cpi->oxcf.number_of_layers > 1) && (cpi->ref_frame_flags == VP8_GOLD_FLAG)) - { - xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(1); - xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(1) - + vp8_cost_zero(255); - xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(1) - + vp8_cost_one(255); - } + vp8_calc_ref_frame_costs(xd->ref_frame_cost, + cpi->prob_intra_coded,1,255); else if ((cpi->oxcf.number_of_layers > 1) && (cpi->ref_frame_flags == VP8_ALT_FLAG)) - { - xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(1); - xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(1) - + vp8_cost_zero(1); - xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(1) - + vp8_cost_one(1); - } + vp8_calc_ref_frame_costs(xd->ref_frame_cost, + cpi->prob_intra_coded,1,1); else - { - xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_zero(cpi->prob_last_coded); - xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(cpi->prob_last_coded) - + vp8_cost_zero(cpi->prob_gf_coded); - xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) - + vp8_cost_one(cpi->prob_last_coded) - + vp8_cost_one(cpi->prob_gf_coded); - } + vp8_calc_ref_frame_costs(xd->ref_frame_cost, + cpi->prob_intra_coded, + cpi->prob_last_coded, + cpi->prob_gf_coded); xd->fullpixel_mask = 0xffffffff; if(cm->full_pixel) @@ -966,31 +937,7 @@ void vp8_encode_frame(VP8_COMP *cpi) if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) || (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame))) { - const int *const rfct = cpi->count_mb_ref_frame_usage; - const int rf_intra = rfct[INTRA_FRAME]; - const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; - - if ((rf_intra + rf_inter) > 0) - { - cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter); - - if (cpi->prob_intra_coded < 1) - cpi->prob_intra_coded = 1; - - if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active) - { - cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - - if (cpi->prob_last_coded < 1) - cpi->prob_last_coded = 1; - - cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) - ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - - if (cpi->prob_gf_coded < 1) - cpi->prob_gf_coded = 1; - } - } + vp8_convert_rfct_to_prob(cpi); } #if 0 @@ -1142,8 +1089,10 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x ) #endif } -int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, + int mb_row, int mb_col) { + MACROBLOCKD *xd = &x->e_mbd; int rate; if (cpi->sf.RD && cpi->compressor_speed != 2) @@ -1163,14 +1112,17 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x); vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x); + sum_intra_stats(cpi, x); vp8_tokenize_mb(cpi, &x->e_mbd, t); - if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED) - vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd); - - vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd); + if (xd->mode_info_context->mbmi.mode != B_PRED) + vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd)); + DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block) + (xd->qcoeff+16*16, xd->block[16].dequant, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs+16); return rate; } #ifdef SPEEDSTATS @@ -1182,7 +1134,8 @@ extern void vp8_fix_contexts(MACROBLOCKD *x); int vp8cx_encode_inter_macroblock ( VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset + int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col ) { MACROBLOCKD *const xd = &x->e_mbd; @@ -1230,8 +1183,10 @@ int vp8cx_encode_inter_macroblock } else + { vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, - &distortion, &intra_error); + &distortion, &intra_error, mb_row, mb_col); + } cpi->prediction_error += distortion; cpi->intra_error += intra_error; @@ -1345,12 +1300,14 @@ int vp8cx_encode_inter_macroblock if (!x->skip) { vp8_tokenize_mb(cpi, xd, t); - if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED) - { - vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct), - &x->e_mbd); - } - vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd); + + if (xd->mode_info_context->mbmi.mode != B_PRED) + vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd)); + + DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block) + (xd->qcoeff+16*16, xd->block[16].dequant, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs+16); } else { diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index d89d74e5e..16393a1ff 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -18,7 +18,6 @@ #include "vp8/common/invtrans.h" #include "vp8/common/recon.h" #include "dct.h" -#include "vp8/common/g_common.h" #include "encodeintra.h" @@ -45,7 +44,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) vp8_encode_intra16x16mby(rtcd, x); - vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(&cpi->common.rtcd)); } else { @@ -77,8 +76,17 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, x->quantize_b(be, b); - vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16); - + if (*b->eob > 1) + { + IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct16)(b->dqcoeff, + b->predictor, 16, *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct1_scalar_add) + 
(b->dqcoeff[0], b->predictor, 16, *(b->base_dst) + b->dst, + b->dst_stride); + } } void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb) @@ -96,11 +104,12 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb) void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { BLOCK *b = &x->block[0]; + MACROBLOCKD *xd = &x->e_mbd; - RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd); + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby_s)(&x->e_mbd); - ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), - x->e_mbd.predictor, b->src_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submby) (x->src_diff, *(b->base_src), + b->src_stride, xd->dst.y_buffer, xd->dst.y_stride); vp8_transform_intra_mby(x); @@ -108,14 +117,17 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) if (x->optimize) vp8_optimize_mby(x, rtcd); - } void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { - RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd); + MACROBLOCKD *xd = &x->e_mbd; - ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv_s)(&x->e_mbd); + + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, + x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.uv_stride); vp8_transform_mbuv(x); @@ -123,5 +135,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) if (x->optimize) vp8_optimize_mbuv(x, rtcd); - } diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 80c32df1b..c9f755333 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -48,12 +48,12 @@ void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) } } -void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, + int src_stride, unsigned char *upred, + unsigned char *vpred, int pred_stride) { short *udiff = diff + 256; short *vdiff = diff + 320; - unsigned char *upred = pred + 256; - unsigned char *vpred = pred + 320; int r, c; @@ -65,8 +65,8 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, } udiff += 8; - upred += 8; - usrc += stride; + upred += pred_stride; + usrc += src_stride; } for (r = 0; r < 8; r++) @@ -77,12 +77,13 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, } vdiff += 8; - vpred += 8; - vsrc += stride; + vpred += pred_stride; + vsrc += src_stride; } } -void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) +void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, + unsigned char *pred, int pred_stride) { int r, c; @@ -94,8 +95,8 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, in } diff += 16; - pred += 16; - src += stride; + pred += pred_stride; + src += src_stride; } } @@ -103,8 +104,11 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { BLOCK *b = &x->block[0]; - ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride); - ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, 
submby)(x->src_diff, *(b->base_src), + b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, + x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer, + x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride); } static void build_dcblock(MACROBLOCK *x) @@ -621,7 +625,7 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { - vp8_build_inter_predictors_mb_e(&x->e_mbd); + vp8_build_inter_predictors_mb(&x->e_mbd); vp8_subtract_mb(rtcd, x); @@ -631,7 +635,6 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) if (x->optimize) optimize_mb(x, rtcd); - } /* this funciton is used by first pass only */ @@ -639,14 +642,15 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { BLOCK *b = &x->block[0]; - vp8_build_inter16x16_predictors_mby(&x->e_mbd); + vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer, + x->e_mbd.dst.y_stride); - ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), + b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride); transform_mby(x); vp8_quantize_mby(x); - vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); - + vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(rtcd->common)); } diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h index 8fa457aa8..0fa87cf68 100644 --- a/vp8/encoder/encodemb.h +++ b/vp8/encoder/encodemb.h @@ -12,6 +12,7 @@ #ifndef __INC_ENCODEMB_H #define __INC_ENCODEMB_H + #include "vpx_config.h" #include "block.h" @@ -28,11 +29,13 @@ void (sym)(BLOCK *be,BLOCKD *bd, int pitch) #define prototype_submby(sym) \ - void (sym)(short *diff, unsigned char *src, unsigned char *pred, int stride) + void (sym)(short *diff, unsigned char *src, int src_stride, \ + unsigned char *pred, int pred_stride) #define prototype_submbuv(sym) \ void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\ - unsigned char *pred, int stride) + int src_stride, unsigned char *upred, unsigned char *vpred,\ + int pred_stride) #if ARCH_X86 || ARCH_X86_64 #include "x86/encodemb_x86.h" diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index a4849c654..c122d038d 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -395,7 +395,7 @@ static void write_component_probs( void vp8_write_mvprobs(VP8_COMP *cpi) { - vp8_writer *const w = & cpi->bc; + vp8_writer *const w = cpi->bc; MV_CONTEXT *mvc = cpi->common.fc.mvc; int flags[2] = {0, 0}; #ifdef ENTROPY_STATS diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 557080dba..69655989d 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -38,7 +38,7 @@ static THREAD_FUNCTION loopfilter_thread(void *p_data) if (sem_wait(&cpi->h_event_start_lpf) == 0) { - if (cpi->b_multi_threaded == FALSE) // we're shutting down + if (cpi->b_multi_threaded == 0) // we're shutting down break; loopfilter_frame(cpi, cm); @@ -78,7 +78,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data) int *segment_counts = mbri->segment_counts; int *totalrate = &mbri->totalrate; - if (cpi->b_multi_threaded == FALSE) // we're shutting down + if (cpi->b_multi_threaded == 0) // we're shutting down break; for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) @@ -302,7 +302,6 @@ static void setup_mbby_copy(MACROBLOCK 
*mbdst, MACROBLOCK *mbsrc) z->mv_col_max = x->mv_col_max; z->mv_row_min = x->mv_row_min; z->mv_row_max = x->mv_row_max; - z->vector_range = x->vector_range ; */ z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4; @@ -343,12 +342,13 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) z->block[i].zbin = x->block[i].zbin; z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost; z->block[i].round = x->block[i].round; + z->q_index = x->q_index; + z->act_zbin_adj = x->act_zbin_adj; + z->last_act_zbin_adj = x->last_act_zbin_adj; /* z->block[i].src = x->block[i].src; */ z->block[i].src_stride = x->block[i].src_stride; - z->block[i].force_empty = x->block[i].force_empty; - } { @@ -418,8 +418,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, #endif mb->gf_active_ptr = x->gf_active_ptr; - mb->vector_range = 32; - vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts)); mbr_ei[i].totalrate = 0; diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 23e30508a..346c06f32 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -267,8 +267,8 @@ static void avg_stats(FIRSTPASS_STATS *section) // Calculate a modified Error used in distributing bits between easier and harder frames static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { - double av_err = ( cpi->twopass.total_stats->ssim_weighted_pred_err / - cpi->twopass.total_stats->count ); + double av_err = ( cpi->twopass.total_stats.ssim_weighted_pred_err / + cpi->twopass.total_stats.count ); double this_err = this_frame->ssim_weighted_pred_err; double modified_err; @@ -373,7 +373,7 @@ static int frame_max_bits(VP8_COMP *cpi) else { // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user - max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); + max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); } // Trap case where we are out of bits @@ -385,12 +385,12 @@ static int frame_max_bits(VP8_COMP *cpi) void vp8_init_first_pass(VP8_COMP *cpi) { - zero_stats(cpi->twopass.total_stats); + zero_stats(&cpi->twopass.total_stats); } void vp8_end_first_pass(VP8_COMP *cpi) { - output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats); + output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats); } static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset ) @@ -804,17 +804,17 @@ void vp8_first_pass(VP8_COMP *cpi) - cpi->source->ts_start; // don't want to do output stats with a stack variable! 
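The bitstream.c and encodeframe.c hunks above replace four hand-expanded copies of the reference-frame cost computation with the new vp8_calc_ref_frame_costs(), fed either by fixed probabilities or by vp8_convert_rfct_to_prob(), which, judging from the inline code it displaces, derives prob_intra/last/gf_coded from the macroblock reference-usage counts with a floor of 1. The cost model itself is a three-node binary tree: intra?, then last?, then golden-vs-altref. A rough self-contained sketch follows; libvpx uses the fixed-point table behind vp8_cost_zero()/vp8_cost_one(), and this sketch substitutes -log2() so it runs anywhere (cost0/cost1 and the example probabilities are invented here, not libvpx names).

    /* Sketch of the ref-frame signalling cost tree factored out above.
     * A VP8 probability p in [1,255] means the zero branch is taken with
     * probability p/256; -log2() stands in for the fixed-point cost table. */
    #include <math.h>
    #include <stdio.h>

    static double cost0(int p) { return -log2(p / 256.0); }         /* code a 0 */
    static double cost1(int p) { return -log2((256 - p) / 256.0); } /* code a 1 */

    int main(void)
    {
        int p_intra = 63, p_last = 128, p_garf = 128;   /* example values only */
        printf("intra  %.2f bits\n", cost0(p_intra));
        printf("last   %.2f bits\n", cost1(p_intra) + cost0(p_last));
        printf("golden %.2f bits\n", cost1(p_intra) + cost1(p_last) + cost0(p_garf));
        printf("altref %.2f bits\n", cost1(p_intra) + cost1(p_last) + cost1(p_garf));
        return 0;
    }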
- memcpy(cpi->twopass.this_frame_stats, + memcpy(&cpi->twopass.this_frame_stats, &fps, sizeof(FIRSTPASS_STATS)); - output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats); - accumulate_stats(cpi->twopass.total_stats, &fps); + output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats); + accumulate_stats(&cpi->twopass.total_stats, &fps); } // Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met if ((cm->current_video_frame > 0) && - (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) && - ((cpi->twopass.this_frame_stats->intra_error / cpi->twopass.this_frame_stats->coded_error) > 2.0)) + (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) && + ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0)) { vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12); } @@ -861,7 +861,7 @@ double bitcost( double prob ) { return -(log( prob ) / log( 2.0 )); } -static long long estimate_modemvcost(VP8_COMP *cpi, +static int64_t estimate_modemvcost(VP8_COMP *cpi, FIRSTPASS_STATS * fpstats) { int mv_cost; @@ -1019,7 +1019,7 @@ static int estimate_max_q(VP8_COMP *cpi, // averaga q observed in clip for non kf/gf.arf frames // Give average a chance to settle though. if ( (cpi->ni_frames > - ((unsigned int)cpi->twopass.total_stats->count >> 8)) && + ((unsigned int)cpi->twopass.total_stats.count >> 8)) && (cpi->ni_frames > 150) ) { cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality) @@ -1075,8 +1075,8 @@ static int estimate_cq( VP8_COMP *cpi, } // II ratio correction factor for clip as a whole - clip_iiratio = cpi->twopass.total_stats->intra_error / - DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error); + clip_iiratio = cpi->twopass.total_stats.intra_error / + DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error); clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025); if (clip_iifactor < 0.80) clip_iifactor = 0.80; @@ -1260,25 +1260,25 @@ void vp8_init_second_pass(VP8_COMP *cpi) double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); - zero_stats(cpi->twopass.total_stats); - zero_stats(cpi->twopass.total_left_stats); + zero_stats(&cpi->twopass.total_stats); + zero_stats(&cpi->twopass.total_left_stats); if (!cpi->twopass.stats_in_end) return; - *cpi->twopass.total_stats = *cpi->twopass.stats_in_end; - *cpi->twopass.total_left_stats = *cpi->twopass.total_stats; + cpi->twopass.total_stats = *cpi->twopass.stats_in_end; + cpi->twopass.total_left_stats = cpi->twopass.total_stats; // each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame // encoded in the second pass is a guess. However the sum duration is not. // Its calculated based on the actual durations of all frames from the first // pass. 
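These firstpass.c hunks, together with the vpx_free() block removed from onyx_if.c later in the diff, turn twopass.total_stats, total_left_stats and this_frame_stats from heap-allocated pointers into structs embedded in the two-pass state, so every '->' becomes '.' and the allocations can no longer fail. Separately, the comment above explains how the second pass recovers the frame rate from the summed first-pass durations; with timestamps in 1/10,000,000-second units the arithmetic is just the following (made-up numbers, not from any real clip):

    #include <stdio.h>

    int main(void)
    {
        double count    = 300.0;         /* frames in the first-pass log */
        double duration = 100000000.0;   /* 10 s in 1/10,000,000 s units */
        double fps = 10000000.0 * count / duration;   /* = 30.0 */
        printf("derived frame rate: %.2f fps\n", fps);
        return 0;
    }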
- vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats->count / cpi->twopass.total_stats->duration); + vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration); - cpi->output_frame_rate = cpi->oxcf.frame_rate; - cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ; - cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration * two_pass_min_rate / 10000000.0); + cpi->output_frame_rate = cpi->frame_rate; + cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ; + cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0); // Calculate a minimum intra value to be used in determining the IIratio // scores used in the second pass. We have this minimum to make sure @@ -1301,7 +1301,7 @@ void vp8_init_second_pass(VP8_COMP *cpi) sum_iiratio += IIRatio; } - cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count); + cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count); // Reset file position reset_fpf_position(cpi, start_pos); @@ -1376,7 +1376,7 @@ static int detect_transition_to_still( double loop_decay_rate, double decay_accumulator ) { - BOOL trans_to_still = FALSE; + int trans_to_still = 0; // Break clause to detect very still sections after motion // For example a static image after a fade or other transition @@ -1406,7 +1406,7 @@ static int detect_transition_to_still( // Only if it does do we signal a transition to still if ( j == still_interval ) - trans_to_still = TRUE; + trans_to_still = 1; } return trans_to_still; @@ -1415,14 +1415,14 @@ static int detect_transition_to_still( // This function detects a flash through the high relative pcnt_second_ref // score in the frame following a flash frame. The offset passed in should // reflect this -static BOOL detect_flash( VP8_COMP *cpi, int offset ) +static int detect_flash( VP8_COMP *cpi, int offset ) { FIRSTPASS_STATS next_frame; - BOOL flash_detected = FALSE; + int flash_detected = 0; // Read the frame data. 
- // The return is FALSE (no flash detected) if not a valid frame + // The return is 0 (no flash detected) if not a valid frame if ( read_frame_stats(cpi, &next_frame, offset) != EOF ) { // What we are looking for here is a situation where there is a @@ -1433,7 +1433,7 @@ static BOOL detect_flash( VP8_COMP *cpi, int offset ) if ( (next_frame.pcnt_second_ref > next_frame.pcnt_inter) && (next_frame.pcnt_second_ref >= 0.5 ) ) { - flash_detected = TRUE; + flash_detected = 1; /*if (1) { @@ -1548,7 +1548,7 @@ static int calc_arf_boost( double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double r; - BOOL flash_detected = FALSE; + int flash_detected = 0; // Search forward from the proposed arf/next gf position for ( i = 0; i < f_frames; i++ ) @@ -1677,7 +1677,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) int alt_boost = 0; int f_boost = 0; int b_boost = 0; - BOOL flash_detected; + int flash_detected; cpi->twopass.gf_group_bits = 0; cpi->twopass.gf_decay_rate = 0; @@ -1751,7 +1751,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) loop_decay_rate, decay_accumulator ) ) { - allow_alt_ref = FALSE; + allow_alt_ref = 0; boost_score = old_boost_score; break; } @@ -1923,7 +1923,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) int frames_bwd = cpi->oxcf.arnr_max_frames - 1; int frames_fwd = cpi->oxcf.arnr_max_frames - 1; - cpi->source_alt_ref_pending = TRUE; + cpi->source_alt_ref_pending = 1; // For alt ref frames the error score for the end frame of the // group (the alt ref frame) should not contribute to the group @@ -1949,7 +1949,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) // Note: this_frame->frame has been updated in the loop // so it now points at the ARF frame. half_gf_int = cpi->baseline_gf_interval >> 1; - frames_after_arf = cpi->twopass.total_stats->count - + frames_after_arf = cpi->twopass.total_stats.count - this_frame->frame - 1; switch (cpi->oxcf.arnr_type) @@ -1989,13 +1989,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) } else { - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; cpi->baseline_gf_interval = i; } } else { - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; cpi->baseline_gf_interval = i; } @@ -2005,7 +2005,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left. // This is also important for short clips where there may only be one // key frame. 
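These hunks also retire the BOOL/TRUE/FALSE aliases in favour of plain int and 0/1 (the common_types.h header that defined them is deleted later in this diff). The detect_flash() heuristic they touch deserves a gloss: a camera flash ruins prediction from the immediately preceding frame, so the frame after the flash predicts markedly better from its older second reference. A minimal sketch of the predicate, with a trimmed stats struct and invented values:

    #include <stdio.h>

    struct stats { double pcnt_inter, pcnt_second_ref; };

    /* Mirrors the condition in detect_flash(): the second (older) reference
     * out-predicts the previous frame and carries at least half the blocks. */
    static int flash_before(const struct stats *next)
    {
        return next->pcnt_second_ref > next->pcnt_inter &&
               next->pcnt_second_ref >= 0.5;
    }

    int main(void)
    {
        struct stats next = { 0.30, 0.65 };
        printf("flash detected: %d\n", flash_before(&next));   /* prints 1 */
        return 0;
    }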
- if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count - + if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame)) { cpi->twopass.kf_group_bits = @@ -2296,7 +2296,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) void vp8_second_pass(VP8_COMP *cpi) { int tmp_q; - int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame); + int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame); FIRSTPASS_STATS this_frame = {0}; FIRSTPASS_STATS this_frame_copy; @@ -2341,7 +2341,7 @@ void vp8_second_pass(VP8_COMP *cpi) cpi->twopass.gf_group_error_left = cpi->twopass.kf_group_error_left; cpi->baseline_gf_interval = cpi->twopass.frames_to_key; cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; } } @@ -2411,7 +2411,7 @@ void vp8_second_pass(VP8_COMP *cpi) // Account for mv, mode and other overheads. overhead_bits = estimate_modemvcost( - cpi, cpi->twopass.total_left_stats ); + cpi, &cpi->twopass.total_left_stats ); // Special case code for first frame. if (cpi->common.current_video_frame == 0) @@ -2425,7 +2425,7 @@ void vp8_second_pass(VP8_COMP *cpi) est_cq = estimate_cq( cpi, - cpi->twopass.total_left_stats, + &cpi->twopass.total_left_stats, (int)(cpi->twopass.bits_left / frames_left), overhead_bits ); @@ -2440,7 +2440,7 @@ void vp8_second_pass(VP8_COMP *cpi) tmp_q = estimate_max_q( cpi, - cpi->twopass.total_left_stats, + &cpi->twopass.total_left_stats, (int)(cpi->twopass.bits_left / frames_left), overhead_bits ); @@ -2463,16 +2463,16 @@ void vp8_second_pass(VP8_COMP *cpi) // radical adjustments to the allowed quantizer range just to use up a // few surplus bits or get beneath the target rate. 
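vp8_second_pass() above follows the usual two-pass budgeting pattern: the bits still unspent are spread over the frames still to be coded and the result is handed to estimate_cq()/estimate_max_q() as a per-frame target. A toy version of that division, including the frames_left floor the source applies, with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t bits_left   = 4500000;   /* remaining budget */
        int     frames_left = 150;       /* total_stats.count - current frame */
        if (frames_left < 1)             /* same guard as the source */
            frames_left = 1;
        printf("per-frame target: %lld bits\n",
               (long long)(bits_left / frames_left));
        return 0;
    }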
else if ( (cpi->common.current_video_frame < - (((unsigned int)cpi->twopass.total_stats->count * 255)>>8)) && + (((unsigned int)cpi->twopass.total_stats.count * 255)>>8)) && ((cpi->common.current_video_frame + cpi->baseline_gf_interval) < - (unsigned int)cpi->twopass.total_stats->count) ) + (unsigned int)cpi->twopass.total_stats.count) ) { if (frames_left < 1) frames_left = 1; tmp_q = estimate_max_q( cpi, - cpi->twopass.total_left_stats, + &cpi->twopass.total_left_stats, (int)(cpi->twopass.bits_left / frames_left), overhead_bits ); @@ -2489,13 +2489,13 @@ void vp8_second_pass(VP8_COMP *cpi) cpi->twopass.frames_to_key --; // Update the total stats remaining sturcture - subtract_stats(cpi->twopass.total_left_stats, &this_frame ); + subtract_stats(&cpi->twopass.total_left_stats, &this_frame ); } -static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame) +static int test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame) { - BOOL is_viable_kf = FALSE; + int is_viable_kf = 0; // Does the frame satisfy the primary criteria of a key frame // If so, then examine how well it predicts subsequent frames @@ -2569,13 +2569,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST // If there is tolerable prediction for at least the next 3 frames then break out else discard this pottential key frame and move on if (boost_score > 5.0 && (i > 3)) - is_viable_kf = TRUE; + is_viable_kf = 1; else { // Reset the file position reset_fpf_position(cpi, start_pos); - is_viable_kf = FALSE; + is_viable_kf = 0; } } @@ -2611,7 +2611,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) cpi->this_key_frame_forced = cpi->next_key_frame_forced; // Clear the alt ref active flag as this can never be active on a key frame - cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_active = 0; // Kf is always a gf so clear frames till next gf counter cpi->frames_till_gf_update_due = 0; @@ -2727,10 +2727,10 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) // Reset to the start of the group reset_fpf_position(cpi, current_pos); - cpi->next_key_frame_forced = TRUE; + cpi->next_key_frame_forced = 1; } else - cpi->next_key_frame_forced = FALSE; + cpi->next_key_frame_forced = 0; // Special case for the last frame of the file if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) @@ -3034,8 +3034,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (cpi->oxcf.allow_spatial_resampling) { - int resample_trigger = FALSE; - int last_kf_resampled = FALSE; + int resample_trigger = 0; + int last_kf_resampled = 0; int kf_q; int scale_val = 0; int hr, hs, vr, vs; @@ -3053,15 +3053,15 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) double effective_size_ratio; if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height)) - last_kf_resampled = TRUE; + last_kf_resampled = 1; // Set back to unscaled by defaults cpi->common.horiz_scale = NORMAL; cpi->common.vert_scale = NORMAL; // Calculate Average bits per frame. 
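The spatial-resampling decision below reduces to a little arithmetic: an average per-frame budget from the target bandwidth and frame rate, a whole-clip budget, and, in the CBR branch, a trip-wire when the projected key-frame Q exceeds worst_quality while the buffer overspend passes 5% of the clip budget (the clip_bits / 20 test). Sketch with invented numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        double  frame_rate = 30.0, count = 300.0;    /* a 10 s clip */
        int64_t target_bandwidth = 500000;           /* bits per second */
        int64_t over_spend = 300000;                 /* starting - current buffer level */

        double  av_bits_per_frame = target_bandwidth / frame_rate;
        int64_t clip_bits = (int64_t)(count * target_bandwidth / frame_rate);

        printf("avg %.0f bits/frame, clip %lld bits, overspend > 5%%: %d\n",
               av_bits_per_frame, (long long)clip_bits,
               over_spend > clip_bits / 20);          /* 300000 > 250000 -> 1 */
        return 0;
    }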
- //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats->count - cpi->common.current_video_frame); - av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate); + //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats.count - cpi->common.current_video_frame); + av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate); //if ( av_bits_per_frame < 0.0 ) // av_bits_per_frame = 0.0 @@ -3117,21 +3117,21 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)))) //( ((cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))) && // ((projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))) )) - resample_trigger = TRUE; + resample_trigger = 1; else - resample_trigger = FALSE; + resample_trigger = 0; } else { - int64_t clip_bits = (int64_t)(cpi->twopass.total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate)); + int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate)); int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level; if ((last_kf_resampled && (kf_q > cpi->worst_quality)) || // If triggered last time the threshold for triggering again is reduced ((kf_q > cpi->worst_quality) && // Projected Q higher than allowed and ... (over_spend > clip_bits / 20))) // ... Overspend > 5% of total bits - resample_trigger = TRUE; + resample_trigger = 1; else - resample_trigger = FALSE; + resample_trigger = 0; } diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c index b92e82bdf..3e582e369 100644 --- a/vp8/encoder/lookahead.c +++ b/vp8/encoder/lookahead.c @@ -48,7 +48,7 @@ vp8_lookahead_destroy(struct lookahead_ctx *ctx) { if(ctx->buf) { - int i; + unsigned int i; for(i = 0; i < ctx->max_sz; i++) vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img); @@ -65,7 +65,7 @@ vp8_lookahead_init(unsigned int width, unsigned int depth) { struct lookahead_ctx *ctx = NULL; - int i; + unsigned int i; /* Clamp the lookahead queue depth */ if(depth < 1) @@ -188,7 +188,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx, struct lookahead_entry* vp8_lookahead_peek(struct lookahead_ctx *ctx, - int index) + unsigned int index) { struct lookahead_entry* buf = NULL; diff --git a/vp8/encoder/lookahead.h b/vp8/encoder/lookahead.h index afb3fd4a9..32bafcd63 100644 --- a/vp8/encoder/lookahead.h +++ b/vp8/encoder/lookahead.h @@ -92,7 +92,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx, */ struct lookahead_entry* vp8_lookahead_peek(struct lookahead_ctx *ctx, - int index); + unsigned int index); /**\brief Get the number of frames currently in the lookahead queue diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index c1a0ea7bf..735af95ca 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -9,6 +9,7 @@ */ +#include "onyx_int.h" #include "mcomp.h" #include "vpx_mem/vpx_mem.h" #include "vpx_config.h" @@ -182,8 +183,6 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost #define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) 
< besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best -#define MIN(x,y) (((x)<(y))?(x):(y)) -#define MAX(x,y) (((x)>(y))?(x):(y)) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -331,8 +330,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #undef IFMVCV #undef ERR #undef CHECK_BETTER -#undef MIN -#undef MAX + int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, @@ -854,6 +852,8 @@ int vp8_hex_search int k = -1; int all_in; int best_site = -1; + int hex_range = 127; + int dia_range = 8; int_mv fcenter_mv; fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; @@ -873,6 +873,18 @@ int vp8_hex_search in_what_stride, 0x7fffffff) + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); +#if CONFIG_MULTI_RES_ENCODING + /* Lower search range based on prediction info */ + if (search_param >= 6) goto cal_neighbors; + else if (search_param >= 5) hex_range = 4; + else if (search_param >= 4) hex_range = 6; + else if (search_param >= 3) hex_range = 15; + else if (search_param >= 2) hex_range = 31; + else if (search_param >= 1) hex_range = 63; + + dia_range = 8; +#endif + // hex search //j=0 CHECK_BOUNDS(2) @@ -909,7 +921,7 @@ int vp8_hex_search k = best_site; } - for (j = 1; j < 127; j++) + for (j = 1; j < hex_range; j++) { best_site = -1; CHECK_BOUNDS(2) @@ -951,7 +963,7 @@ int vp8_hex_search // check 4 1-away neighbors cal_neighbors: - for (j = 0; j < 32; j++) + for (j = 0; j < dia_range; j++) { best_site = -1; CHECK_BOUNDS(1) @@ -1144,7 +1156,7 @@ int vp8_diamond_search_sadx4 int tot_steps; int_mv this_mv; - int bestsad = INT_MAX; + unsigned int bestsad = UINT_MAX; int best_site = 0; int last_site = 0; @@ -1385,7 +1397,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, unsigned char *bestaddress; int_mv *best_mv = &d->bmi.mv; int_mv this_mv; - int bestsad = INT_MAX; + unsigned int bestsad = UINT_MAX; int r, c; unsigned char *check_here; @@ -1515,7 +1527,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, unsigned char *bestaddress; int_mv *best_mv = &d->bmi.mv; int_mv this_mv; - int bestsad = INT_MAX; + unsigned int bestsad = UINT_MAX; int r, c; unsigned char *check_here; diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c new file mode 100644 index 000000000..7a62a06ec --- /dev/null +++ b/vp8/encoder/mr_dissim.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include <limits.h> +#include "vpx_config.h" +#include "onyx_int.h" +#include "mr_dissim.h" +#include "vpx_mem/vpx_mem.h" +#include "rdopt.h" + +void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) +{ + int low_res_w; + + /* Support arbitrary down-sampling factor */ + unsigned int iw = cpi->oxcf.Width*cpi->oxcf.mr_down_sampling_factor.den + + cpi->oxcf.mr_down_sampling_factor.num - 1; + + low_res_w = iw/cpi->oxcf.mr_down_sampling_factor.num; + cpi->mr_low_res_mb_cols = ((low_res_w + 15) >> 4); +} + +#define GET_MV(x) \ +if(x->mbmi.ref_frame !=INTRA_FRAME) \ +{ \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + cnt++; \ +} + +#define GET_MV_SIGN(x) \ +if(x->mbmi.ref_frame !=INTRA_FRAME) \ +{ \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + if (cm->ref_frame_sign_bias[x->mbmi.ref_frame] \ + != cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]) \ + { \ + mvx[cnt] *= -1; \ + mvy[cnt] *= -1; \ + } \ + cnt++; \ +} + +void vp8_cal_dissimilarity(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + /* Note: The first row & first column in mip are outside the frame, which + * were initialized to all 0.(ref_frame, mode, mv...) + * Their ref_frame = 0 means they won't be counted in the following + * calculation. + */ + if (cpi->oxcf.mr_total_resolutions >1 + && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1)) + { + /* Store info for show/no-show frames for supporting alt_ref. + * If parent frame is alt_ref, child has one too. + */ + if(cm->frame_type != KEY_FRAME) + { + int mb_row; + int mb_col; + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip + cm->mode_info_stride; + LOWER_RES_INFO* store_mode_info + = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++) + { + tmp++; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++) + { + int dissim = INT_MAX; + + if(tmp->mbmi.ref_frame !=INTRA_FRAME) + { + int mvx[8]; + int mvy[8]; + int mmvx; + int mmvy; + int cnt=0; + const MODE_INFO *here = tmp; + const MODE_INFO *above = here - cm->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + const MODE_INFO *aboveright = NULL; + const MODE_INFO *right = NULL; + const MODE_INFO *belowleft = NULL; + const MODE_INFO *below = NULL; + const MODE_INFO *belowright = NULL; + + /* If alternate reference frame is used, we have to + * check sign of MV. 
*/ + if(cpi->oxcf.play_alternate) + { + /* Gather mv of neighboring MBs */ + GET_MV_SIGN(above) + GET_MV_SIGN(left) + GET_MV_SIGN(aboveleft) + + if(mb_col < (cm->mb_cols-1)) + { + right = here + 1; + aboveright = above + 1; + GET_MV_SIGN(right) + GET_MV_SIGN(aboveright) + } + + if(mb_row < (cm->mb_rows-1)) + { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV_SIGN(below) + GET_MV_SIGN(belowleft) + } + + if(mb_col < (cm->mb_cols-1) + && mb_row < (cm->mb_rows-1)) + { + belowright = below + 1; + GET_MV_SIGN(belowright) + } + }else + { + /* No alt_ref and gather mv of neighboring MBs */ + GET_MV(above) + GET_MV(left) + GET_MV(aboveleft) + + if(mb_col < (cm->mb_cols-1)) + { + right = here + 1; + aboveright = above + 1; + GET_MV(right) + GET_MV(aboveright) + } + + if(mb_row < (cm->mb_rows-1)) + { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV(below) + GET_MV(belowleft) + } + + if(mb_col < (cm->mb_cols-1) + && mb_row < (cm->mb_rows-1)) + { + belowright = below + 1; + GET_MV(belowright) + } + } + + if (cnt > 0) + { + int max_mvx = mvx[0]; + int min_mvx = mvx[0]; + int max_mvy = mvy[0]; + int min_mvy = mvy[0]; + int i; + + if (cnt > 1) + { + for (i=1; i< cnt; i++) + { + if (mvx[i] > max_mvx) max_mvx = mvx[i]; + else if (mvx[i] < min_mvx) min_mvx = mvx[i]; + if (mvy[i] > max_mvy) max_mvy = mvy[i]; + else if (mvy[i] < min_mvy) min_mvy = mvy[i]; + } + } + + mmvx = MAX(abs(min_mvx - here->mbmi.mv.as_mv.row), + abs(max_mvx - here->mbmi.mv.as_mv.row)); + mmvy = MAX(abs(min_mvy - here->mbmi.mv.as_mv.col), + abs(max_mvy - here->mbmi.mv.as_mv.col)); + dissim = MAX(mmvx, mmvy); + } + } + + /* Store mode info for next resolution encoding */ + store_mode_info->mode = tmp->mbmi.mode; + store_mode_info->ref_frame = tmp->mbmi.ref_frame; + store_mode_info->mv.as_int = tmp->mbmi.mv.as_int; + store_mode_info->dissim = dissim; + tmp++; + store_mode_info++; + } + } + } + } +} diff --git a/vp8/common/common_types.h b/vp8/encoder/mr_dissim.h index 4e6248697..3d2c2035f 100644 --- a/vp8/common/common_types.h +++ b/vp8/encoder/mr_dissim.h @@ -9,10 +9,11 @@ */ -#ifndef __INC_COMMON_TYPES -#define __INC_COMMON_TYPES +#ifndef __INC_MR_DISSIM_H +#define __INC_MR_DISSIM_H +#include "vpx_config.h" -#define TRUE 1 -#define FALSE 0 +extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi); +extern void vp8_cal_dissimilarity(VP8_COMP *cpi); #endif diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 1d00e6777..e3f951925 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -23,7 +23,6 @@ #include "ratectrl.h" #include "vp8/common/quant_common.h" #include "segmentation.h" -#include "vp8/common/g_common.h" #include "vpx_scale/yv12extend.h" #if CONFIG_POSTPROC #include "vp8/common/postproc.h" @@ -36,6 +35,9 @@ #if ARCH_ARM #include "vpx_ports/arm.h" #endif +#if CONFIG_MULTI_RES_ENCODING +#include "mr_dissim.h" +#endif #include <math.h> #include <stdio.h> @@ -67,6 +69,7 @@ extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_ #endif int vp8_estimate_entropy_savings(VP8_COMP *cpi); + int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd); extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance); @@ -355,7 +358,7 @@ static void dealloc_compressor_data(VP8_COMP *cpi) vp8_de_alloc_frame_buffers(&cpi->common); - vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf); + vp8_yv12_de_alloc_frame_buffer(&cpi->pick_lf_lvl_frame); vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source); 
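The new mr_dissim.c above gives the multi-resolution encoder a per-macroblock dissimilarity score for the next resolution to consult: gather the motion vectors of up to eight neighbouring MBs (sign-corrected when an alt-ref with opposite sign bias is in play), then take the largest per-component deviation from the current MB's vector and the maximum across the two components. A self-contained sketch of that core, with the neighbour gathering flattened into plain arrays and invented vectors:

    #include <stdio.h>
    #include <stdlib.h>

    #define MAXV(a, b) ((a) > (b) ? (a) : (b))

    /* Largest deviation of any neighbour MV from (here_x, here_y),
     * per component, then maximised, as in vp8_cal_dissimilarity(). */
    static int dissim(int here_x, int here_y,
                      const int *mvx, const int *mvy, int cnt)
    {
        int min_x = mvx[0], max_x = mvx[0];
        int min_y = mvy[0], max_y = mvy[0];
        int i;

        for (i = 1; i < cnt; i++)
        {
            if (mvx[i] > max_x) max_x = mvx[i];
            else if (mvx[i] < min_x) min_x = mvx[i];
            if (mvy[i] > max_y) max_y = mvy[i];
            else if (mvy[i] < min_y) min_y = mvy[i];
        }
        return MAXV(MAXV(abs(min_x - here_x), abs(max_x - here_x)),
                    MAXV(abs(min_y - here_y), abs(max_y - here_y)));
    }

    int main(void)
    {
        int mvx[3] = { 4, -2, 6 }, mvy[3] = { 0, 8, -4 };
        printf("dissim = %d\n", dissim(2, 1, mvx, mvy, 3));  /* max(4,7) = 7 */
        return 0;
    }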
#if VP8_TEMPORAL_ALT_REF vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer); @@ -377,42 +380,25 @@ static void dealloc_compressor_data(VP8_COMP *cpi) vpx_free(cpi->mb.pip); cpi->mb.pip = 0; - -#if !(CONFIG_REALTIME_ONLY) - vpx_free(cpi->twopass.total_stats); - cpi->twopass.total_stats = 0; - - vpx_free(cpi->twopass.total_left_stats); - cpi->twopass.total_left_stats = 0; - - vpx_free(cpi->twopass.this_frame_stats); - cpi->twopass.this_frame_stats = 0; -#endif } -static void enable_segmentation(VP8_PTR ptr) +static void enable_segmentation(VP8_COMP *cpi) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - // Set the appropriate feature bit cpi->mb.e_mbd.segmentation_enabled = 1; cpi->mb.e_mbd.update_mb_segmentation_map = 1; cpi->mb.e_mbd.update_mb_segmentation_data = 1; } -static void disable_segmentation(VP8_PTR ptr) +static void disable_segmentation(VP8_COMP *cpi) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - // Clear the appropriate feature bit cpi->mb.e_mbd.segmentation_enabled = 0; } // Valid values for a segment are 0 to 3 // Segmentation map is arrange as [Rows][Columns] -static void set_segmentation_map(VP8_PTR ptr, unsigned char *segmentation_map) +static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - // Copy in the new segmentation map vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols)); @@ -429,19 +415,15 @@ static void set_segmentation_map(VP8_PTR ptr, unsigned char *segmentation_map) // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use the absolute values given). // // -static void set_segment_data(VP8_PTR ptr, signed char *feature_data, unsigned char abs_delta) +static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta; vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data)); } -static void segmentation_test_function(VP8_PTR ptr) +static void segmentation_test_function(VP8_COMP *cpi) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - unsigned char *seg_map; signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; @@ -469,10 +451,10 @@ static void segmentation_test_function(VP8_PTR ptr) }*/ // Set the segmentation Map - set_segmentation_map(ptr, seg_map); + set_segmentation_map(cpi, seg_map); // Activate segmentation. - enable_segmentation(ptr); + enable_segmentation(cpi); // Set up the quant segment data feature_data[MB_LVL_ALT_Q][0] = 0; @@ -487,7 +469,7 @@ static void segmentation_test_function(VP8_PTR ptr) // Initialise the feature data structure // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 - set_segment_data(ptr, &feature_data[0][0], SEGMENT_DELTADATA); + set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); // Delete sementation map vpx_free(seg_map); @@ -561,10 +543,10 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) } // Set the segmentation Map - set_segmentation_map((VP8_PTR)cpi, seg_map); + set_segmentation_map(cpi, seg_map); // Activate segmentation. 
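The onyx_if.c hunks in this region also swap the opaque VP8_PTR handle for the real VP8_COMP * in the internal segmentation helpers, deleting a cast at every call site, as in the enable_segmentation() call just below. Reduced to a toy struct (all names here invented), the before/after pattern is:

    #include <stdio.h>

    typedef struct { int segmentation_enabled; } COMP;

    /* before: opaque handle, callee casts, the compiler checks nothing */
    static void enable_old(void *ptr) { ((COMP *)ptr)->segmentation_enabled = 1; }

    /* after: typed pointer, checked by the compiler */
    static void enable_new(COMP *cpi) { cpi->segmentation_enabled = 1; }

    int main(void)
    {
        COMP c = { 0 };
        enable_old((void *)&c);
        enable_new(&c);
        printf("%d\n", c.segmentation_enabled);
        return 0;
    }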
- enable_segmentation((VP8_PTR)cpi); + enable_segmentation(cpi); // Set up the quant segment data feature_data[MB_LVL_ALT_Q][0] = 0; @@ -580,7 +562,7 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) // Initialise the feature data structure // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 - set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); + set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); // Delete sementation map vpx_free(seg_map); @@ -609,6 +591,93 @@ static void set_default_lf_deltas(VP8_COMP *cpi) cpi->mb.e_mbd.mode_lf_deltas[3] = 4; // Split mv } +/* Convenience macros for mapping speed and mode into a continuous + * range + */ +#define GOOD(x) (x+1) +#define RT(x) (x+7) + +static int speed_map(int speed, int *map) +{ + int res; + + do + { + res = *map++; + } while(speed >= *map++); + return res; +} + +static int thresh_mult_map_znn[] = { + /* map common to zero, nearest, and near */ + 0, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(2), 2000, INT_MAX +}; + +static int thresh_mult_map_vhpred[] = { + 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(1), 2000, + RT(7), INT_MAX, INT_MAX +}; + +static int thresh_mult_map_bpred[] = { + 2000, GOOD(0), 2500, GOOD(2), 5000, GOOD(3), 7500, RT(0), 2500, RT(1), 5000, + RT(6), INT_MAX, INT_MAX +}; + +static int thresh_mult_map_tm[] = { + 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 0, RT(1), 1000, RT(2), 2000, + RT(7), INT_MAX, INT_MAX +}; + +static int thresh_mult_map_new1[] = { + 1000, GOOD(2), 2000, RT(0), 2000, INT_MAX +}; + +static int thresh_mult_map_new2[] = { + 1000, GOOD(2), 2000, GOOD(3), 2500, GOOD(5), 4000, RT(0), 2000, RT(2), 2500, + RT(5), 4000, INT_MAX +}; + +static int thresh_mult_map_split1[] = { + 2500, GOOD(0), 1700, GOOD(2), 10000, GOOD(3), 25000, GOOD(4), INT_MAX, + RT(0), 5000, RT(1), 10000, RT(2), 25000, RT(3), INT_MAX, INT_MAX +}; + +static int thresh_mult_map_split2[] = { + 5000, GOOD(0), 4500, GOOD(2), 20000, GOOD(3), 50000, GOOD(4), INT_MAX, + RT(0), 10000, RT(1), 20000, RT(2), 50000, RT(3), INT_MAX, INT_MAX +}; + +static int mode_check_freq_map_zn2[] = { + /* {zero,nearest}{2,3} */ + 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX +}; + +static int mode_check_freq_map_vhbpred[] = { + 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX +}; + +static int mode_check_freq_map_near2[] = { + 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(10), 1<<2, RT(11), 1<<3, RT(12), 1<<4, + INT_MAX +}; + +static int mode_check_freq_map_new1[] = { + 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX +}; + +static int mode_check_freq_map_new2[] = { + 0, GOOD(5), 4, RT(0), 0, RT(3), 4, RT(10), 1<<3, RT(11), 1<<4, RT(12), 1<<5, + INT_MAX +}; + +static int mode_check_freq_map_split1[] = { + 0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX +}; + +static int mode_check_freq_map_split2[] = { + 0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX +}; + void vp8_set_speed_features(VP8_COMP *cpi) { SPEED_FEATURES *sf = &cpi->sf; @@ -617,6 +686,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) int i; VP8_COMMON *cm = &cpi->common; int last_improved_quant = sf->improved_quant; + int ref_frames; // Initialise default mode frequency sampling variables for (i = 0; i < MAX_MODES; i ++) @@ -650,93 +720,90 @@ void vp8_set_speed_features(VP8_COMP *cpi) for (i = 0; i < MAX_MODES; i++) sf->thresh_mult[i] = 0; + /* Count enabled references */ + ref_frames = 1; + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + ref_frames++; + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + ref_frames++; + 
if (cpi->ref_frame_flags & VP8_ALT_FLAG) + ref_frames++; + + /* Convert speed to continuous range, with clamping */ + if (Mode == 0) + Speed = 0; + else if (Mode == 2) + Speed = RT(Speed); + else + { + if (Speed > 5) + Speed = 5; + Speed = GOOD(Speed); + } + + sf->thresh_mult[THR_ZERO1] = + sf->thresh_mult[THR_NEAREST1] = + sf->thresh_mult[THR_NEAR1] = + sf->thresh_mult[THR_DC] = 0; /* always */ + + sf->thresh_mult[THR_ZERO2] = + sf->thresh_mult[THR_ZERO3] = + sf->thresh_mult[THR_NEAREST2] = + sf->thresh_mult[THR_NEAREST3] = + sf->thresh_mult[THR_NEAR2] = + sf->thresh_mult[THR_NEAR3] = speed_map(Speed, thresh_mult_map_znn); + + sf->thresh_mult[THR_V_PRED] = + sf->thresh_mult[THR_H_PRED] = speed_map(Speed, thresh_mult_map_vhpred); + sf->thresh_mult[THR_B_PRED] = speed_map(Speed, thresh_mult_map_bpred); + sf->thresh_mult[THR_TM] = speed_map(Speed, thresh_mult_map_tm); + sf->thresh_mult[THR_NEW1] = speed_map(Speed, thresh_mult_map_new1); + sf->thresh_mult[THR_NEW2] = + sf->thresh_mult[THR_NEW3] = speed_map(Speed, thresh_mult_map_new2); + sf->thresh_mult[THR_SPLIT1] = speed_map(Speed, thresh_mult_map_split1); + sf->thresh_mult[THR_SPLIT2] = + sf->thresh_mult[THR_SPLIT3] = speed_map(Speed, thresh_mult_map_split2); + + cpi->mode_check_freq[THR_ZERO1] = + cpi->mode_check_freq[THR_NEAREST1] = + cpi->mode_check_freq[THR_NEAR1] = + cpi->mode_check_freq[THR_TM] = + cpi->mode_check_freq[THR_DC] = 0; /* always */ + + cpi->mode_check_freq[THR_ZERO2] = + cpi->mode_check_freq[THR_ZERO3] = + cpi->mode_check_freq[THR_NEAREST2] = + cpi->mode_check_freq[THR_NEAREST3] = speed_map(Speed, + mode_check_freq_map_zn2); + + cpi->mode_check_freq[THR_NEAR2] = + cpi->mode_check_freq[THR_NEAR3] = speed_map(Speed, + mode_check_freq_map_near2); + + cpi->mode_check_freq[THR_V_PRED] = + cpi->mode_check_freq[THR_H_PRED] = + cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed, + mode_check_freq_map_vhbpred); + cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, + mode_check_freq_map_new1); + cpi->mode_check_freq[THR_NEW2] = + cpi->mode_check_freq[THR_NEW3] = speed_map(Speed, + mode_check_freq_map_new2); + cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed, + mode_check_freq_map_split1); + cpi->mode_check_freq[THR_SPLIT2] = + cpi->mode_check_freq[THR_SPLIT3] = speed_map(Speed, + mode_check_freq_map_split2); + Speed = cpi->Speed; switch (Mode) { #if !(CONFIG_REALTIME_ONLY) case 0: // best quality mode - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEARG ] = 0; - sf->thresh_mult[THR_NEARA ] = 0; - - sf->thresh_mult[THR_DC ] = 0; - - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2000; - sf->thresh_mult[THR_TM ] = 1000; - - sf->thresh_mult[THR_NEWMV ] = 1000; - sf->thresh_mult[THR_NEWG ] = 1000; - sf->thresh_mult[THR_NEWA ] = 1000; - - sf->thresh_mult[THR_SPLITMV ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 5000; - sf->thresh_mult[THR_SPLITA ] = 5000; - - sf->first_step = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; break; case 1: case 3: - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2500; - sf->thresh_mult[THR_TM ] = 1000; - - 
sf->thresh_mult[THR_NEARESTG ] = 1000; - sf->thresh_mult[THR_NEARESTA ] = 1000; - - sf->thresh_mult[THR_ZEROG ] = 1000; - sf->thresh_mult[THR_ZEROA ] = 1000; - sf->thresh_mult[THR_NEARG ] = 1000; - sf->thresh_mult[THR_NEARA ] = 1000; - -#if 1 - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEARG ] = 0; - sf->thresh_mult[THR_NEARA ] = 0; - -// sf->thresh_mult[THR_DC ] = 0; - -// sf->thresh_mult[THR_V_PRED ] = 1000; -// sf->thresh_mult[THR_H_PRED ] = 1000; -// sf->thresh_mult[THR_B_PRED ] = 2000; -// sf->thresh_mult[THR_TM ] = 1000; - - sf->thresh_mult[THR_NEWMV ] = 1000; - sf->thresh_mult[THR_NEWG ] = 1000; - sf->thresh_mult[THR_NEWA ] = 1000; - - sf->thresh_mult[THR_SPLITMV ] = 1700; - sf->thresh_mult[THR_SPLITG ] = 4500; - sf->thresh_mult[THR_SPLITA ] = 4500; -#else - sf->thresh_mult[THR_NEWMV ] = 1500; - sf->thresh_mult[THR_NEWG ] = 1500; - sf->thresh_mult[THR_NEWA ] = 1500; - - sf->thresh_mult[THR_SPLITMV ] = 5000; - sf->thresh_mult[THR_SPLITG ] = 10000; - sf->thresh_mult[THR_SPLITA ] = 10000; -#endif - if (Speed > 0) { /* Disable coefficient optimization above speed 0 */ @@ -745,83 +812,10 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->no_skip_block4x4_search = 0; sf->first_step = 1; - - cpi->mode_check_freq[THR_SPLITG] = 2; - cpi->mode_check_freq[THR_SPLITA] = 2; - cpi->mode_check_freq[THR_SPLITMV] = 0; - } - - if (Speed > 1) - { - cpi->mode_check_freq[THR_SPLITG] = 4; - cpi->mode_check_freq[THR_SPLITA] = 4; - cpi->mode_check_freq[THR_SPLITMV] = 2; - - sf->thresh_mult[THR_TM ] = 1500; - sf->thresh_mult[THR_V_PRED ] = 1500; - sf->thresh_mult[THR_H_PRED ] = 1500; - sf->thresh_mult[THR_B_PRED ] = 5000; - - if (cpi->ref_frame_flags & VP8_LAST_FLAG) - { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 10000; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 1500; - sf->thresh_mult[THR_ZEROG ] = 1500; - sf->thresh_mult[THR_NEARG ] = 1500; - sf->thresh_mult[THR_NEWG ] = 2000; - sf->thresh_mult[THR_SPLITG ] = 20000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 1500; - sf->thresh_mult[THR_ZEROA ] = 1500; - sf->thresh_mult[THR_NEARA ] = 1500; - sf->thresh_mult[THR_NEWA ] = 2000; - sf->thresh_mult[THR_SPLITA ] = 20000; - } } if (Speed > 2) { - cpi->mode_check_freq[THR_SPLITG] = 15; - cpi->mode_check_freq[THR_SPLITA] = 15; - cpi->mode_check_freq[THR_SPLITMV] = 7; - - sf->thresh_mult[THR_TM ] = 2000; - sf->thresh_mult[THR_V_PRED ] = 2000; - sf->thresh_mult[THR_H_PRED ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 7500; - - if (cpi->ref_frame_flags & VP8_LAST_FLAG) - { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 25000; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 50000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 2500; - sf->thresh_mult[THR_SPLITA ] = 50000; - } - sf->improved_quant = 0; sf->improved_dct = 0; @@ -833,18 +827,6 @@ void vp8_set_speed_features(VP8_COMP *cpi) if (Speed > 3) { - 
sf->thresh_mult[THR_SPLITA ] = INT_MAX; - sf->thresh_mult[THR_SPLITG ] = INT_MAX; - sf->thresh_mult[THR_SPLITMV ] = INT_MAX; - - cpi->mode_check_freq[THR_V_PRED] = 0; - cpi->mode_check_freq[THR_H_PRED] = 0; - cpi->mode_check_freq[THR_B_PRED] = 0; - cpi->mode_check_freq[THR_NEARG] = 0; - cpi->mode_check_freq[THR_NEWG] = 0; - cpi->mode_check_freq[THR_NEARA] = 0; - cpi->mode_check_freq[THR_NEWA] = 0; - sf->auto_filter = 1; sf->recode_loop = 0; // recode loop off sf->RD = 0; // Turn rd off @@ -854,38 +836,6 @@ void vp8_set_speed_features(VP8_COMP *cpi) if (Speed > 4) { sf->auto_filter = 0; // Faster selection of loop filter - - cpi->mode_check_freq[THR_V_PRED] = 2; - cpi->mode_check_freq[THR_H_PRED] = 2; - cpi->mode_check_freq[THR_B_PRED] = 2; - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - cpi->mode_check_freq[THR_NEARG] = 2; - cpi->mode_check_freq[THR_NEWG] = 4; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - cpi->mode_check_freq[THR_NEARA] = 2; - cpi->mode_check_freq[THR_NEWA] = 4; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 4000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 4000; - } } break; @@ -895,67 +845,10 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->recode_loop = 0; sf->auto_filter = 1; sf->iterative_sub_pixel = 1; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_TM ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2500; - sf->thresh_mult[THR_NEARESTG ] = 1000; - sf->thresh_mult[THR_ZEROG ] = 1000; - sf->thresh_mult[THR_NEARG ] = 1000; - sf->thresh_mult[THR_NEARESTA ] = 1000; - sf->thresh_mult[THR_ZEROA ] = 1000; - sf->thresh_mult[THR_NEARA ] = 1000; - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_NEWG ] = 2000; - sf->thresh_mult[THR_NEWA ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 5000; - sf->thresh_mult[THR_SPLITG ] = 10000; - sf->thresh_mult[THR_SPLITA ] = 10000; sf->search_method = NSTEP; if (Speed > 0) { - cpi->mode_check_freq[THR_SPLITG] = 4; - cpi->mode_check_freq[THR_SPLITA] = 4; - cpi->mode_check_freq[THR_SPLITMV] = 2; - - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_TM ] = 1000; - sf->thresh_mult[THR_V_PRED ] = 2000; - sf->thresh_mult[THR_H_PRED ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 5000; - - if (cpi->ref_frame_flags & VP8_LAST_FLAG) - { - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 10000; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 1000; - sf->thresh_mult[THR_ZEROG ] = 1000; - sf->thresh_mult[THR_NEARG ] = 1000; - sf->thresh_mult[THR_NEWG ] = 2000; - sf->thresh_mult[THR_SPLITG ] = 20000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 1000; - sf->thresh_mult[THR_ZEROA ] = 1000; - sf->thresh_mult[THR_NEARA ] = 1000; - sf->thresh_mult[THR_NEWA ] = 2000; - sf->thresh_mult[THR_SPLITA ] = 20000; - } - sf->improved_quant = 0; sf->improved_dct = 0; @@ -964,133 +857,28 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->first_step = 1; } - 
if (Speed > 1) - { - cpi->mode_check_freq[THR_SPLITMV] = 7; - cpi->mode_check_freq[THR_SPLITG] = 15; - cpi->mode_check_freq[THR_SPLITA] = 15; - - sf->thresh_mult[THR_TM ] = 2000; - sf->thresh_mult[THR_V_PRED ] = 2000; - sf->thresh_mult[THR_H_PRED ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 5000; - - if (cpi->ref_frame_flags & VP8_LAST_FLAG) - { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 25000; - } - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 50000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 2500; - sf->thresh_mult[THR_SPLITA ] = 50000; - } - - } - if (Speed > 2) - { sf->auto_filter = 0; // Faster selection of loop filter - cpi->mode_check_freq[THR_V_PRED] = 2; - cpi->mode_check_freq[THR_H_PRED] = 2; - cpi->mode_check_freq[THR_B_PRED] = 2; - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - cpi->mode_check_freq[THR_NEARG] = 2; - cpi->mode_check_freq[THR_NEWG] = 4; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - cpi->mode_check_freq[THR_NEARA] = 2; - cpi->mode_check_freq[THR_NEWA] = 4; - } - - sf->thresh_mult[THR_SPLITMV ] = INT_MAX; - sf->thresh_mult[THR_SPLITG ] = INT_MAX; - sf->thresh_mult[THR_SPLITA ] = INT_MAX; - - } - if (Speed > 3) { sf->RD = 0; - sf->auto_filter = 1; } if (Speed > 4) { sf->auto_filter = 0; // Faster selection of loop filter - sf->search_method = HEX; - //sf->search_method = DIAMOND; - sf->iterative_sub_pixel = 0; - - cpi->mode_check_freq[THR_V_PRED] = 4; - cpi->mode_check_freq[THR_H_PRED] = 4; - cpi->mode_check_freq[THR_B_PRED] = 4; - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - cpi->mode_check_freq[THR_NEARG] = 2; - cpi->mode_check_freq[THR_NEWG] = 4; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - cpi->mode_check_freq[THR_NEARA] = 2; - cpi->mode_check_freq[THR_NEWA] = 4; - } - - sf->thresh_mult[THR_TM ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 5000; - - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 4000; - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 4000; - } - } - - if (Speed > 5) - { - // Disable split MB intra prediction mode - sf->thresh_mult[THR_B_PRED] = INT_MAX; } if (Speed > 6) { - unsigned int i, sum = 0; + unsigned int sum = 0; unsigned int total_mbs = cm->MBs; - int thresh; - int total_skip; + int i, thresh; + unsigned int total_skip; int min = 2000; @@ -1122,109 +910,53 @@ void vp8_set_speed_features(VP8_COMP *cpi) if (thresh < 2000) thresh = 2000; - if (cpi->ref_frame_flags & VP8_LAST_FLAG) + if (ref_frames > 1) { - sf->thresh_mult[THR_NEWMV] = thresh; - sf->thresh_mult[THR_NEARESTMV ] = thresh >> 1; - sf->thresh_mult[THR_NEARMV ] = thresh >> 1; + sf->thresh_mult[THR_NEW1 ] = thresh; + sf->thresh_mult[THR_NEAREST1 ] = thresh >> 1; + sf->thresh_mult[THR_NEAR1 ] = thresh >> 1; } - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + if (ref_frames > 2) { - sf->thresh_mult[THR_NEWG] = thresh << 1; - sf->thresh_mult[THR_NEARESTG ] = thresh; - sf->thresh_mult[THR_NEARG ] = 
thresh; + sf->thresh_mult[THR_NEW2] = thresh << 1; + sf->thresh_mult[THR_NEAREST2 ] = thresh; + sf->thresh_mult[THR_NEAR2 ] = thresh; } - if (cpi->ref_frame_flags & VP8_ALT_FLAG) + if (ref_frames > 3) { - sf->thresh_mult[THR_NEWA] = thresh << 1; - sf->thresh_mult[THR_NEARESTA ] = thresh; - sf->thresh_mult[THR_NEARA ] = thresh; + sf->thresh_mult[THR_NEW3] = thresh << 1; + sf->thresh_mult[THR_NEAREST3 ] = thresh; + sf->thresh_mult[THR_NEAR3 ] = thresh; } - // Disable other intra prediction modes - sf->thresh_mult[THR_TM] = INT_MAX; - sf->thresh_mult[THR_V_PRED] = INT_MAX; - sf->thresh_mult[THR_H_PRED] = INT_MAX; - sf->improved_mv_pred = 0; } if (Speed > 8) - { sf->quarter_pixel_search = 0; - } - if (Speed > 9) + if(cm->version == 0) { - int Tmp = cpi->Speed - 8; - - if (Tmp > 4) - Tmp = 4; + cm->filter_type = NORMAL_LOOPFILTER; - if (cpi->ref_frame_flags & VP8_GOLD_FLAG) - { - cpi->mode_check_freq[THR_ZEROG] = 1 << (Tmp - 1); - cpi->mode_check_freq[THR_NEARESTG] = 1 << (Tmp - 1); - cpi->mode_check_freq[THR_NEARG] = 1 << Tmp; - cpi->mode_check_freq[THR_NEWG] = 1 << (Tmp + 1); - } - - if (cpi->ref_frame_flags & VP8_ALT_FLAG) - { - cpi->mode_check_freq[THR_ZEROA] = 1 << (Tmp - 1); - cpi->mode_check_freq[THR_NEARESTA] = 1 << (Tmp - 1); - cpi->mode_check_freq[THR_NEARA] = 1 << Tmp; - cpi->mode_check_freq[THR_NEWA] = 1 << (Tmp + 1); - } - - cpi->mode_check_freq[THR_NEWMV] = 1 << (Tmp - 1); + if (Speed >= 14) + cm->filter_type = SIMPLE_LOOPFILTER; } - - cm->filter_type = NORMAL_LOOPFILTER; - - if (Speed >= 14) + else + { cm->filter_type = SIMPLE_LOOPFILTER; + } + // This has a big hit on quality. Last resort if (Speed >= 15) - { - sf->half_pixel_search = 0; // This has a big hit on quality. Last resort - } + sf->half_pixel_search = 0; vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins)); }; /* switch */ - /* disable frame modes if flags not set */ - if (!(cpi->ref_frame_flags & VP8_LAST_FLAG)) - { - sf->thresh_mult[THR_NEWMV ] = INT_MAX; - sf->thresh_mult[THR_NEARESTMV] = INT_MAX; - sf->thresh_mult[THR_ZEROMV ] = INT_MAX; - sf->thresh_mult[THR_NEARMV ] = INT_MAX; - sf->thresh_mult[THR_SPLITMV ] = INT_MAX; - } - - if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG)) - { - sf->thresh_mult[THR_NEARESTG ] = INT_MAX; - sf->thresh_mult[THR_ZEROG ] = INT_MAX; - sf->thresh_mult[THR_NEARG ] = INT_MAX; - sf->thresh_mult[THR_NEWG ] = INT_MAX; - sf->thresh_mult[THR_SPLITG ] = INT_MAX; - } - - if (!(cpi->ref_frame_flags & VP8_ALT_FLAG)) - { - sf->thresh_mult[THR_NEARESTA ] = INT_MAX; - sf->thresh_mult[THR_ZEROA ] = INT_MAX; - sf->thresh_mult[THR_NEARA ] = INT_MAX; - sf->thresh_mult[THR_NEWA ] = INT_MAX; - sf->thresh_mult[THR_SPLITA ] = INT_MAX; - } - - // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. 
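
Just above, the Speed > 6 path now derives its thresholds from the error-bin census and the new ref_frames count instead of per-flag blocks. With the floor value thresh = 2000 that gives THR_NEW1 = 2000 and THR_NEAREST1 = THR_NEAR1 = 1000 on the first enabled reference, but THR_NEW2/3 = 4000 and THR_NEAREST2/3 = THR_NEAR2/3 = 2000 on the second and third, so candidate modes on the extra references are pruned roughly twice as hard. The long per-flag INT_MAX blocks deleted at the end of the function become unnecessary for the same reason: a disabled reference never enters ref_frame_map in pickinter.c, so its modes are skipped before any threshold is consulted.
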
if ( cpi->pass == 1 ) @@ -1306,6 +1038,9 @@ void vp8_set_speed_features(VP8_COMP *cpi) frames_at_speed[cpi->Speed]++; #endif } +#undef GOOD +#undef RT + static void alloc_raw_frame_buffers(VP8_COMP *cpi) { int width = (cpi->oxcf.Width + 15) & ~15; @@ -1365,7 +1100,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) height += 16 - (height & 0xf); - if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf, + if (vp8_yv12_alloc_frame_buffer(&cpi->pick_lf_lvl_frame, width, height, VP8BORDERINPIXELS)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); @@ -1406,25 +1141,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) vpx_calloc(sizeof(unsigned int), cm->mb_rows * cm->mb_cols)); -#if !(CONFIG_REALTIME_ONLY) - vpx_free(cpi->twopass.total_stats); - - cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); - - vpx_free(cpi->twopass.total_left_stats); - cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); - - vpx_free(cpi->twopass.this_frame_stats); - - cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS)); - - if( !cpi->twopass.total_stats || - !cpi->twopass.total_left_stats || - !cpi->twopass.this_frame_stats) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to allocate firstpass stats"); -#endif - #if CONFIG_MULTITHREAD if (width < 640) cpi->mt_sync_range = 1; @@ -1436,7 +1152,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) cpi->mt_sync_range = 16; #endif - vpx_free(cpi->tplist); + vpx_free(cpi->tplist); CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows)); } @@ -1470,8 +1186,8 @@ void vp8_new_frame_rate(VP8_COMP *cpi, double framerate) if(framerate < .1) framerate = 30; - cpi->oxcf.frame_rate = framerate; - cpi->output_frame_rate = cpi->oxcf.frame_rate; + cpi->frame_rate = framerate; + cpi->output_frame_rate = framerate; cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth; @@ -1513,9 +1229,8 @@ rescale(int val, int num, int denom) } -static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) +static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; cpi->oxcf = *oxcf; @@ -1527,8 +1242,20 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->version = oxcf->Version; vp8_setup_version(cm); + /* frame rate is not available on the first frame, as it's derived from + * the observed timestamps. The actual value used here doesn't matter + * too much, as it will adapt quickly. If the reciprocal of the timebase + * seems like a reasonable framerate, then use that as a guess, otherwise + * use 30. + */ + cpi->frame_rate = (double)(oxcf->timebase.den) / + (double)(oxcf->timebase.num); + + if (cpi->frame_rate > 180) + cpi->frame_rate = 30; + // change includes all joint functionality - vp8_change_config(ptr, oxcf); + vp8_change_config(cpi, oxcf); // Initialize active best and worst q and average q values. 
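
The timebase-reciprocal guess added above is worth a worked example: a timebase of 1/30 gives an initial frame_rate of 30, which looks plausible and is kept, while a 90 kHz media clock (timebase 1/90000) gives 90000, trips the > 180 cap, and falls back to 30 until observed timestamps take over. A hedged sketch of the heuristic (the function name is illustrative, not part of the patch):

    /* Guess an initial frame rate from the configured timebase; the
     * estimate is replaced once real timestamp deltas are observed. */
    static double guess_frame_rate(int num, int den)
    {
        double fps = (double)den / (double)num;
        return (fps > 180) ? 30 : fps;
    }
    /* guess_frame_rate(1, 30)    -> 30.0 (reciprocal looks like a rate)
     * guess_frame_rate(1, 90000) -> 30.0 (implausible, use the default) */
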
cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; @@ -1550,8 +1277,8 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) // Temporal scalabilty if (cpi->oxcf.number_of_layers > 1) { - int i; - int prev_layer_frame_rate=0; + unsigned int i; + double prev_layer_frame_rate=0; for (i=0; i<cpi->oxcf.number_of_layers; i++) { @@ -1619,9 +1346,8 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) } -void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) +void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; if (!cpi) @@ -1787,7 +1513,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->oxcf.target_bandwidth, 1000); // Set up frame rate and related parameters rate control values. - vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); + vp8_new_frame_rate(cpi, cpi->frame_rate); // Set absolute upper and lower quality limits cpi->worst_quality = cpi->oxcf.worst_allowed_q; @@ -1813,7 +1539,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->active_best_quality = cpi->oxcf.worst_allowed_q; } - cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE; + cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0; cpi->cq_target_quality = cpi->oxcf.cq_level; @@ -1912,19 +1638,14 @@ static void cal_mvsadcosts(int *mvsadcost[2]) while (++i <= mvfp_max); } -VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) +struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) { int i; - volatile union - { - VP8_COMP *cpi; - VP8_PTR ptr; - } ctx; VP8_COMP *cpi; VP8_COMMON *cm; - cpi = ctx.cpi = vpx_memalign(32, sizeof(VP8_COMP)); + cpi = vpx_memalign(32, sizeof(VP8_COMP)); // Check that the CPI instance is valid if (!cpi) return 0; @@ -1935,10 +1656,8 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) if (setjmp(cm->error.jmp)) { - VP8_PTR ptr = ctx.ptr; - - ctx.cpi->common.error.setjmp = 0; - vp8_remove_compressor(&ptr); + cpi->common.error.setjmp = 0; + vp8_remove_compressor(&cpi); return 0; } @@ -1949,7 +1668,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) vp8_create_common(&cpi->common); vp8_cmachine_specific_config(cpi); - init_config((VP8_PTR)cpi, oxcf); + init_config(cpi, oxcf); memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob)); cpi->common.current_video_frame = 0; @@ -2028,7 +1747,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->cyclic_refresh_map = (signed char *) NULL; // Test function for segmentation - //segmentation_test_function((VP8_PTR) cpi); + //segmentation_test_function( cpi); #ifdef ENTROPY_STATS init_context_counters(); @@ -2039,11 +1758,11 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->frames_since_key = 8; // Give a sensible default for the first frame. cpi->key_frame_frequency = cpi->oxcf.key_freq; - cpi->this_key_frame_forced = FALSE; - cpi->next_key_frame_forced = FALSE; + cpi->this_key_frame_forced = 0; + cpi->next_key_frame_forced = 0; - cpi->source_alt_ref_pending = FALSE; - cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_pending = 0; + cpi->source_alt_ref_active = 0; cpi->common.refresh_alt_ref_frame = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; @@ -2241,14 +1960,21 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) vp8_loop_filter_init(cm); cpi->common.error.setjmp = 0; - return (VP8_PTR) cpi; + +#if CONFIG_MULTI_RES_ENCODING + /* Calculate # of MBs in a row in lower-resolution level image. 
*/ + if (cpi->oxcf.mr_encoder_id > 0) + vp8_cal_low_res_mb_cols(cpi); +#endif + + return cpi; } -void vp8_remove_compressor(VP8_PTR *ptr) +void vp8_remove_compressor(VP8_COMP **ptr) { - VP8_COMP *cpi = (VP8_COMP *)(*ptr); + VP8_COMP *cpi = *ptr; if (!cpi) return; @@ -2408,7 +2134,7 @@ void vp8_remove_compressor(VP8_PTR *ptr) { extern int count_mb_seg[4]; FILE *f = fopen("modes.stt", "a"); - double dr = (double)cpi->oxcf.frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ; + double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ; fprintf(f, "intra_mode in Intra Frames:\n"); fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]); fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]); @@ -2659,20 +2385,16 @@ static void generate_psnr_packet(VP8_COMP *cpi) } -int vp8_use_as_reference(VP8_PTR ptr, int ref_frame_flags) +int vp8_use_as_reference(VP8_COMP *cpi, int ref_frame_flags) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - if (ref_frame_flags > 7) return -1 ; cpi->ref_frame_flags = ref_frame_flags; return 0; } -int vp8_update_reference(VP8_PTR ptr, int ref_frame_flags) +int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); - if (ref_frame_flags > 7) return -1 ; @@ -2692,9 +2414,8 @@ int vp8_update_reference(VP8_PTR ptr, int ref_frame_flags) return 0; } -int vp8_get_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +int vp8_get_reference(VP8_COMP *cpi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; int ref_fb_idx; @@ -2711,9 +2432,8 @@ int vp8_get_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONF return 0; } -int vp8_set_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +int vp8_set_reference(VP8_COMP *cpi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { - VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; int ref_fb_idx; @@ -2731,9 +2451,8 @@ int vp8_set_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONF return 0; } -int vp8_update_entropy(VP8_PTR comp, int update) +int vp8_update_entropy(VP8_COMP *cpi, int update) { - VP8_COMP *cpi = (VP8_COMP *) comp; VP8_COMMON *cm = &cpi->common; cm->refresh_entropy_probs = update; @@ -2889,10 +2608,10 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi) cpi->common.frames_since_golden = 0; // Clear the alternate reference update pending flag. 
- cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; // Set the alternate refernce frame active flag - cpi->source_alt_ref_active = TRUE; + cpi->source_alt_ref_active = 1; } @@ -2955,12 +2674,12 @@ static void update_golden_frame_stats(VP8_COMP *cpi) if (cpi->oxcf.fixed_q >= 0 && cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) { - cpi->source_alt_ref_pending = TRUE; + cpi->source_alt_ref_pending = 1; cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; } if (!cpi->source_alt_ref_pending) - cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_active = 0; // Decrement count down till next gf if (cpi->frames_till_gf_update_due > 0) @@ -2994,8 +2713,7 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; -#if 0 - const int *const rfct = cpi->recent_ref_frame_usage; + const int *const rfct = cpi->count_mb_ref_frame_usage; const int rf_intra = rfct[INTRA_FRAME]; const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; @@ -3007,100 +2725,10 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) } else if (!(rf_intra + rf_inter)) { - // This is a trap in case this function is called with cpi->recent_ref_frame_usage[] blank. cpi->prob_intra_coded = 63; cpi->prob_last_coded = 128; cpi->prob_gf_coded = 128; } - else - { - cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter); - - if (cpi->prob_intra_coded < 1) - cpi->prob_intra_coded = 1; - - if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active) - { - cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - - if (cpi->prob_last_coded < 1) - cpi->prob_last_coded = 1; - - cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) - ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - - if (cpi->prob_gf_coded < 1) - cpi->prob_gf_coded = 1; - } - } - -#else - const int *const rfct = cpi->count_mb_ref_frame_usage; - const int rf_intra = rfct[INTRA_FRAME]; - const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; - - if (cm->frame_type == KEY_FRAME) - { - cpi->prob_intra_coded = 255; - cpi->prob_last_coded = 128; - cpi->prob_gf_coded = 128; - } - else if (!(rf_intra + rf_inter)) - { - if (cpi->oxcf.number_of_layers > 1) - { - if (cpi->ref_frame_flags == VP8_LAST_FLAG) - { - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 255; - cpi->prob_gf_coded = 128; - } - else if (cpi->ref_frame_flags == VP8_GOLD_FLAG) - { - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 1; - cpi->prob_gf_coded = 255; - } - else if (cpi->ref_frame_flags == VP8_ALT_FLAG) - { - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 1; - cpi->prob_gf_coded = 1; - } - else - { - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 128; - cpi->prob_gf_coded = 128; - } - } - else - { - // This is a trap in case this function is called with - // cpi->recent_ref_frame_usage[] blank. - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 128; - cpi->prob_gf_coded = 128; - } - } - else - { - cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter); - - if (cpi->prob_intra_coded < 1) - cpi->prob_intra_coded = 1; - - cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - - if (cpi->prob_last_coded < 1) - cpi->prob_last_coded = 1; - - cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) - ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - - if (cpi->prob_gf_coded < 1) - cpi->prob_gf_coded = 1; - } // update reference frame costs since we can do better than what we got last frame. if (cpi->oxcf.number_of_layers == 1) @@ -3114,7 +2742,6 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) else if (cpi->common.frames_since_golden == 0) { cpi->prob_last_coded = 214; - cpi->prob_gf_coded = 1; } else if (cpi->common.frames_since_golden == 1) { @@ -3123,14 +2750,14 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) } else if (cpi->source_alt_ref_active) { - //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden; cpi->prob_gf_coded -= 20; if (cpi->prob_gf_coded < 10) cpi->prob_gf_coded = 10; } + if (!cpi->source_alt_ref_active) + cpi->prob_gf_coded = 255; } -#endif } @@ -3139,12 +2766,12 @@ static int decide_key_frame(VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; - int code_key_frame = FALSE; + int code_key_frame = 0; cpi->kf_boost = 0; if (cpi->Speed > 11) - return FALSE; + return 0; // Clear down mmx registers vp8_clear_system_state(); //__asm emms; @@ -3186,10 +2813,10 @@ static int decide_key_frame(VP8_COMP *cpi) && (change > .25 || change2 > .25)) { /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/ - return TRUE; + return 1; } - return FALSE; + return 0; } @@ -3199,7 +2826,7 @@ static int decide_key_frame(VP8_COMP *cpi) ((cpi->this_frame_percent_intra > 95) && (cpi->this_frame_percent_intra >= (cpi->last_frame_percent_intra + 5)))) { - code_key_frame = TRUE; + code_key_frame = 1; } // in addition if the following are true and this is not a golden frame then code a key frame // Note that on golden frames there often seems to be a pop in intra useage anyway hence this @@ -3212,7 +2839,7 @@ static int decide_key_frame(VP8_COMP *cpi) (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 10)))) { if (!cm->refresh_golden_frame) - code_key_frame = TRUE; + code_key_frame = 1; } return code_key_frame; @@ -3268,11 +2895,11 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) // Function to test for conditions that indeicate we should loop // back and recode a frame. 
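
Net effect of the update_rd_ref_frame_probs cleanup above: the dead #if 0 copy and the duplicated live copy collapse into one, the usage-count normalization and the temporal-layers special cases are dropped, and the probabilities now carry over between frames, adjusted only by the fixed post-golden heuristics plus the new rule that prob_gf_coded saturates to 255 whenever no alt ref is active (which is also why the old prob_gf_coded = 1 assignment goes away). For scale, these are 8-bit bool-coder probabilities out of 256, so the fixed prob_last_coded = 214 makes the likely branch cost roughly a quarter of a bit. A small illustration of that cost (interpretation of which branch is "likely" is hedged, not taken from the patch):

    #include <math.h>
    #include <stdio.h>

    /* Approximate per-flag bit cost for an 8-bit probability p/256;
     * p = 214 is the fixed prob_last_coded used right after a golden
     * frame update in the surviving code. */
    int main(void)
    {
        double p = 214.0 / 256.0;
        printf("likely branch: %.2f bits, unlikely: %.2f bits\n",
               -log2(p), -log2(1.0 - p));   /* ~0.26 and ~2.61 bits */
        return 0;
    }
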
-static BOOL recode_loop_test( VP8_COMP *cpi, +static int recode_loop_test( VP8_COMP *cpi, int high_limit, int low_limit, int q, int maxq, int minq ) { - BOOL force_recode = FALSE; + int force_recode = 0; VP8_COMMON *cm = &cpi->common; // Is frame recode allowed at all @@ -3288,7 +2915,7 @@ static BOOL recode_loop_test( VP8_COMP *cpi, if ( ((cpi->projected_frame_size > high_limit) && (q < maxq)) || ((cpi->projected_frame_size < low_limit) && (q > minq)) ) { - force_recode = TRUE; + force_recode = 1; } // Special Constrained quality tests else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) @@ -3298,14 +2925,14 @@ static BOOL recode_loop_test( VP8_COMP *cpi, (cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3))) { - force_recode = TRUE; + force_recode = 1; } // Severe undershoot and between auto and user cq level else if ( (q > cpi->oxcf.cq_level) && (cpi->projected_frame_size < cpi->min_frame_bandwidth) && (cpi->active_best_quality > cpi->oxcf.cq_level)) { - force_recode = TRUE; + force_recode = 1; cpi->active_best_quality = cpi->oxcf.cq_level; } } @@ -3456,7 +3083,7 @@ static void encode_frame_to_data_rate int frame_over_shoot_limit; int frame_under_shoot_limit; - int Loop = FALSE; + int Loop = 0; int loop_count; int this_q; int last_zbin_oq; @@ -3468,10 +3095,10 @@ static void encode_frame_to_data_rate int top_index; int bottom_index; VP8_COMMON *cm = &cpi->common; - int active_worst_qchanged = FALSE; + int active_worst_qchanged = 0; - int overshoot_seen = FALSE; - int undershoot_seen = FALSE; + int overshoot_seen = 0; + int undershoot_seen = 0; int drop_mark = cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100; int drop_mark75 = drop_mark * 2 / 3; int drop_mark50 = drop_mark / 4; @@ -3482,7 +3109,7 @@ static void encode_frame_to_data_rate vp8_clear_system_state(); // Test code for segmentation of gf/arf (0,0) - //segmentation_test_function((VP8_PTR) cpi); + //segmentation_test_function( cpi); if (cpi->compressor_speed == 2) { @@ -3522,12 +3149,12 @@ static void encode_frame_to_data_rate // Enable or disable mode based tweaking of the zbin // For 2 Pass Only used where GF/ARF prediction quality // is above a threshold - cpi->zbin_mode_boost_enabled = TRUE; + cpi->zbin_mode_boost_enabled = 1; if (cpi->pass == 2) { if ( cpi->gfu_boost <= 400 ) { - cpi->zbin_mode_boost_enabled = FALSE; + cpi->zbin_mode_boost_enabled = 0; } } @@ -3568,7 +3195,7 @@ static void encode_frame_to_data_rate } // The alternate reference frame cannot be active for a key frame - cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_active = 0; // Reset the RD threshold multipliers to default of * 1 (128) for (i = 0; i < MAX_MODES; i++) @@ -3580,9 +3207,9 @@ static void encode_frame_to_data_rate // Test code for segmentation //if ( (cm->frame_type == KEY_FRAME) || ((cm->current_video_frame % 2) == 0)) //if ( (cm->current_video_frame % 2) == 0 ) - // enable_segmentation((VP8_PTR)cpi); + // enable_segmentation(cpi); //else - // disable_segmentation((VP8_PTR)cpi); + // disable_segmentation(cpi); #if 0 // Experimental code for lagged compress and one pass @@ -3676,7 +3303,7 @@ static void encode_frame_to_data_rate if (cpi->oxcf.number_of_layers > 1) { - int i; + unsigned int i; // Propagate bits saved by dropping the frame to higher layers for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++) @@ -4080,7 +3707,7 @@ static void encode_frame_to_data_rate vp8_pick_frame_size(cpi); // Clear the Alt reference frame active flag when we have a key frame - 
cpi->source_alt_ref_active = FALSE; + cpi->source_alt_ref_active = 0; // Reset the loop filter deltas and segmentation map setup_features(cpi); @@ -4105,7 +3732,7 @@ static void encode_frame_to_data_rate q_high = cpi->active_worst_quality; loop_count++; - Loop = TRUE; + Loop = 1; continue; } @@ -4133,10 +3760,10 @@ static void encode_frame_to_data_rate } // If we have updated the active max Q do not call vp8_update_rate_correction_factors() this loop. - active_worst_qchanged = TRUE; + active_worst_qchanged = 1; } else - active_worst_qchanged = FALSE; + active_worst_qchanged = 0; #if !(CONFIG_REALTIME_ONLY) // Special case handling for forced key frames @@ -4172,7 +3799,7 @@ static void encode_frame_to_data_rate else if (Q < q_low) Q = q_low; - Loop = ((Q != last_q)) ? TRUE : FALSE; + Loop = Q != last_q; } // Is the projected frame size out of range and are we allowed to attempt to recode. @@ -4229,7 +3856,7 @@ static void encode_frame_to_data_rate } } - overshoot_seen = TRUE; + overshoot_seen = 1; } // Frame is too small else @@ -4279,7 +3906,7 @@ static void encode_frame_to_data_rate } } - undershoot_seen = TRUE; + undershoot_seen = 1; } // Clamp Q to upper and lower limits: @@ -4291,18 +3918,18 @@ static void encode_frame_to_data_rate // Clamp cpi->zbin_over_quant cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? zbin_oq_high : cpi->zbin_over_quant; - //Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE; - Loop = ((Q != last_q)) ? TRUE : FALSE; + //Loop = (Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant); + Loop = Q != last_q; last_zbin_oq = cpi->zbin_over_quant; } else #endif - Loop = FALSE; + Loop = 0; if (cpi->is_src_frame_alt_ref) - Loop = FALSE; + Loop = 0; - if (Loop == TRUE) + if (Loop == 1) { vp8_restore_coding_context(cpi); loop_count++; @@ -4311,7 +3938,7 @@ static void encode_frame_to_data_rate #endif } } - while (Loop == TRUE); + while (Loop == 1); #if 0 // Experimental code for lagged and one pass @@ -4345,13 +3972,20 @@ static void encode_frame_to_data_rate IF_RTCD(&cpi->rtcd.variance)); } - // This frame's MVs are saved and will be used in next frame's MV prediction. - // Last frame has one more line(add to bottom) and one more column(add to right) than cm->mip. The edge elements are initialized to 0. - if(cm->show_frame) //do not save for altref frame + /* This frame's MVs are saved and will be used in next frame's MV predictor. + * Last frame has one more line(add to bottom) and one more column(add to + * right) than cm->mip. The edge elements are initialized to 0. + */ +#if CONFIG_MULTI_RES_ENCODING + if(!cpi->oxcf.mr_encoder_id && cm->show_frame) +#else + if(cm->show_frame) /* do not save for altref frame */ +#endif { int mb_row; int mb_col; - MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays. + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip; if(cm->frame_type != KEY_FRAME) { @@ -4370,6 +4004,10 @@ static void encode_frame_to_data_rate } } +#if CONFIG_MULTI_RES_ENCODING + vp8_cal_dissimilarity(cpi); +#endif + // Update the GF useage maps. // This is done after completing the compression of a frame when all // modes etc. 
are finalized but before loop filter @@ -4442,7 +4080,7 @@ static void encode_frame_to_data_rate if (cpi->oxcf.number_of_layers > 1) { - int i; + unsigned int i; for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++) cpi->layer_context[i].total_byte_count += (*size); } @@ -4509,7 +4147,7 @@ static void encode_frame_to_data_rate (cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) && (cpi->projected_frame_size > (4 * cpi->this_frame_target))) { - cpi->drop_frame = TRUE; + cpi->drop_frame = 1; } #endif @@ -4553,7 +4191,7 @@ static void encode_frame_to_data_rate // Propagate values to higher temporal layers if (cpi->oxcf.number_of_layers > 1) { - int i; + unsigned int i; for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++) { @@ -4856,7 +4494,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, { double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *cpi->oxcf.two_pass_vbrmin_section / 100); - cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate); + cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate); } } #endif @@ -4868,12 +4506,11 @@ extern void vp8_pop_neon(int64_t *store); #endif -int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) +int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { #if HAVE_ARMV7 int64_t store_reg[8]; #endif - VP8_COMP *cpi = (VP8_COMP *) ptr; VP8_COMMON *cm = &cpi->common; struct vpx_usec_timer timer; int res = 0; @@ -4922,13 +4559,12 @@ static int frame_is_reference(const VP8_COMP *cpi) } -int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush) +int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush) { #if HAVE_ARMV7 int64_t store_reg[8]; #endif - VP8_COMP *cpi = (VP8_COMP *) ptr; - VP8_COMMON *cm = &cpi->common; + VP8_COMMON *cm; struct vpx_usec_timer tsctimer; struct vpx_usec_timer ticktimer; struct vpx_usec_timer cmptimer; @@ -4937,12 +4573,14 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if (!cpi) return -1; - if (setjmp(cpi->common.error.jmp)){ + cm = &cpi->common; + + if (setjmp(cpi->common.error.jmp)) + { cpi->common.error.setjmp = 0; return VPX_CODEC_CORRUPT_FRAME; } - cpi->bc.error = &cpi->common.error; cpi->common.error.setjmp = 1; #if HAVE_ARMV7 @@ -4979,7 +4617,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cm->refresh_golden_frame = 0; cm->refresh_last_frame = 0; cm->show_frame = 0; - cpi->source_alt_ref_pending = FALSE; // Clear Pending alt Ref flag. + cpi->source_alt_ref_pending = 0; // Clear Pending alt Ref flag. 
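
The multi-res gating above divides the work between resolution levels: only the lowest-resolution encoder (mr_encoder_id 0) keeps the temporal MV store for next-frame prediction, while vp8_cal_dissimilarity() contributes the dissim field of the per-MB record that higher-resolution encoders read back in pickinter.c. A sketch of that record, with field names following the LOWER_RES_INFO accesses later in this patch and types assumed by the sketch:

    /* Per-MB result a lower-resolution encode leaves for the next level
     * up; exact layout lives outside this diff. */
    typedef struct
    {
        int ref_frame;                 /* reference chosen for the parent MB */
        int mode;                      /* prediction mode of the parent MB */
        int dissim;                    /* parent-MB dissimilarity score */
        struct { short row, col; } mv; /* parent MV, stored in 1/8-pel units
                                        * (note the >>3 full-pel conversions) */
    } lower_res_info_sketch;
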
cpi->is_src_frame_alt_ref = 0; } } @@ -5092,7 +4730,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if(interval > 10000000.0) interval = 10000000; - avg_duration = 10000000.0 / cpi->oxcf.frame_rate; + avg_duration = 10000000.0 / cpi->frame_rate; avg_duration *= (interval - avg_duration + this_duration); avg_duration /= interval; @@ -5200,6 +4838,17 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc)); } + // Save the contexts separately for alt ref, gold and last. + // (TODO jbb -> Optimize this with pointers to avoid extra copies. ) + if(cm->refresh_alt_ref_frame) + vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc)); + + if(cm->refresh_golden_frame) + vpx_memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc)); + + if(cm->refresh_last_frame) + vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc)); + // if its a dropped frame honor the requests on subsequent frames if (*size > 0) { @@ -5400,10 +5049,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon return 0; } -int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags) +int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags) { - VP8_COMP *cpi = (VP8_COMP *) comp; - if (cpi->common.refresh_alt_ref_frame) return -1; else @@ -5432,9 +5079,8 @@ int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflag } } -int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]) +int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]) { - VP8_COMP *cpi = (VP8_COMP *) comp; signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols) @@ -5442,15 +5088,15 @@ int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned if (!map) { - disable_segmentation((VP8_PTR)cpi); + disable_segmentation(cpi); return 0; } // Set the segmentation Map - set_segmentation_map((VP8_PTR)cpi, map); + set_segmentation_map(cpi, map); // Activate segmentation. 
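
Two additions in this region reward a closer look. The rolling duration average in vp8_get_compressed_data is a proportional-gain update: avg *= (interval - avg + this_duration); avg /= interval is algebraically avg += avg * (this_duration - avg) / interval, so the estimate drifts toward each observed frame duration with gain avg/interval. And the new lfc_n/lfc_a/lfc_g copies snapshot the running entropy context into whichever per-reference slot the current frame refreshes (the in-tree TODO already flags the memcpy cost). A sketch of the average with concrete numbers (function name illustrative; units are the code's own 10,000,000-ticks-per-second scale):

    /* Rolling average of observed frame durations, as in the hunk above. */
    static double update_avg_duration(double avg, double this_duration,
                                      double interval)
    {
        if (interval > 10000000.0)
            interval = 10000000.0;
        avg *= (interval - avg + this_duration);
        avg /= interval;
        return avg;
    }
    /* At 30 fps, avg starts at 10000000/30 ~= 333333 ticks. One slow frame:
     * update_avg_duration(333333, 400000, 10000000) ~= 335555, i.e. the
     * estimate moves toward the new duration with gain avg/interval, ~1/30. */
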
- enable_segmentation((VP8_PTR)cpi); + enable_segmentation(cpi); // Set up the quant segment data feature_data[MB_LVL_ALT_Q][0] = delta_q[0]; @@ -5471,15 +5117,13 @@ int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned // Initialise the feature data structure // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 - set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); + set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); return 0; } -int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols) +int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols) { - VP8_COMP *cpi = (VP8_COMP *) comp; - if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { if (map) @@ -5499,10 +5143,8 @@ int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsi } } -int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode) +int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode) { - VP8_COMP *cpi = (VP8_COMP *) comp; - if (horiz_mode <= ONETWO) cpi->common.horiz_scale = horiz_mode; else @@ -5544,8 +5186,7 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const } -int vp8_get_quantizer(VP8_PTR c) +int vp8_get_quantizer(VP8_COMP *cpi) { - VP8_COMP *cpi = (VP8_COMP *) c; return cpi->common.base_qindex; } diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index a0828a479..46951e3b9 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -58,6 +58,9 @@ #define MAX_PERIODICITY 16 +#define MAX(x,y) (((x)>(y))?(x):(y)) +#define MIN(x,y) (((x)<(y))?(x):(y)) + typedef struct { int kf_indicated; @@ -133,32 +136,32 @@ typedef struct typedef enum { - THR_ZEROMV = 0, + THR_ZERO1 = 0, THR_DC = 1, - THR_NEARESTMV = 2, - THR_NEARMV = 3, + THR_NEAREST1 = 2, + THR_NEAR1 = 3, - THR_ZEROG = 4, - THR_NEARESTG = 5, + THR_ZERO2 = 4, + THR_NEAREST2 = 5, - THR_ZEROA = 6, - THR_NEARESTA = 7, + THR_ZERO3 = 6, + THR_NEAREST3 = 7, - THR_NEARG = 8, - THR_NEARA = 9, + THR_NEAR2 = 8, + THR_NEAR3 = 9, THR_V_PRED = 10, THR_H_PRED = 11, THR_TM = 12, - THR_NEWMV = 13, - THR_NEWG = 14, - THR_NEWA = 15, + THR_NEW1 = 13, + THR_NEW2 = 14, + THR_NEW3 = 15, - THR_SPLITMV = 16, - THR_SPLITG = 17, - THR_SPLITA = 18, + THR_SPLIT1 = 16, + THR_SPLIT2 = 17, + THR_SPLIT3 = 18, THR_B_PRED = 19, } @@ -256,7 +259,7 @@ typedef struct int buffer_level; int bits_off_target; - long long total_actual_bits; + int64_t total_actual_bits; int total_target_vs_actual; int worst_quality; @@ -276,7 +279,7 @@ typedef struct int zbin_over_quant; int inter_frame_target; - INT64 total_byte_count; + int64_t total_byte_count; int filter_level; @@ -314,8 +317,7 @@ typedef struct VP8_COMP MACROBLOCK mb; VP8_COMMON common; - vp8_writer bc, bc2; - // bool_writer *bc2; + vp8_writer bc[9]; // one boolcoder for each partition VP8_CONFIG oxcf; @@ -337,7 +339,7 @@ typedef struct VP8_COMP int gold_is_alt; // don't do both alt and gold search ( just do gold). 
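
The THR_* renaming just below pairs with the ref_frame_map indirection added in pickinter.c later in this patch: thresholds numbered 1/2/3 now belong to the first/second/third enabled reference rather than to LAST/GOLDEN/ALTREF specifically. (The bc[9] array generalizes in the same spirit: one bool coder for the header plus up to eight token partitions.) A self-contained illustration of the slot assignment, with flag names and values local to the sketch:

    #include <stdio.h>

    enum { INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };
    #define LAST_FLAG 1
    #define GOLD_FLAG 2
    #define ALT_FLAG  4

    /* Fill the search-priority map the way vp8_pick_inter_mode now does:
     * slot 0 is intra, slots 1..3 are the enabled references in order. */
    static void fill_ref_frame_map(int flags, int map[4])
    {
        int i = 0;
        map[i++] = INTRA_FRAME;
        if (flags & LAST_FLAG) map[i++] = LAST_FRAME;
        if (flags & GOLD_FLAG) map[i++] = GOLDEN_FRAME;
        if (flags & ALT_FLAG)  map[i++] = ALTREF_FRAME;
        while (i < 4) map[i++] = -1;
    }

    int main(void)
    {
        int map[4];
        fill_ref_frame_map(GOLD_FLAG, map);   /* only golden enabled */
        printf("%d %d %d %d\n", map[0], map[1], map[2], map[3]); /* 0 2 -1 -1 */
        /* So THR_ZERO1/THR_NEAREST1/THR_NEAR1 govern GOLDEN_FRAME here. */
        return 0;
    }
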
//int refresh_alt_ref_frame; - YV12_BUFFER_CONFIG last_frame_uf; + YV12_BUFFER_CONFIG pick_lf_lvl_frame; TOKENEXTRA *tok; unsigned int tok_count; @@ -418,6 +420,7 @@ typedef struct VP8_COMP int buffered_mode; + double frame_rate; int64_t buffer_level; int bits_off_target; @@ -565,16 +568,21 @@ typedef struct VP8_COMP int base_skip_false_prob[128]; + FRAME_CONTEXT lfc_n; /* last frame entropy */ + FRAME_CONTEXT lfc_a; /* last alt ref entropy */ + FRAME_CONTEXT lfc_g; /* last gold ref entropy */ + + struct twopass_rc { unsigned int section_intra_rating; double section_max_qfactor; unsigned int next_iiratio; unsigned int this_iiratio; - FIRSTPASS_STATS *total_stats; - FIRSTPASS_STATS *this_frame_stats; + FIRSTPASS_STATS total_stats; + FIRSTPASS_STATS this_frame_stats; FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start; - FIRSTPASS_STATS *total_left_stats; + FIRSTPASS_STATS total_left_stats; int first_pass_done; int64_t bits_left; int64_t clip_bits_total; @@ -665,8 +673,8 @@ typedef struct VP8_COMP unsigned int current_layer; LAYER_CONTEXT layer_context[MAX_LAYERS]; - long long frames_in_layer[MAX_LAYERS]; - long long bytes_in_layer[MAX_LAYERS]; + int64_t frames_in_layer[MAX_LAYERS]; + int64_t bytes_in_layer[MAX_LAYERS]; double sum_psnr[MAX_LAYERS]; double sum_psnr_p[MAX_LAYERS]; double total_error2[MAX_LAYERS]; @@ -679,6 +687,11 @@ typedef struct VP8_COMP double total_ssimg_v_in_layer[MAX_LAYERS]; double total_ssimg_all_in_layer[MAX_LAYERS]; +#if CONFIG_MULTI_RES_ENCODING + /* Number of MBs per row at lower-resolution level */ + int mr_low_res_mb_cols; +#endif + } VP8_COMP; void control_data_rate(VP8_COMP *cpi); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 4d8734137..46f53a18d 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -21,7 +21,6 @@ #include "vp8/common/reconinter.h" #include "vp8/common/reconintra.h" #include "vp8/common/reconintra4x4.h" -#include "vp8/common/g_common.h" #include "variance.h" #include "mcomp.h" #include "rdopt.h" @@ -39,7 +38,7 @@ extern int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd); extern unsigned int cnt_pm; #endif -extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES]; +extern const int vp8_ref_frame_order[MAX_MODES]; extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); @@ -402,9 +401,68 @@ static void update_mvcount(VP8_COMP *cpi, MACROBLOCKD *xd, int_mv *best_ref_mv) } } + +#if CONFIG_MULTI_RES_ENCODING +static +void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, int *dissim, + int *parent_ref_frame, + MB_PREDICTION_MODE *parent_mode, + int_mv *parent_ref_mv, int mb_row, int mb_col) +{ + LOWER_RES_INFO* store_mode_info + = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info; + unsigned int parent_mb_index; + //unsigned int parent_mb_index = map_640x480_to_320x240[mb_row][mb_col]; + + /* Consider different down_sampling_factor. */ + { + /* TODO: Removed the loop that supports special down_sampling_factor + * such as 2, 4, 8. Will revisit it if needed. + * Should also try using a look-up table to see if it helps + * performance. 
*/ + int round = cpi->oxcf.mr_down_sampling_factor.num/2; + int parent_mb_row, parent_mb_col; + + parent_mb_row = (mb_row*cpi->oxcf.mr_down_sampling_factor.den+round) + /cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_col = (mb_col*cpi->oxcf.mr_down_sampling_factor.den+round) + /cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_index = parent_mb_row*cpi->mr_low_res_mb_cols + parent_mb_col; + } + + /* Read lower-resolution mode & motion result from memory.*/ + *parent_ref_frame = store_mode_info[parent_mb_index].ref_frame; + *parent_mode = store_mode_info[parent_mb_index].mode; + *dissim = store_mode_info[parent_mb_index].dissim; + + /* For highest-resolution encoder, adjust dissim value. Lower its quality + * for good performance. */ + if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1)) + *dissim>>=1; + + if(*parent_ref_frame != INTRA_FRAME) + { + /* Consider different down_sampling_factor. + * The result can be rounded to be more precise, but it takes more time. + */ + //int round = cpi->oxcf.mr_down_sampling_factor.den/2; + (*parent_ref_mv).as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row + *cpi->oxcf.mr_down_sampling_factor.num + /cpi->oxcf.mr_down_sampling_factor.den; + (*parent_ref_mv).as_mv.col = store_mode_info[parent_mb_index].mv.as_mv.col + *cpi->oxcf.mr_down_sampling_factor.num + /cpi->oxcf.mr_down_sampling_factor.den; + + vp8_clamp_mv2(parent_ref_mv, xd); + } +} +#endif + + void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, - int *returndistortion, int *returnintra) + int *returndistortion, int *returnintra, int mb_row, + int mb_col) { BLOCK *b = &x->block[0]; BLOCKD *d = &x->e_mbd.block[0]; @@ -422,34 +480,67 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rate; int rate2; int distortion2; - int bestsme; - //int all_rds[MAX_MODES]; // Experimental debug code. + int bestsme = INT_MAX; int best_mode_index = 0; unsigned int sse = INT_MAX, best_sse = INT_MAX; int_mv mvp; + int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int saddone=0; int sr=0; //search range got from mv_pred(). It uses step_param levels. (0-7) - int_mv nearest_mv[4]; - int_mv near_mv[4]; - int_mv frame_best_ref_mv[4]; - int MDCounts[4][4]; unsigned char *y_buffer[4]; unsigned char *u_buffer[4]; unsigned char *v_buffer[4]; + int i; + int ref_frame_map[4]; + int sign_bias = 0; - int skip_mode[4] = {0, 0, 0, 0}; - int found_near_mvs[4] = {0, 0, 0, 0}; + int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode, + when Speed >= 15, no sub-pixel search. */ - int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode, when Speed >= 15, no sub-pixel search. 
*/ +#if CONFIG_MULTI_RES_ENCODING + int dissim = INT_MAX; + int parent_ref_frame = 0; + int_mv parent_ref_mv; + MB_PREDICTION_MODE parent_mode = 0; + + if (cpi->oxcf.mr_encoder_id) + get_lower_res_motion_info(cpi, xd, &dissim, &parent_ref_frame, + &parent_mode, &parent_ref_mv, mb_row, mb_col); +#endif vpx_memset(mode_mv, 0, sizeof(mode_mv)); - vpx_memset(nearest_mv, 0, sizeof(nearest_mv)); - vpx_memset(near_mv, 0, sizeof(near_mv)); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); + /* Setup search priorities */ + i=0; + ref_frame_map[i++] = INTRA_FRAME; + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + ref_frame_map[i++] = LAST_FRAME; + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + ref_frame_map[i++] = GOLDEN_FRAME; + if (cpi->ref_frame_flags & VP8_ALT_FLAG) // &&(cpi->source_alt_ref_active || cpi->oxcf.number_of_layers > 1) + ref_frame_map[i++] = ALTREF_FRAME; + for(; i<4; i++) + ref_frame_map[i] = -1; + + /* Check to see if there is at least 1 valid reference frame that we need + * to calculate near_mvs. + */ + if (ref_frame_map[1] > 0) + { + vp8_find_near_mvs(&x->e_mbd, + x->e_mbd.mode_info_context, + &mode_mv[NEARESTMV], &mode_mv[NEARMV], + &best_ref_mv, + mdcounts, + ref_frame_map[1], + cpi->common.ref_frame_sign_bias); + + sign_bias = cpi->common.ref_frame_sign_bias[ref_frame_map[1]]; + } // set up all the refframe dependent pointers. if (cpi->ref_frame_flags & VP8_LAST_FLAG) @@ -459,8 +550,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset; v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset; } - else - skip_mode[LAST_FRAME] = 1; if (cpi->ref_frame_flags & VP8_GOLD_FLAG) { @@ -469,21 +558,16 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset; v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset; } - else - skip_mode[GOLDEN_FRAME] = 1; - if ((cpi->ref_frame_flags & VP8_ALT_FLAG) && - (cpi->source_alt_ref_active || cpi->oxcf.number_of_layers > 1)) + if (cpi->ref_frame_flags & VP8_ALT_FLAG) { YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx]; y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset; u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset; v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset; } - else - skip_mode[ALTREF_FRAME] = 1; - cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame + cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame *returnintra = INT_MAX; x->skip = 0; @@ -496,50 +580,98 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, { int frame_cost; int this_rd = INT_MAX; + int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]]; if (best_rd <= cpi->rd_threshes[mode_index]) continue; - x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index]; - - if (skip_mode[x->e_mbd.mode_info_context->mbmi.ref_frame]) + if (this_ref_frame < 0) continue; - // Check to see if the testing frequency for this mode is at its max - // If so then prevent it from being tested and increase the threshold for its testing - if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1)) + x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame; + +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id) + { + /* If parent MB is intra, child MB is intra. 
*/ + if (!parent_ref_frame && this_ref_frame) + continue; + + /* If parent MB is inter, and it is unlikely there are multiple + * objects in parent MB, we use parent ref frame as child MB's + * ref frame. */ + if (parent_ref_frame && dissim < 8 + && parent_ref_frame != this_ref_frame) + continue; + } +#endif + + // everything but intra + if (x->e_mbd.mode_info_context->mbmi.ref_frame) + { + x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + + if (sign_bias != + cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame]) + { + mode_mv[NEARESTMV].as_mv.row *= -1; + mode_mv[NEARESTMV].as_mv.col *= -1; + mode_mv[NEARMV].as_mv.row *= -1; + mode_mv[NEARMV].as_mv.col *= -1; + best_ref_mv.as_mv.row *= -1; + best_ref_mv.as_mv.col *= -1; + sign_bias + = cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame]; + } + +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id) + { + if (vp8_mode_order[mode_index] == NEARESTMV && + mode_mv[NEARESTMV].as_int ==0) + continue; + if (vp8_mode_order[mode_index] == NEARMV && + mode_mv[NEARMV].as_int ==0) + continue; + + if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV + && best_ref_mv.as_int==0) //&& dissim==0 + continue; + else if(vp8_mode_order[mode_index] == NEWMV && dissim==0 + && best_ref_mv.as_int==parent_ref_mv.as_int) + continue; + } +#endif + } + + /* Check to see if the testing frequency for this mode is at its max + * If so then prevent it from being tested and increase the threshold + * for its testing */ + if (cpi->mode_test_hit_counts[mode_index] && + (cpi->mode_check_freq[mode_index] > 1)) { - //if ( (cpi->mbs_tested_so_far / cpi->mode_test_hit_counts[mode_index]) <= cpi->mode_check_freq[mode_index] ) - if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])) + if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * + cpi->mode_test_hit_counts[mode_index])) { - // Increase the threshold for coding this mode to make it less likely to be chosen + /* Increase the threshold for coding this mode to make it less + * likely to be chosen */ cpi->rd_thresh_mult[mode_index] += 4; if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; - cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; - + cpi->rd_threshes[mode_index] = + (cpi->rd_baseline_thresh[mode_index] >> 7) * + cpi->rd_thresh_mult[mode_index]; continue; } } - // If nearby MVs haven't been found for this reference frame then do it now. 
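
The multi-res hooks above act as a pruning cascade: an intra parent forces the child MB to stay intra, dissim < 8 restricts the child to the parent's reference frame, and low dissimilarity raises step_param so the motion search stays tight around the inherited predictor. The parent lookup in get_lower_res_motion_info rounds to the nearest lower-resolution MB and scales the MV back up by the sampling ratio. Worked sketch, assuming a 2:1 downsample (the factor is illustrative, not fixed by the patch):

    /* Parent-MB index and MV scaling as used above. */
    struct factor { int num, den; };

    static int parent_mb_coord(int child, struct factor f)
    {
        int round = f.num / 2;               /* round to nearest parent MB */
        return (child * f.den + round) / f.num;
    }

    static int upscale_mv(int v, struct factor f)
    {
        return v * f.num / f.den;            /* truncating integer upscale */
    }
    /* parent_mb_coord(5, (struct factor){2, 1}) == 3
     * upscale_mv(-13,  (struct factor){2, 1}) == -26, before vp8_clamp_mv2() */
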
- if (x->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME && - !found_near_mvs[x->e_mbd.mode_info_context->mbmi.ref_frame]) - { - int ref_frame = x->e_mbd.mode_info_context->mbmi.ref_frame; - vp8_find_near_mvs(&x->e_mbd, - x->e_mbd.mode_info_context, - &nearest_mv[ref_frame], &near_mv[ref_frame], - &frame_best_ref_mv[ref_frame], - MDCounts[ref_frame], - ref_frame, - cpi->common.ref_frame_sign_bias); - found_near_mvs[ref_frame] = 1; - } - - // We have now reached the point where we are going to test the current mode so increment the counter for the number of times it has been tested + /* We have now reached the point where we are going to test the current + * mode so increment the counter for the number of times it has been + * tested */ cpi->mode_test_hit_counts[mode_index] ++; rate2 = 0; @@ -547,42 +679,28 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, this_mode = vp8_mode_order[mode_index]; - // Experimental debug code. - //all_rds[mode_index] = -1; - x->e_mbd.mode_info_context->mbmi.mode = this_mode; x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; - // Work out the cost assosciated with selecting the reference frame + /* Work out the cost assosciated with selecting the reference frame */ frame_cost = x->e_mbd.ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame]; rate2 += frame_cost; - // everything but intra - if (x->e_mbd.mode_info_context->mbmi.ref_frame) - { - x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; - x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; - x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; - mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts)); - } - - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, - // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative + /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + * unless ARNR filtering is enabled in which case we want + * an unfiltered alternative */ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME) + if (this_mode != ZEROMV || + x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME) continue; } switch (this_mode) { case B_PRED: - // Pass best so far to pick_intra4x4mby_modes to use as breakout + /* Pass best so far to pick_intra4x4mby_modes to use as breakout */ distortion2 = best_sse; pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2); @@ -641,10 +759,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int sadpb = x->sadperbit16; int_mv mvp_full; - int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7)?1:0); - int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7)?1:0); - int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL; - int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL; + int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL; + int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL; + int col_max = (best_ref_mv.as_mv.col>>3) + + MAX_FULL_PEL_VAL; + int row_max = (best_ref_mv.as_mv.row>>3) + + MAX_FULL_PEL_VAL; 
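
The rewritten col_min/row_min bounds above are an identity, not a behavior change: for an eighth-pel component v, ((v + 7) >> 3) equals (v >> 3) + ((v & 7) ? 1 : 0), i.e. ceil(v/8), provided right shifts of negative ints are arithmetic (which this code already assumes). Example: v = -13 gives -2 + 1 = -1 the old way and (-6 >> 3) = -1 the new way. A quick exhaustive check:

    #include <assert.h>

    int main(void)
    {
        int v;
        /* Both forms compute ceil(v/8) on two's-complement machines with
         * arithmetic right shift of negative ints. */
        for (v = -2048; v <= 2048; v++)
            assert(((v + 7) >> 3) == ((v >> 3) + ((v & 7) ? 1 : 0)));
        return 0;
    }
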
int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -656,110 +776,156 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, // Further step/diamond searches as necessary step_param = cpi->sf.first_step + speed_adjust; - if(cpi->sf.improved_mv_pred) +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id) { - if(!saddone) - { - vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] ); - saddone = 1; - } - - vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp, - x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]); - - sr += speed_adjust; - //adjust search range according to sr from mv prediction - if(sr > step_param) - step_param = sr; - - mvp_full.as_mv.col = mvp.as_mv.col>>3; - mvp_full.as_mv.row = mvp.as_mv.row>>3; - + // Use parent MV as predictor. Adjust search range accordingly. + mvp.as_int = parent_ref_mv.as_int; + mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3; + mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3; + + if(dissim <=32) step_param += 3; + else if(dissim <=128) step_param += 2; + else step_param += 1; }else +#endif { - mvp.as_int = best_ref_mv.as_int; - mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3; - mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3; - } + if(cpi->sf.improved_mv_pred) + { + if(!saddone) + { + vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] ); + saddone = 1; + } - // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. - if (x->mv_col_min < col_min ) - x->mv_col_min = col_min; - if (x->mv_col_max > col_max ) - x->mv_col_max = col_max; - if (x->mv_row_min < row_min ) - x->mv_row_min = row_min; - if (x->mv_row_max > row_max ) - x->mv_row_max = row_max; + vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, + &mvp,x->e_mbd.mode_info_context->mbmi.ref_frame, + cpi->common.ref_frame_sign_bias, &sr, + &near_sadidx[0]); - further_steps = (cpi->Speed >= 8)? 0: (cpi->sf.max_step_search_steps - 1 - step_param); + sr += speed_adjust; + //adjust search range according to sr from mv prediction + if(sr > step_param) + step_param = sr; - if (cpi->sf.search_method == HEX) - { - bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param, - sadpb, &cpi->fn_ptr[BLOCK_16X16], - x->mvsadcost, x->mvcost, &best_ref_mv); - mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + mvp_full.as_mv.col = mvp.as_mv.col>>3; + mvp_full.as_mv.row = mvp.as_mv.row>>3; + }else + { + mvp.as_int = best_ref_mv.as_int; + mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3; + mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3; + } } - else + +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id && dissim <= 2 && + MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row), + abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4) { - bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv, - step_param, sadpb, &num00, - &cpi->fn_ptr[BLOCK_16X16], - x->mvcost, &best_ref_mv); - mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + d->bmi.mv.as_int = mvp_full.as_int; + mode_mv[NEWMV].as_int = mvp_full.as_int; - // Further step/diamond searches as necessary - n = 0; - //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv, + x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], + cpi->mb.mvcost, + &distortion2,&sse); + }else +#endif + { + /* Get intersection of UMV window and valid MV window to + * reduce # of checks in diamond search. 
*/ + if (x->mv_col_min < col_min ) + x->mv_col_min = col_min; + if (x->mv_col_max > col_max ) + x->mv_col_max = col_max; + if (x->mv_row_min < row_min ) + x->mv_row_min = row_min; + if (x->mv_row_max > row_max ) + x->mv_row_max = row_max; + + further_steps = (cpi->Speed >= 8)? + 0: (cpi->sf.max_step_search_steps - 1 - step_param); + + if (cpi->sf.search_method == HEX) + { +#if CONFIG_MULTI_RES_ENCODING + /* TODO: In higher-res pick_inter_mode, step_param is used to + * modify hex search range. Here, set step_param to 0 not to + * change the behavior in lowest-resolution encoder. + * Will improve it later. + */ + if (!cpi->oxcf.mr_encoder_id) + step_param = 0; +#endif + bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, + step_param, sadpb, + &cpi->fn_ptr[BLOCK_16X16], + x->mvsadcost, x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } + else + { + bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, + &d->bmi.mv, step_param, sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], + x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; - n = num00; - num00 = 0; + // Further step/diamond searches as necessary + n = 0; + //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - while (n < further_steps) - { - n++; + n = num00; + num00 = 0; - if (num00) - num00--; - else + while (n < further_steps) { - thissme = - cpi->diamond_search_sad(x, b, d, &mvp_full, - &d->bmi.mv, - step_param + n, - sadpb, &num00, - &cpi->fn_ptr[BLOCK_16X16], - x->mvcost, &best_ref_mv); - if (thissme < bestsme) - { - bestsme = thissme; - mode_mv[NEWMV].as_int = d->bmi.mv.as_int; - } + n++; + + if (num00) + num00--; else { - d->bmi.mv.as_int = mode_mv[NEWMV].as_int; + thissme = + cpi->diamond_search_sad(x, b, d, &mvp_full, + &d->bmi.mv, + step_param + n, + sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], + x->mvcost, &best_ref_mv); + if (thissme < bestsme) + { + bestsme = thissme; + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } + else + { + d->bmi.mv.as_int = mode_mv[NEWMV].as_int; + } } } } - } - x->mv_col_min = tmp_col_min; - x->mv_col_max = tmp_col_max; - x->mv_row_min = tmp_row_min; - x->mv_row_max = tmp_row_max; + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; - if (bestsme < INT_MAX) - cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv, - x->errorperbit, + if (bestsme < INT_MAX) + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, + &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2,&sse); + } mode_mv[NEWMV].as_int = d->bmi.mv.as_int; // mv cost; - rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128); + rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, + cpi->mb.mvcost, 128); } case NEARESTMV: @@ -770,18 +936,23 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, case ZEROMV: - // Trap vectors that reach beyond the UMV borders - // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point - // because of the lack of break statements in the previous two cases. 
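Under CONFIG_MULTI_RES_ENCODING, the hunk above seeds the search from the lower-resolution encoder's motion vector (parent_ref_mv) and trades search width against how well that parent block matched. Restated as a hypothetical helper using the patch's own names (dissim, step_param); a sketch, not code from the tree:

/* A larger step_param leaves fewer remaining diamond-search steps
 * (further_steps = max_step_search_steps - 1 - step_param), i.e. a
 * narrower search around the predictor. */
static int mr_adjust_step_param(int step_param, int dissim)
{
    if (dissim <= 32)          /* parent block matched very closely */
        step_param += 3;
    else if (dissim <= 128)    /* reasonable parent match */
        step_param += 2;
    else                       /* weak match: keep the search wider */
        step_param += 1;
    return step_param;
}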
- if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || - ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) + /* Trap vectors that reach beyond the UMV borders + * Note that ALL New MV, Nearest MV Near MV and Zero MV code drops + * through to this point because of the lack of break statements + * in the previous two cases. + */ + if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || + ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || + ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) continue; rate2 += vp8_cost_mv_ref(this_mode, mdcounts); x->e_mbd.mode_info_context->mbmi.mv.as_int = mode_mv[this_mode].as_int; - /* Exit early and don't compute the distortion if this macroblock is marked inactive. */ + /* Exit early and don't compute the distortion if this macroblock + * is marked inactive. */ if (cpi->active_map_enabled && x->active_ptr[0] == 0) { sse = 0; @@ -816,9 +987,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, break; } - // Experimental debug code. - //all_rds[mode_index] = this_rd; - if (this_rd < best_rd || x->skip) { // Note index of best mode @@ -828,14 +996,23 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, *returndistortion = distortion2; best_sse = sse; best_rd = this_rd; - vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO)); - - // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time - cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; - cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, + sizeof(MB_MODE_INFO)); + + /* Testing this mode gave rise to an improvement in best error + * score. Lower threshold a bit for next time + */ + cpi->rd_thresh_mult[mode_index] = + (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? + cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; + cpi->rd_threshes[mode_index] = + (cpi->rd_baseline_thresh[mode_index] >> 7) * + cpi->rd_thresh_mult[mode_index]; } - // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around. + /* If the mode did not help improve the best error case then raise the + * threshold for testing that mode next time around. + */ else { cpi->rd_thresh_mult[mode_index] += 4; @@ -843,7 +1020,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; - cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + cpi->rd_threshes[mode_index] = + (cpi->rd_baseline_thresh[mode_index] >> 7) * + cpi->rd_thresh_mult[mode_index]; } if (x->skip) @@ -855,8 +1034,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, { int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3); - cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? 
cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-        cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+        cpi->rd_thresh_mult[best_mode_index] =
+            (cpi->rd_thresh_mult[best_mode_index]
+             >= (MIN_THRESHMULT + best_adjustment)) ?
+                cpi->rd_thresh_mult[best_mode_index] - best_adjustment :
+                MIN_THRESHMULT;
+        cpi->rd_threshes[best_mode_index] =
+            (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+            cpi->rd_thresh_mult[best_mode_index];
    }

@@ -879,15 +1064,17 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
        x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
        x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
-            (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+            (cpi->common.mb_no_coeff_skip);
        x->e_mbd.mode_info_context->mbmi.partitioning = 0;

        return;
    }

-    /* set to the best mb mode, this copy can be skip if x->skip since it already has the right content */
+    /* set to the best mb mode; this copy can be skipped if x->skip since it
+     * already has the right content */
    if (!x->skip)
-        vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+        vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
+                   sizeof(MB_MODE_INFO));

    if (best_mbmode.mode <= B_PRED)
    {
@@ -895,7 +1082,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
        pick_intra_mbuv_mode(x);
    }

-    update_mvcount(cpi, &x->e_mbd, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);
+    if (sign_bias
+        != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+    {
+        best_ref_mv.as_mv.row *= -1;
+        best_ref_mv.as_mv.col *= -1;
+    }
+
+    update_mvcount(cpi, &x->e_mbd, &best_ref_mv);
}

diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
index 1c5d6a6e6..3d83782b5 100644
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h
@@ -14,6 +14,10 @@
 #include "vpx_config.h"
 #include "vp8/common/onyxc_int.h"

-extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+                                int recon_uvoffset, int *returnrate,
+                                int *returndistortion, int *returnintra,
+                                int mb_row, int mb_col);
 extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
+
 #endif

diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index c1e5f7797..2449ae540 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -152,9 +152,10 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
     int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
     int filt_val;
     int best_filt_val = cm->filter_level;
+    YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;

-    // Make a copy of the unfiltered / processed recon buffer
-    vp8_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+    /* Replace unfiltered frame buffer with a new one */
+    cm->frame_to_show = &cpi->pick_lf_lvl_frame;

     if (cm->frame_type == KEY_FRAME)
         cm->sharpness_level = 0;
@@ -177,26 +178,26 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
         best_filt_val = filt_val;

         // Get the err using the previous frame's filter value.
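The picklpf.c changes beginning here invert the buffer roles: instead of filtering the real reconstruction in place and restoring it from a saved copy after every trial, the picker parks the reconstruction pointer in saved_frame, points cm->frame_to_show at a scratch buffer, and refreshes the scratch before each trial. In outline (fields and calls as in the patch; not a complete function):

YV12_BUFFER_CONFIG *saved_frame = cm->frame_to_show;  /* real recon, untouched */
cm->frame_to_show = &cpi->pick_lf_lvl_frame;          /* disposable scratch    */

/* ... for each candidate level filt_val: */
vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
filt_err = calc_partial_ssl_err(sd, cm->frame_to_show,
                                IF_RTCD(&cpi->rtcd.variance));

/* ... once the search finishes: */
cm->frame_to_show = saved_frame;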
- vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); - best_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + /* Copy the unfiltered / processed recon buffer to the new buffer */ + vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show); + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); - // Re-instate the unfiltered frame - vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show); + best_err = calc_partial_ssl_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); - filt_val -= (1 + ((filt_val > 10) ? 1 : 0)); + filt_val -= 1 + (filt_val > 10); // Search lower filter levels while (filt_val >= min_filter_level) { // Apply the loop filter + vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show); vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); // Get the err for filtered frame - filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); - - // Re-instate the unfiltered frame - vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show); + filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); // Update the best case record or exit loop. if (filt_err < best_err) @@ -208,11 +209,11 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) break; // Adjust filter level - filt_val -= (1 + ((filt_val > 10) ? 1 : 0)); + filt_val -= 1 + (filt_val > 10); } // Search up (note that we have already done filt_val = cm->filter_level) - filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0)); + filt_val = cm->filter_level + 1 + (filt_val > 10); if (best_filt_val == cm->filter_level) { @@ -222,13 +223,13 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) while (filt_val < max_filter_level) { // Apply the loop filter + vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show); + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); // Get the err for filtered frame - filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); - - // Re-instate the unfiltered frame - vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show); + filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); // Update the best case record or exit loop. if (filt_err < best_err) @@ -242,7 +243,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) break; // Adjust filter level - filt_val += (1 + ((filt_val > 10) ? 
1 : 0)); + filt_val += 1 + (filt_val > 10); } } @@ -253,6 +254,9 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + /* restore unfiltered frame pointer */ + cm->frame_to_show = saved_frame; } // Stub function for now Alt LF not used @@ -283,10 +287,16 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) int filt_best; int filt_direction = 0; - int Bias = 0; // Bias against raising loop filter and in favour of lowering it + int Bias = 0; // Bias against raising loop filter and in favor of lowering it - // Make a copy of the unfiltered / processed recon buffer - vp8_yv12_copy_y_ptr(cm->frame_to_show, &cpi->last_frame_uf); + int ss_err[MAX_LOOP_FILTER + 1]; + + YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show; + + vpx_memset(ss_err, 0, sizeof(ss_err)); + + /* Replace unfiltered frame buffer with a new one */ + cm->frame_to_show = &cpi->pick_lf_lvl_frame; if (cm->frame_type == KEY_FRAME) cm->sharpness_level = 0; @@ -305,14 +315,19 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) filter_step = (filt_mid < 16) ? 4 : filt_mid / 4; // Get baseline error score + + /* Copy the unfiltered / processed recon buffer to the new buffer */ + vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show); + vp8cx_set_alt_lf_level(cpi, filt_mid); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid); - best_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); - filt_best = filt_mid; + best_err = vp8_calc_ss_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); - // Re-instate the unfiltered frame - vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show); + ss_err[filt_mid] = best_err; + + filt_best = filt_mid; while (filter_step > 0) { @@ -327,14 +342,19 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) if ((filt_direction <= 0) && (filt_low != filt_mid)) { - // Get Low filter error score - vp8cx_set_alt_lf_level(cpi, filt_low); - vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low); - - filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); - - // Re-instate the unfiltered frame - vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show); + if(ss_err[filt_low] == 0) + { + // Get Low filter error score + vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show); + vp8cx_set_alt_lf_level(cpi, filt_low); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low); + + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); + ss_err[filt_low] = filt_err; + } + else + filt_err = ss_err[filt_low]; // If value is close to the best so far then bias towards a lower loop filter value. 
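vp8cx_pick_filter_level() additionally gains the ss_err[] cache used above, so no filter level is filtered and measured twice while the bisection revisits levels; zero doubles as the "not yet measured" marker, which is safe since a genuine error of zero would merely be recomputed. The cached trial could be factored as a helper along these lines (hypothetical; the patch keeps it inline):

static int cached_trial_err(VP8_COMP *cpi, VP8_COMMON *cm,
                            YV12_BUFFER_CONFIG *sd,
                            YV12_BUFFER_CONFIG *saved_frame,
                            int ss_err[], int level)
{
    if (ss_err[level] == 0)   /* not measured yet for this frame */
    {
        vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
        vp8cx_set_alt_lf_level(cpi, level);
        vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, level);
        ss_err[level] = vp8_calc_ss_err(sd, cm->frame_to_show,
                                        IF_RTCD(&cpi->rtcd.variance));
    }
    return ss_err[level];
}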
if ((filt_err - Bias) < best_err) @@ -350,13 +370,18 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) // Now look at filt_high if ((filt_direction >= 0) && (filt_high != filt_mid)) { - vp8cx_set_alt_lf_level(cpi, filt_high); - vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high); - - filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + if(ss_err[filt_high] == 0) + { + vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show); + vp8cx_set_alt_lf_level(cpi, filt_high); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high); - // Re-instate the unfiltered frame - vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show); + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, + IF_RTCD(&cpi->rtcd.variance)); + ss_err[filt_high] = filt_err; + } + else + filt_err = ss_err[filt_high]; // Was it better than the previous best? if (filt_err < (best_err - Bias)) @@ -380,4 +405,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) } cm->filter_level = filt_best; + + /* restore unfiltered frame pointer */ + cm->frame_to_show = saved_frame; } diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index e57a26430..ce04212e6 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -436,7 +436,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) int quant_val; int Q; - int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44}; + int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, + 44, 44}; for (Q = 0; Q < QINDEX_RANGE; Q++) { @@ -469,36 +470,61 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7; // all the ac values = ; - for (i = 1; i < 16; i++) + quant_val = vp8_ac_yquant(Q); + cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1, + cpi->Y1quant_shift[Q] + 1, quant_val); + cpi->Y1zbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.Y1dequant[Q][1] = quant_val; + cpi->zrun_zbin_boost_y1[Q][1] = (quant_val * zbin_boost[1]) >> 7; + + quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q); + cpi->Y2quant_fast[Q][1] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 1, + cpi->Y2quant_shift[Q] + 1, quant_val); + cpi->Y2zbin[Q][1] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; + cpi->Y2round[Q][1] = (qrounding_factors_y2[Q] * quant_val) >> 7; + cpi->common.Y2dequant[Q][1] = quant_val; + cpi->zrun_zbin_boost_y2[Q][1] = (quant_val * zbin_boost[1]) >> 7; + + quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q); + cpi->UVquant_fast[Q][1] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 1, + cpi->UVquant_shift[Q] + 1, quant_val); + cpi->UVzbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->UVround[Q][1] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.UVdequant[Q][1] = quant_val; + cpi->zrun_zbin_boost_uv[Q][1] = (quant_val * zbin_boost[1]) >> 7; + + for (i = 2; i < 16; i++) { - int rc = vp8_default_zig_zag1d[i]; - - quant_val = vp8_ac_yquant(Q); - cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val; - invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc, - cpi->Y1quant_shift[Q] + rc, quant_val); - cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; - cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7; - cpi->common.Y1dequant[Q][rc] = quant_val; - cpi->zrun_zbin_boost_y1[Q][i] = 
(quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
-                         cpi->Y2quant_shift[Q] + rc, quant_val);
-            cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
-            cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
-            cpi->common.Y2dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
-                         cpi->UVquant_shift[Q] + rc, quant_val);
-            cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.UVdequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+            cpi->Y1quant_fast[Q][i] = cpi->Y1quant_fast[Q][1];
+            cpi->Y1quant[Q][i] = cpi->Y1quant[Q][1];
+            cpi->Y1quant_shift[Q][i] = cpi->Y1quant_shift[Q][1];
+            cpi->Y1zbin[Q][i] = cpi->Y1zbin[Q][1];
+            cpi->Y1round[Q][i] = cpi->Y1round[Q][1];
+            cpi->common.Y1dequant[Q][i] = cpi->common.Y1dequant[Q][1];
+            cpi->zrun_zbin_boost_y1[Q][i] = (cpi->common.Y1dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
+
+            cpi->Y2quant_fast[Q][i] = cpi->Y2quant_fast[Q][1];
+            cpi->Y2quant[Q][i] = cpi->Y2quant[Q][1];
+            cpi->Y2quant_shift[Q][i] = cpi->Y2quant_shift[Q][1];
+            cpi->Y2zbin[Q][i] = cpi->Y2zbin[Q][1];
+            cpi->Y2round[Q][i] = cpi->Y2round[Q][1];
+            cpi->common.Y2dequant[Q][i] = cpi->common.Y2dequant[Q][1];
+            cpi->zrun_zbin_boost_y2[Q][i] = (cpi->common.Y2dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
+
+            cpi->UVquant_fast[Q][i] = cpi->UVquant_fast[Q][1];
+            cpi->UVquant[Q][i] = cpi->UVquant[Q][1];
+            cpi->UVquant_shift[Q][i] = cpi->UVquant_shift[Q][1];
+            cpi->UVzbin[Q][i] = cpi->UVzbin[Q][1];
+            cpi->UVround[Q][i] = cpi->UVround[Q][1];
+            cpi->common.UVdequant[Q][i] = cpi->common.UVdequant[Q][1];
+            cpi->zrun_zbin_boost_uv[Q][i] = (cpi->common.UVdequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
        }
    }
}
@@ -609,6 +635,9 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)

    /* This initialization should be called at least once. Use ok_to_skip to
     * decide if it is ok to skip.
+     * Before encoding a frame, this function is always called with ok_to_skip
+     * =0, which means no skipping of calculations. The "last" values are
+     * initialized at that time.
     */
    if (!ok_to_skip || QIndex != x->q_index)
    {

diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index dc6feb980..1c43c1171 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -28,7 +28,6 @@
 #define MAX_BPB_FACTOR 50

 extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
-extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];

@@ -305,6 +304,8 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
     // Setup for Key frame:

     vp8_default_coef_probs(& cpi->common);
+
+    vp8_kf_default_bmode_probs(cpi->common.kf_bmode_prob);

     vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
@@ -315,6 +316,12 @@ void vp8_setup_key_frame(VP8_COMP *cpi)

     vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc));  //initialize pre_mvc to all zero.

+    // Make sure we initialize separate contexts for altref, gold, and normal.
+    // TODO shouldn't need 3 different copies of structure to do this!
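The three per-reference-type entropy contexts copied in the memcpys that follow (lfc_a, lfc_g, lfc_n) are consumed in the vp8_initialize_rd_consts() hunk further down. The selection logic there, restated on its own for clarity:

/* Pick the token-cost source by which reference buffer this frame
 * refreshes: alt-ref updates, golden updates, or normal frames. */
FRAME_CONTEXT *l = &cpi->lfc_n;

if (cpi->common.refresh_alt_ref_frame)
    l = &cpi->lfc_a;
else if (cpi->common.refresh_golden_frame)
    l = &cpi->lfc_g;

fill_token_costs(cpi->mb.token_costs,
                 (const vp8_prob (*)[8][3][11]) l->coef_probs);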
+ vpx_memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc)); + vpx_memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc)); + vpx_memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc)); + //cpi->common.filter_level = 0; // Reset every key frame. cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ; @@ -325,8 +332,8 @@ void vp8_setup_key_frame(VP8_COMP *cpi) else cpi->frames_till_gf_update_due = cpi->goldfreq; - cpi->common.refresh_golden_frame = TRUE; - cpi->common.refresh_alt_ref_frame = TRUE; + cpi->common.refresh_golden_frame = 1; + cpi->common.refresh_alt_ref_frame = 1; } @@ -464,7 +471,7 @@ static void calc_gf_params(VP8_COMP *cpi) if (cpi->pass != 2) { // Single Pass lagged mode: TBD - if (FALSE) + if (0) { } @@ -591,14 +598,14 @@ static void calc_gf_params(VP8_COMP *cpi) if (cpi->pass != 2) { // For now Alt ref is not allowed except in 2 pass modes. - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; /*if ( cpi->oxcf.fixed_q == -1) { if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) ) - cpi->source_alt_ref_pending = TRUE; + cpi->source_alt_ref_pending = 1; else - cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_pending = 0; }*/ } } @@ -933,6 +940,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) if (cpi->active_worst_quality <= cpi->active_best_quality) cpi->active_worst_quality = cpi->active_best_quality + 1; + if(cpi->active_worst_quality > 127) + cpi->active_worst_quality = 127; } // Unbuffered mode (eg. video conferencing) else @@ -973,7 +982,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) #endif //vpx_log("Decoder: Drop frame due to bandwidth: %d \n",cpi->buffer_level, cpi->av_per_frame_bandwidth); - cpi->drop_frame = TRUE; + cpi->drop_frame = 1; } #if 0 @@ -981,7 +990,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) else if ((cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) && (cpi->drop_count < cpi->max_drop_count) && (cpi->pass == 0)) { - cpi->drop_frame = TRUE; + cpi->drop_frame = 1; } #endif @@ -1027,11 +1036,11 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { // For one pass throw a GF if recent frame intra useage is low or the GF useage is high if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) - cpi->common.refresh_golden_frame = TRUE; + cpi->common.refresh_golden_frame = 1; // Two pass GF descision else if (cpi->pass == 2) - cpi->common.refresh_golden_frame = TRUE; + cpi->common.refresh_golden_frame = 1; } #if 0 @@ -1049,7 +1058,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) #endif - if (cpi->common.refresh_golden_frame == TRUE) + if (cpi->common.refresh_golden_frame == 1) { #if 0 @@ -1534,7 +1543,7 @@ int vp8_pick_frame_size(VP8_COMP *cpi) // Check if we're dropping the frame: if (cpi->drop_frame) { - cpi->drop_frame = FALSE; + cpi->drop_frame = 0; cpi->drop_count++; return 0; } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index e8abf848c..ce979619a 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -28,7 +28,6 @@ #include "encodemb.h" #include "quantize.h" #include "vp8/common/idct.h" -#include "vp8/common/g_common.h" #include "variance.h" #include "mcomp.h" #include "rdopt.h" @@ -100,36 +99,39 @@ const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] = B_PRED, }; -const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES] = +/* This table determines the search order in reference frame priority order, + * which may not 
necessarily match INTRA,LAST,GOLDEN,ARF
+ */
+const int vp8_ref_frame_order[MAX_MODES] =
{
-    LAST_FRAME,
-    INTRA_FRAME,
+    1,
+    0,

-    LAST_FRAME,
-    LAST_FRAME,
+    1,
+    1,

-    GOLDEN_FRAME,
-    GOLDEN_FRAME,
+    2,
+    2,

-    ALTREF_FRAME,
-    ALTREF_FRAME,
+    3,
+    3,

-    GOLDEN_FRAME,
-    ALTREF_FRAME,
+    2,
+    3,

-    INTRA_FRAME,
-    INTRA_FRAME,
-    INTRA_FRAME,
+    0,
+    0,
+    0,

-    LAST_FRAME,
-    GOLDEN_FRAME,
-    ALTREF_FRAME,
+    1,
+    2,
+    3,

-    LAST_FRAME,
-    GOLDEN_FRAME,
-    ALTREF_FRAME,
+    1,
+    2,
+    3,

-    INTRA_FRAME,
+    0,
};

 static void fill_token_costs(
@@ -285,18 +287,39 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
         }
     }

-    fill_token_costs(
-        cpi->mb.token_costs,
-        (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs
-    );
+    {
+        // build token cost array for the type of frame we have now
+        FRAME_CONTEXT *l = &cpi->lfc_n;
+
+        if(cpi->common.refresh_alt_ref_frame)
+            l = &cpi->lfc_a;
+        else if(cpi->common.refresh_golden_frame)
+            l = &cpi->lfc_g;
+
+        fill_token_costs(
+            cpi->mb.token_costs,
+            (const vp8_prob( *)[8][3][11]) l->coef_probs
+        );
+        /*
+        fill_token_costs(
+            cpi->mb.token_costs,
+            (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs);
+        */
+

-    vp8_init_mode_costs(cpi);
+        // TODO make these mode costs depend on last, alt or gold too. (jbb)
+        vp8_init_mode_costs(cpi);
+
+        // TODO figure out why making mv cost frame type dependent didn't help (jbb)
+        //vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) l->mvc, flags);
+
+    }
}

void vp8_auto_select_speed(VP8_COMP *cpi)
{
-    int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
+    int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);

     milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
@@ -552,7 +575,7 @@ static void macro_block_yrd( MACROBLOCK *mb,
     int d;

     ENCODEMB_INVOKE(rtcd, submby)( mb->src_diff, *(mb->block[0].base_src),
-        mb->e_mbd.predictor, mb->block[0].src_stride );
+        mb->block[0].src_stride, mb->e_mbd.predictor, 16);

     // Fdct and building the 2nd order block
     for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
@@ -800,7 +823,8 @@ static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
{
     vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
     ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
-        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+        &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);

     vp8_transform_mbuv(x);
     vp8_quantize_mbuv(x);
@@ -816,7 +840,8 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
{
     vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
     ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
-        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+        &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);

     vp8_transform_mbuv(x);
     vp8_quantize_mbuv(x);
@@ -845,8 +870,8 @@ static void rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int
     RECON_INVOKE(&cpi->rtcd.common->recon, build_intra_predictors_mbuv)
                 (&x->e_mbd);
     ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
-        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor,
-        x->src.uv_stride);
+        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+        &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);

     vp8_transform_mbuv(x);
     vp8_quantize_mbuv(x);
@@ -1359,8 +1384,8 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,

     if
(bsi.segment_rd < best_rd) { - int col_min = (best_ref_mv->as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv->as_mv.col & 7)?1:0); - int row_min = (best_ref_mv->as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv->as_mv.row & 7)?1:0); + int col_min = ((best_ref_mv->as_mv.col+7)>>3) - MAX_FULL_PEL_VAL; + int row_min = ((best_ref_mv->as_mv.row+7)>>3) - MAX_FULL_PEL_VAL; int col_max = (best_ref_mv->as_mv.col>>3) + MAX_FULL_PEL_VAL; int row_max = (best_ref_mv->as_mv.row>>3) + MAX_FULL_PEL_VAL; @@ -1458,57 +1483,6 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, return bsi.segment_rd; } -static void insertsortmv(int arr[], int len) -{ - int i, j, k; - - for ( i = 1 ; i <= len-1 ; i++ ) - { - for ( j = 0 ; j < i ; j++ ) - { - if ( arr[j] > arr[i] ) - { - int temp; - - temp = arr[i]; - - for ( k = i; k >j; k--) - arr[k] = arr[k - 1] ; - - arr[j] = temp ; - } - } - } -} - -static void insertsortsad(int arr[],int idx[], int len) -{ - int i, j, k; - - for ( i = 1 ; i <= len-1 ; i++ ) - { - for ( j = 0 ; j < i ; j++ ) - { - if ( arr[j] > arr[i] ) - { - int temp, tempi; - - temp = arr[i]; - tempi = idx[i]; - - for ( k = i; k >j; k--) - { - arr[k] = arr[k - 1] ; - idx[k] = idx[k - 1]; - } - - arr[j] = temp ; - idx[j] = tempi; - } - } - } -} - //The improved MV prediction void vp8_mv_pred ( @@ -1741,7 +1715,9 @@ static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv) } } -void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra) +void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra) { BLOCK *b = &x->block[0]; BLOCKD *d = &x->e_mbd.block[0]; @@ -1768,40 +1744,58 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int distortion_uv; int best_yrd = INT_MAX; - //int all_rds[MAX_MODES]; // Experimental debug code. - //int all_rates[MAX_MODES]; - //int all_dist[MAX_MODES]; - //int intermodecost[MAX_MODES]; - MB_PREDICTION_MODE uv_intra_mode; int_mv mvp; int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int saddone=0; int sr=0; //search range got from mv_pred(). It uses step_param levels. (0-7) - int_mv frame_nearest_mv[4]; - int_mv frame_near_mv[4]; - int_mv frame_best_ref_mv[4]; - int frame_mdcounts[4][4]; int frame_lf_or_gf[4]; unsigned char *y_buffer[4]; unsigned char *u_buffer[4]; unsigned char *v_buffer[4]; + int ref_frame_map[4]; + int sign_bias = 0; + vpx_memset(mode_mv, 0, sizeof(mode_mv)); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); vpx_memset(&best_bmodes, 0, sizeof(best_bmodes)); + /* Setup search priorities */ + i=0; + ref_frame_map[i++] = INTRA_FRAME; if (cpi->ref_frame_flags & VP8_LAST_FLAG) + ref_frame_map[i++] = LAST_FRAME; + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + ref_frame_map[i++] = GOLDEN_FRAME; + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + ref_frame_map[i++] = ALTREF_FRAME; + for(; i<4; i++) + ref_frame_map[i] = -1; + + /* Check to see if there is at least 1 valid reference frame that we need + * to calculate near_mvs. 
+ */ + if (ref_frame_map[1] > 0) { - YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; + vp8_find_near_mvs(&x->e_mbd, + x->e_mbd.mode_info_context, + &mode_mv[NEARESTMV], &mode_mv[NEARMV], + &best_ref_mv, + mdcounts, + ref_frame_map[1], + cpi->common.ref_frame_sign_bias); + + sign_bias = cpi->common.ref_frame_sign_bias[ref_frame_map[1]]; + } - vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[LAST_FRAME], &frame_near_mv[LAST_FRAME], - &frame_best_ref_mv[LAST_FRAME], frame_mdcounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias); + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset; u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset; v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset; - frame_lf_or_gf[LAST_FRAME] = 0; } @@ -1809,13 +1803,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int { YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx]; - vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[GOLDEN_FRAME], &frame_near_mv[GOLDEN_FRAME], - &frame_best_ref_mv[GOLDEN_FRAME], frame_mdcounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias); - y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset; u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset; v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset; - frame_lf_or_gf[GOLDEN_FRAME] = 1; } @@ -1823,13 +1813,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int { YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx]; - vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[ALTREF_FRAME], &frame_near_mv[ALTREF_FRAME], - &frame_best_ref_mv[ALTREF_FRAME], frame_mdcounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias); - y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset; u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset; v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset; - frame_lf_or_gf[ALTREF_FRAME] = 1; } @@ -1838,8 +1824,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->skip = 0; - vpx_memset(mode_mv, 0, sizeof(mode_mv)); - x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion); uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode; @@ -1850,18 +1834,15 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int lf_or_gf = 0; // Lat Frame (01) or gf/arf (1) int disable_skip = 0; int other_cost = 0; - - // Experimental debug code. - // Record of rd values recorded for this MB. -1 indicates not measured - //all_rds[mode_index] = -1; - //all_rates[mode_index] = -1; - //all_dist[mode_index] = -1; - //intermodecost[mode_index] = -1; + int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]]; // Test best rd so far against threshold for trying this mode. 
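A worked example of the indirection being set up here: vp8_ref_frame_order[] now holds priority slots (0..3) rather than frame enums, and ref_frame_map[] translates a slot into whatever references this frame actually has. With the golden frame unavailable, for instance (a hypothetical flag state, purely for illustration):

/* cpi->ref_frame_flags == (VP8_LAST_FLAG | VP8_ALT_FLAG) yields     */
/* ref_frame_map[] == { INTRA_FRAME, LAST_FRAME, ALTREF_FRAME, -1 }  */

int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];

if (this_ref_frame < 0)  /* slot maps to no available reference */
    continue;            /* mode skipped before any rd work is done */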
if (best_rd <= cpi->rd_threshes[mode_index]) continue; + if (this_ref_frame < 0) + continue; + // These variables hold are rolling total cost and distortion for this mode rate2 = 0; distortion2 = 0; @@ -1870,7 +1851,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->e_mbd.mode_info_context->mbmi.mode = this_mode; x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; - x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index]; + x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame; // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want @@ -1887,10 +1868,20 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; - mode_mv[NEARESTMV] = frame_nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - mode_mv[NEARMV] = frame_near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; - vpx_memcpy(mdcounts, frame_mdcounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts)); + + if (sign_bias != + cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame]) + { + mode_mv[NEARESTMV].as_mv.row *= -1; + mode_mv[NEARESTMV].as_mv.col *= -1; + mode_mv[NEARMV].as_mv.row *= -1; + mode_mv[NEARMV].as_mv.col *= -1; + best_ref_mv.as_mv.row *= -1; + best_ref_mv.as_mv.col *= -1; + sign_bias + = cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame]; + } + lf_or_gf = frame_lf_or_gf[x->e_mbd.mode_info_context->mbmi.ref_frame]; } @@ -1918,13 +1909,13 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to supress noise if (cpi->zbin_mode_boost_enabled) { - if ( vp8_ref_frame_order[mode_index] == INTRA_FRAME ) + if ( this_ref_frame == INTRA_FRAME ) cpi->zbin_mode_boost = 0; else { if (vp8_mode_order[mode_index] == ZEROMV) { - if (vp8_ref_frame_order[mode_index] != LAST_FRAME) + if (this_ref_frame != LAST_FRAME) cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; else cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; @@ -1969,8 +1960,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int tmp_rd; int this_rd_thresh; - this_rd_thresh = (x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME) ? cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA]; - this_rd_thresh = (x->e_mbd.mode_info_context->mbmi.ref_frame == GOLDEN_FRAME) ? cpi->rd_threshes[THR_NEWG]: this_rd_thresh; + this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ? cpi->rd_threshes[THR_NEW1] : cpi->rd_threshes[THR_NEW3]; + this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ? 
cpi->rd_threshes[THR_NEW2] : this_rd_thresh; tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, best_yrd, mdcounts, @@ -2024,8 +2015,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int sadpb = x->sadperbit16; int_mv mvp_full; - int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7)?1:0); - int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7)?1:0); + int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL; + int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL; int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL; int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL; @@ -2174,7 +2165,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int continue; vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]); - vp8_build_inter16x16_predictors_mby(&x->e_mbd); + vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16); if (cpi->active_map_enabled && x->active_ptr[0] == 0) { x->skip = 1; @@ -2294,11 +2285,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); } - // Experimental debug code. - //all_rds[mode_index] = this_rd; - //all_rates[mode_index] = rate2; - //all_dist[mode_index] = distortion2; - // Keep record of best intra distortion if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) && (this_rd < best_intra_rd) ) @@ -2399,7 +2385,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = - (cpi->common.mb_no_coeff_skip) ? 
1 : 0; + (cpi->common.mb_no_coeff_skip); x->e_mbd.mode_info_context->mbmi.partitioning = 0; return; @@ -2426,10 +2412,14 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->partition_info->bmi[15].mv.as_int; } - rd_update_mvcount(cpi, x, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]); - - + if (sign_bias + != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame]) + { + best_ref_mv.as_mv.row *= -1; + best_ref_mv.as_mv.col *= -1; + } + rd_update_mvcount(cpi, x, &best_ref_mv); } void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_) diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h index 95134cb81..5ee869903 100644 --- a/vp8/encoder/rdopt.h +++ b/vp8/encoder/rdopt.h @@ -14,6 +14,57 @@ #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) +static void insertsortmv(int arr[], int len) +{ + int i, j, k; + + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp; + + temp = arr[i]; + + for ( k = i; k >j; k--) + arr[k] = arr[k - 1] ; + + arr[j] = temp ; + } + } + } +} + +static void insertsortsad(int arr[],int idx[], int len) +{ + int i, j, k; + + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp, tempi; + + temp = arr[i]; + tempi = idx[i]; + + for ( k = i; k >j; k--) + { + arr[k] = arr[k - 1] ; + idx[k] = idx[k - 1]; + } + + arr[j] = temp ; + idx[j] = tempi; + } + } + } +} + extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue); extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate); diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c index b9ade1c6c..545e4f205 100644 --- a/vp8/encoder/temporal_filter.c +++ b/vp8/encoder/temporal_filter.c @@ -22,7 +22,6 @@ #include "ratectrl.h" #include "vp8/common/quant_common.h" #include "segmentation.h" -#include "vp8/common/g_common.h" #include "vpx_scale/yv12extend.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/swapyv12buffer.h" @@ -98,7 +97,7 @@ void vp8_temporal_filter_apply_c unsigned short *count ) { - int i, j, k; + unsigned int i, j, k; int modifier; int byte = 0; @@ -186,7 +185,7 @@ static int vp8_temporal_filter_find_matching_mb_c if (cpi->Speed < 8) { step_param = cpi->sf.first_step + - ((cpi->Speed > 5) ? 1 : 0); + (cpi->Speed > 5); further_steps = (cpi->sf.max_step_search_steps - 1)-step_param; } diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c index e81948567..8bfc47f8f 100644 --- a/vp8/encoder/tokenize.c +++ b/vp8/encoder/tokenize.c @@ -514,17 +514,19 @@ static __inline void stuff1st_order_b TOKENEXTRA **tp, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, + int type, VP8_COMP *cpi ) { int pt; /* near block/prev token context index */ + int band; TOKENEXTRA *t = *tp; /* store tokens starting here */ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); - + band = type ? 
0 : 1; t->Token = DCT_EOB_TOKEN; - t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt]; + t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt]; t->skip_eob_node = 0; - ++cpi->coef_counts [0] [1] [pt] [DCT_EOB_TOKEN]; + ++cpi->coef_counts [type] [band] [pt] [DCT_EOB_TOKEN]; ++t; *tp = t; pt = 0; /* 0 <-> all coeff data is zero */ @@ -561,15 +563,19 @@ void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context; int plane_type; int b; - - stuff2nd_order_b(t, + plane_type = 3; + if((x->mode_info_context->mbmi.mode != B_PRED + && x->mode_info_context->mbmi.mode != SPLITMV)) + { + stuff2nd_order_b(t, A + vp8_block2above[24], L + vp8_block2left[24], cpi); - plane_type = 0; + plane_type = 0; + } for (b = 0; b < 16; b++) stuff1st_order_b(t, A + vp8_block2above[b], - L + vp8_block2left[b], cpi); + L + vp8_block2left[b], plane_type, cpi); for (b = 16; b < 24; b++) stuff1st_order_buv(t, diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm index 4ce16ce90..75e8aa3c2 100644 --- a/vp8/encoder/x86/subtract_mmx.asm +++ b/vp8/encoder/x86/subtract_mmx.asm @@ -73,74 +73,71 @@ sym(vp8_subtract_b_mmx_impl): pop rbp ret -;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride) +;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, +;unsigned char *pred, int pred_stride) global sym(vp8_subtract_mby_mmx) sym(vp8_subtract_mby_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;src + movsxd rdx, dword ptr arg(2);src_stride + mov rax, arg(3) ;pred + push rbx + movsxd rbx, dword ptr arg(4);pred_stride - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride + pxor mm0, mm0 + mov rcx, 16 - mov rcx, 16 - pxor mm0, mm0 .submby_loop: + movq mm1, [rsi] + movq mm3, [rax] - movq mm1, [rsi] - movq mm3, [rax] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 + movq mm2, mm1 + movq mm4, mm3 - movq [rdi], mm1 - movq [rdi+8], mm2 + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 - movq mm1, [rsi+8] - movq mm3, [rax+8] + psubw mm1, mm3 + psubw mm2, mm4 - movq mm2, mm1 - movq mm4, mm3 + movq [rdi], mm1 + movq [rdi+8], mm2 - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 + movq mm1, [rsi+8] + movq mm3, [rax+8] - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 + movq mm2, mm1 + movq mm4, mm3 - psubw mm1, mm3 - psubw mm2, mm4 + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 - movq [rdi+16], mm1 - movq [rdi+24], mm2 + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 + psubw mm1, mm3 + psubw mm2, mm4 - add rdi, 32 - add rax, 16 - - lea rsi, [rsi+rdx] - - sub rcx, 1 - jnz .submby_loop + movq [rdi+16], mm1 + movq [rdi+24], mm2 + add rdi, 32 + lea rax, [rax+rbx] + lea rsi, [rsi+rdx] + dec rcx + jnz .submby_loop + pop rbx pop rdi pop rsi ; begin epilog @@ -149,281 +146,75 @@ sym(vp8_subtract_mby_mmx): ret -;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) + global sym(vp8_subtract_mbuv_mmx) sym(vp8_subtract_mbuv_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 + SHADOW_ARGS_TO_STACK 7 push 
rsi push rdi ; end prolog - ;short *udiff = diff + 256; - ;short *vdiff = diff + 320; - ;unsigned char *upred = pred + 256; - ;unsigned char *vpred = pred + 320; - - ;unsigned char *z = usrc; - ;unsigned short *diff = udiff; - ;unsigned char *Predictor= upred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ;unsigned char *z = vsrc; - ;unsigned short *diff = vdiff; - ;unsigned char *Predictor= vpred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(2) ;z = usrc - add rdi, 320*2 ;diff = diff + 320 (shorts) - add rax, 320 ;Predictor = pred + 320 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, 
[rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;usrc + movsxd rdx, dword ptr arg(3);src_stride; + mov rax, arg(4) ;upred + add rdi, 256*2 ;diff = diff + 256 (shorts) + mov rcx, 8 + push rbx + movsxd rbx, dword ptr arg(6);pred_stride + + pxor mm7, mm7 + +.submbu_loop: + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + add rdi, 16 + add rsi, rdx + add rax, rbx + + dec rcx + jnz .submbu_loop + + mov rsi, arg(2) ;vsrc + mov rax, arg(5) ;vpred + mov rcx, 8 + +.submbv_loop: + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + add rdi, 16 + add rsi, rdx + add rax, rbx + + dec rcx + jnz .submbv_loop + + pop rbx ; begin epilog pop rdi pop rsi diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm index 3bd1ff678..008e9c7d1 100644 --- a/vp8/encoder/x86/subtract_sse2.asm +++ b/vp8/encoder/x86/subtract_sse2.asm @@ -71,277 +71,166 @@ sym(vp8_subtract_b_sse2_impl): ret -;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride) +;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, +;unsigned char *pred, int pred_stride) global sym(vp8_subtract_mby_sse2) sym(vp8_subtract_mby_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi ; end prolog - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 8 ; do two lines at one time + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;src + movsxd rdx, dword ptr arg(2);src_stride + mov rax, arg(3) ;pred + movdqa xmm4, [GLOBAL(t80)] + push rbx + mov rcx, 8 ; do two lines at one time + movsxd rbx, dword ptr arg(4);pred_stride .submby_loop: - movdqa xmm0, XMMWORD PTR [rsi] ; src - movdqa xmm1, XMMWORD PTR [rax] ; pred + movdqa xmm0, [rsi] ; src + movdqa xmm1, [rax] ; pred - movdqa xmm2, xmm0 - psubb xmm0, xmm1 + movdqa xmm2, xmm0 + psubb xmm0, xmm1 - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, 
[GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm1 ; put sign back to subtraction - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 + movdqa xmm3, [rsi + rdx] + movdqa xmm5, [rax + rbx] - movdqa xmm4, XMMWORD PTR [rsi + rdx] - movdqa xmm5, XMMWORD PTR [rax + 16] + lea rsi, [rsi+rdx*2] + lea rax, [rax+rbx*2] - movdqa xmm6, xmm4 - psubb xmm4, xmm5 + movdqa [rdi], xmm0 + movdqa [rdi +16], xmm2 - pxor xmm5, [GLOBAL(t80)] ;convert to signed values - pxor xmm6, [GLOBAL(t80)] - pcmpgtb xmm5, xmm6 ; obtain sign information + movdqa xmm1, xmm3 + psubb xmm3, xmm5 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - punpcklbw xmm4, xmm5 ; put sign back to subtraction - punpckhbw xmm6, xmm7 ; put sign back to subtraction + pxor xmm5, xmm4 ;convert to signed values + pxor xmm1, xmm4 + pcmpgtb xmm5, xmm1 ; obtain sign information - movdqa XMMWORD PTR [rdi +32], xmm4 - movdqa XMMWORD PTR [rdi +48], xmm6 + movdqa xmm1, xmm3 + punpcklbw xmm3, xmm5 ; put sign back to subtraction + punpckhbw xmm1, xmm5 ; put sign back to subtraction - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] + movdqa [rdi +32], xmm3 + movdqa [rdi +48], xmm1 - sub rcx, 1 - jnz .submby_loop + add rdi, 64 + dec rcx + jnz .submby_loop + pop rbx pop rdi pop rsi ; begin epilog RESTORE_GOT - RESTORE_XMM UNSHADOW_ARGS pop rbp ret - -;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) global sym(vp8_subtract_mbuv_sse2) sym(vp8_subtract_mbuv_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 + SHADOW_ARGS_TO_STACK 7 GET_GOT rbx push rsi push rdi ; end prolog - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - lea rcx, [rdx + rdx*2] - - ;u - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + 
rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ;v - mov rsi, arg(2) ;z = vsrc - add rdi, 64*2 ;diff = diff + 320 (shorts) - add rax, 64 ;Predictor = pred + 320 - - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to 
subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - + movdqa xmm4, [GLOBAL(t80)] + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;usrc + movsxd rdx, dword ptr arg(3);src_stride; + mov rax, arg(4) ;upred + add rdi, 256*2 ;diff = diff + 256 (shorts) + mov rcx, 4 + push rbx + movsxd rbx, dword ptr arg(6);pred_stride + + ;u +.submbu_loop: + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] ; src -- next line + movq xmm1, [rax] ; pred + movq xmm3, [rax+rbx] ; pred -- next line + lea rsi, [rsi + rdx*2] + lea rax, [rax + rbx*2] + + punpcklqdq xmm0, xmm2 + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 ; store difference + movdqa [rdi +16], xmm2 ; store difference + add rdi, 32 + sub rcx, 1 + jnz .submbu_loop + + mov rsi, arg(2) ;vsrc + mov rax, arg(5) ;vpred + mov rcx, 4 + + ;v +.submbv_loop: + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] ; src -- next line + movq xmm1, [rax] ; pred + movq xmm3, [rax+rbx] ; pred -- next line + lea rsi, [rsi + rdx*2] + lea rax, [rax + rbx*2] + + punpcklqdq xmm0, xmm2 + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 ; store difference + movdqa [rdi +16], xmm2 ; store difference + add rdi, 32 + sub rcx, 1 + jnz .submbv_loop + + pop rbx ; begin epilog pop rdi pop rsi diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c index 92b695f17..e2524b46a 100644 --- a/vp8/encoder/x86/variance_mmx.c +++ b/vp8/encoder/x86/variance_mmx.c @@ -12,6 +12,7 @@ #include "vp8/encoder/variance.h" #include "vp8/common/pragmas.h" #include "vpx_ports/mem.h" +#include "vp8/common/x86/filter_x86.h" extern void filter_block1d_h6_mmx ( @@ -21,7 +22,7 @@ extern void filter_block1d_h6_mmx unsigned int pixel_step, unsigned int output_height, unsigned int output_width, - short *vp7_filter + short *filter ); extern void filter_block1d_v6_mmx ( @@ -31,7 +32,7 @@ extern void filter_block1d_v6_mmx unsigned int pixel_step, unsigned int output_height, unsigned int output_width, - short *vp7_filter + short *filter ); extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr); @@ -198,24 +199,6 @@ unsigned int vp8_variance8x16_mmx( } - - -/////////////////////////////////////////////////////////////////////////// -// the mmx function that does the bilinear filtering and var calculation // -// int one pass // -/////////////////////////////////////////////////////////////////////////// -DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) = -{ - { 128, 128, 128, 128, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 112, 112, 112, 112 } -}; - unsigned int vp8_sub_pixel_variance4x4_mmx ( const unsigned char *src_ptr, @@ -232,7 +215,7 @@ unsigned int 
vp8_sub_pixel_variance4x4_mmx vp8_filter_block2d_bil4x4_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum, &xxsum ); *sse = xxsum; @@ -257,7 +240,7 @@ unsigned int vp8_sub_pixel_variance8x8_mmx vp8_filter_block2d_bil_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum, &xxsum ); *sse = xxsum; @@ -283,7 +266,7 @@ unsigned int vp8_sub_pixel_variance16x16_mmx vp8_filter_block2d_bil_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum0, &xxsum0 ); @@ -291,7 +274,7 @@ unsigned int vp8_sub_pixel_variance16x16_mmx vp8_filter_block2d_bil_var_mmx( src_ptr + 8, src_pixels_per_line, dst_ptr + 8, dst_pixels_per_line, 16, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum1, &xxsum1 ); @@ -336,7 +319,7 @@ unsigned int vp8_sub_pixel_variance16x8_mmx vp8_filter_block2d_bil_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum0, &xxsum0 ); @@ -344,7 +327,7 @@ unsigned int vp8_sub_pixel_variance16x8_mmx vp8_filter_block2d_bil_var_mmx( src_ptr + 8, src_pixels_per_line, dst_ptr + 8, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum1, &xxsum1 ); @@ -371,7 +354,7 @@ unsigned int vp8_sub_pixel_variance8x16_mmx vp8_filter_block2d_bil_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum, &xxsum ); *sse = xxsum; diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index 24062eb9b..39213b03d 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -12,11 +12,12 @@ #include "vp8/encoder/variance.h" #include "vp8/common/pragmas.h" #include "vpx_ports/mem.h" +#include "vp8/common/x86/filter_x86.h" -extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int 
output_width, short *vp7_filter); +extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); extern void vp8_filter_block2d_bil4x4_var_mmx ( @@ -135,8 +136,6 @@ void vp8_half_vert_variance16x_h_sse2 unsigned int *sumsquared ); -DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]); - unsigned int vp8_variance4x4_wmt( const unsigned char *src_ptr, int source_stride, @@ -262,7 +261,7 @@ unsigned int vp8_sub_pixel_variance4x4_wmt vp8_filter_block2d_bil4x4_var_mmx( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], &xsum, &xxsum ); *sse = xxsum; diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 5c15a3e4f..0e564320f 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -9,7 +9,6 @@ ## VP8_COMMON_SRCS-yes += vp8_common.mk -VP8_COMMON_SRCS-yes += common/type_aliases.h VP8_COMMON_SRCS-yes += common/pragmas.h VP8_COMMON_SRCS-yes += common/ppflags.h VP8_COMMON_SRCS-yes += common/onyx.h @@ -20,6 +19,8 @@ VP8_COMMON_SRCS-yes += common/blockd.c VP8_COMMON_SRCS-yes += common/coefupdateprobs.h VP8_COMMON_SRCS-yes += common/debugmodes.c VP8_COMMON_SRCS-yes += common/default_coef_probs.h +VP8_COMMON_SRCS-yes += common/dequantize.c +VP8_COMMON_SRCS-yes += common/dequantize.h VP8_COMMON_SRCS-yes += common/entropy.c VP8_COMMON_SRCS-yes += common/entropymode.c VP8_COMMON_SRCS-yes += common/entropymv.c @@ -28,17 +29,16 @@ VP8_COMMON_SRCS-yes += common/filter.c VP8_COMMON_SRCS-yes += common/filter.h VP8_COMMON_SRCS-yes += common/findnearmv.c VP8_COMMON_SRCS-yes += common/generic/systemdependent.c +VP8_COMMON_SRCS-yes += common/idct_blk.c VP8_COMMON_SRCS-yes += common/idctllm.c VP8_COMMON_SRCS-yes += common/alloccommon.h VP8_COMMON_SRCS-yes += common/blockd.h VP8_COMMON_SRCS-yes += common/common.h -VP8_COMMON_SRCS-yes += common/common_types.h VP8_COMMON_SRCS-yes += common/entropy.h VP8_COMMON_SRCS-yes += common/entropymode.h VP8_COMMON_SRCS-yes += common/entropymv.h VP8_COMMON_SRCS-yes += common/extend.h VP8_COMMON_SRCS-yes += common/findnearmv.h -VP8_COMMON_SRCS-yes += common/g_common.h VP8_COMMON_SRCS-yes += common/header.h VP8_COMMON_SRCS-yes += common/idct.h VP8_COMMON_SRCS-yes += common/invtrans.h @@ -57,7 +57,6 @@ VP8_COMMON_SRCS-yes += common/swapyv12buffer.h VP8_COMMON_SRCS-yes += common/systemdependent.h VP8_COMMON_SRCS-yes += common/threading.h VP8_COMMON_SRCS-yes += common/treecoder.h -VP8_COMMON_SRCS-yes += common/invtrans.c VP8_COMMON_SRCS-yes += common/loopfilter.c VP8_COMMON_SRCS-yes += common/loopfilter_filters.c VP8_COMMON_SRCS-yes += common/mbpitch.c @@ -69,9 +68,15 @@ VP8_COMMON_SRCS-yes += common/reconintra.c 
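The variance code above now pulls its two-tap weights from the shared vp8_bilinear_filters_x86_4 table added in the new common/x86/filter_x86 files instead of a file-local copy. As a rough scalar reference for what those SIMD routines compute (the tap pairs match the removed vp8_vp7_bilinear_filters_mmx entries, where each tap is stored four times over for the SIMD width; the helper name here is illustrative, not part of libvpx):

    /* Two-tap bilinear filter over one row, as a scalar sketch.
     * Taps for each 1/8-pel offset sum to 128 (VP8_FILTER_WEIGHT) and
     * results are rounded with (x + 64) >> 7 (VP8_FILTER_SHIFT). */
    #define VP8_FILTER_SHIFT  7
    #define VP8_FILTER_WEIGHT 128

    static const short bilinear_taps[8][2] = {
        { 128,   0 }, { 112,  16 }, { 96, 32 }, { 80, 48 },
        {  64,  64 }, {  48,  80 }, { 32, 96 }, { 16, 112 }
    };

    static void bilinear_filter_row(const unsigned char *src,
                                    unsigned char *dst,
                                    int width, int offset /* 0..7 */)
    {
        const short f0 = bilinear_taps[offset][0];
        const short f1 = bilinear_taps[offset][1];
        int i;

        for (i = 0; i < width; i++)
        {
            int sum = src[i] * f0 + src[i + 1] * f1
                      + (VP8_FILTER_WEIGHT / 2);
            dst[i] = (unsigned char)(sum >> VP8_FILTER_SHIFT);
        }
    }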
VP8_COMMON_SRCS-yes += common/reconintra4x4.c VP8_COMMON_SRCS-yes += common/setupintrarecon.c VP8_COMMON_SRCS-yes += common/swapyv12buffer.c + + + VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c VP8_COMMON_SRCS-yes += common/treecoder.c +VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/dequantize_x86.h +VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c +VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/recon_x86.h @@ -82,11 +87,14 @@ VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c @@ -104,8 +112,6 @@ endif # common (c) VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c -VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c -VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/idct_arm.h VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c @@ -113,8 +119,12 @@ VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.h VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/recon_arm.h VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/subpixel_arm.h +VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c +VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.h # common (armv6) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.c +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.h VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM) @@ -127,6 +137,9 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/intra4x4_predict_v6$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dequant_idct_v6$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dequantize_v6$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_blk_v6.c # common (neon) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict4x4_neon$(ASM) @@ -149,3 +162,8 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM) 
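The dequantization and inverse-transform sources listed above move out of the decoder into common so encoder and decoder builds can share them. As a minimal scalar sketch of what the block dequantizer behind the ARMv6/NEON files computes (the real routines operate on BLOCKD fields and assembly-level layouts; this standalone form is illustrative):

    /* Dequantize one 4x4 block: scale each of the 16 quantized
     * coefficients by its per-position dequantization factor. */
    static void dequantize_block(const short *qcoeff,   /* quantized input */
                                 const short *dequant,  /* per-position factors */
                                 short *dqcoeff)        /* dequantized output */
    {
        int i;

        for (i = 0; i < 16; i++)
            dqcoeff[i] = qcoeff[i] * dequant[i];
    }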
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dequant_idct_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/idct_dequant_full_2x_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/idct_dequant_0_2x_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dequantizeb_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/idct_blk_neon.c diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 7260e942b..6181ee8ee 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -83,7 +83,7 @@ struct vpx_codec_alg_priv vpx_codec_enc_cfg_t cfg; struct vp8_extracfg vp8_cfg; VP8_CONFIG oxcf; - VP8_PTR cpi; + struct VP8_COMP *cpi; unsigned char *cx_data; unsigned int cx_data_sz; vpx_image_t preview_img; @@ -137,7 +137,8 @@ update_error_state(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, - const struct vp8_extracfg *vp8_cfg) + const struct vp8_extracfg *vp8_cfg, + int finalize) { RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */ RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */ @@ -193,6 +194,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6); RANGE_CHECK(vp8_cfg, arnr_type, 1, 3); RANGE_CHECK(vp8_cfg, cq_level, 0, 63); + if(finalize && cfg->rc_end_usage == VPX_CQ) + RANGE_CHECK(vp8_cfg, cq_level, + cfg->rc_min_quantizer, cfg->rc_max_quantizer); #if !(CONFIG_REALTIME_ONLY) if (cfg->g_pass == VPX_RC_LAST_PASS) @@ -264,21 +268,15 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, vpx_codec_enc_cfg_t cfg, - struct vp8_extracfg vp8_cfg) + struct vp8_extracfg vp8_cfg, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { oxcf->multi_threaded = cfg.g_threads; oxcf->Version = cfg.g_profile; oxcf->Width = cfg.g_w; oxcf->Height = cfg.g_h; - /* guess a frame rate if out of whack, use 30 */ - oxcf->frame_rate = (double)(cfg.g_timebase.den) / - (double)(cfg.g_timebase.num); - - if (oxcf->frame_rate > 180) - { - oxcf->frame_rate = 30; - } + oxcf->timebase = cfg.g_timebase; oxcf->error_resilient_mode = cfg.g_error_resilient; @@ -362,6 +360,21 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id)); } +#if CONFIG_MULTI_RES_ENCODING + /* When mr_cfg is NULL, oxcf->mr_total_resolutions and oxcf->mr_encoder_id + * are both memset to 0, which ensures the correct logic under this + * situation. 
+ */ + if(mr_cfg) + { + oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions; + oxcf->mr_encoder_id = mr_cfg->mr_encoder_id; + oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num; + oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den; + oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info; + } +#endif + //oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile; //strcpy(oxcf->first_pass_file, cfg.g_firstpass_file); @@ -434,12 +447,12 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)) ERROR("Cannot increase lag_in_frames"); - res = validate_config(ctx, cfg, &ctx->vp8_cfg); + res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0); if (!res) { ctx->cfg = *cfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); } @@ -500,26 +513,50 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, } - res = validate_config(ctx, &ctx->cfg, &xcfg); + res = validate_config(ctx, &ctx->cfg, &xcfg, 0); if (!res) { ctx->vp8_cfg = xcfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); } return res; #undef MAP } -static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) + +static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc) +{ + vpx_codec_err_t res = 0; + +#if CONFIG_MULTI_RES_ENCODING + int mb_rows = ((cfg->g_w + 15) >>4); + int mb_cols = ((cfg->g_h + 15) >>4); + + *mem_loc = calloc(mb_rows*mb_cols, sizeof(LOWER_RES_INFO)); + if(!(*mem_loc)) + { + free(*mem_loc); + res = VPX_CODEC_MEM_ERROR; + } + else + res = VPX_CODEC_OK; +#endif + + return res; +} + +static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { vpx_codec_err_t res = VPX_DEC_OK; struct vpx_codec_alg_priv *priv; vpx_codec_enc_cfg_t *cfg; unsigned int i; - VP8_PTR optr; + struct VP8_COMP *optr; if (!ctx->priv) { @@ -573,13 +610,20 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) vp8_initialize(); - res = validate_config(priv, &priv->cfg, &priv->vp8_cfg); + res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0); if (!res) { + if(mr_cfg) + ctx->priv->enc.total_encoders = mr_cfg->mr_total_resolutions; + else + ctx->priv->enc.total_encoders = 1; + set_vp8e_config(&ctx->priv->alg_priv->oxcf, ctx->priv->alg_priv->cfg, - ctx->priv->alg_priv->vp8_cfg); + ctx->priv->alg_priv->vp8_cfg, + mr_cfg); + optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf); if (!optr) @@ -594,6 +638,11 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) { +#if CONFIG_MULTI_RES_ENCODING + /* Free multi-encoder shared memory */ + if (ctx->oxcf.mr_total_resolutions > 0 && (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions-1)) + free(ctx->oxcf.mr_low_res_mode_info); +#endif free(ctx->cx_data); vp8_remove_compressor(&ctx->cpi); @@ -691,6 +740,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (img) res = validate_img(ctx, img); + if (!res) + res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1); + pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -1230,6 +1282,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = vp8e_set_config, NOT_IMPLEMENTED, vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; @@ -1314,5 +1367,6 @@ 
vpx_codec_iface_t vpx_enc_vp8_algo = vp8e_set_config, NOT_IMPLEMENTED, vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index cdfcd2142..fbe58171c 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -57,7 +57,7 @@ struct vpx_codec_alg_priv vp8_stream_info_t si; int defer_alloc; int decoder_init; - VP8D_PTR pbi; + struct VP8D_COMP *pbi; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; #if CONFIG_POSTPROC_VISUALIZER @@ -181,9 +181,11 @@ static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) /* nothing to clean up */ } -static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) +static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; + (void) data; /* This function only allocates space for the vpx_codec_alg_priv_t * structure. More memory may be required at the time the stream @@ -387,7 +389,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (!res) { VP8D_CONFIG oxcf; - VP8D_PTR optr; + struct VP8D_COMP* optr; vp8dx_initialize(); @@ -564,7 +566,7 @@ static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx, if (done && !res) { vp8_finalize_mmaps(ctx->priv->alg_priv); - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); } return res; diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index b71a54aea..2d99981f5 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -86,6 +86,8 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c VP8_CX_SRCS-yes += encoder/temporal_filter.h +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk index d88b595fb..d6dc15305 100644 --- a/vp8/vp8dx.mk +++ b/vp8/vp8dx.mk @@ -52,7 +52,6 @@ VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c VP8_DX_SRCS-yes += decoder/dboolhuff.c VP8_DX_SRCS-yes += decoder/decodemv.c VP8_DX_SRCS-yes += decoder/decodframe.c -VP8_DX_SRCS-yes += decoder/dequantize.c VP8_DX_SRCS-yes += decoder/detokenize.c VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h @@ -61,20 +60,14 @@ VP8_DX_SRCS-yes += decoder/generic/dsystemdependent.c VP8_DX_SRCS-yes += decoder/dboolhuff.h VP8_DX_SRCS-yes += decoder/decodemv.h VP8_DX_SRCS-yes += decoder/decoderthreading.h -VP8_DX_SRCS-yes += decoder/dequantize.h VP8_DX_SRCS-yes += decoder/detokenize.h VP8_DX_SRCS-yes += decoder/onyxd_int.h VP8_DX_SRCS-yes += decoder/treereader.h VP8_DX_SRCS-yes += decoder/onyxd_if.c VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c -VP8_DX_SRCS-yes += decoder/idct_blk.c VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes)) -VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/dequantize_x86.h VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c -VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm -VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c -VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk index 6bde42f4c..fa1aaea0b 100644 --- a/vp8/vp8dx_arm.mk +++ b/vp8/vp8dx_arm.mk @@ -12,20 +12,3 @@ 
#VP8_DX_SRCS list is modified according to different platforms. VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/arm_dsystemdependent.c -VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c -VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.h - -#File list for armv6 -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c - -#File list for neon -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c diff --git a/vp8_multi_resolution_encoder.c b/vp8_multi_resolution_encoder.c new file mode 100644 index 000000000..11c33d618 --- /dev/null +++ b/vp8_multi_resolution_encoder.c @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * This is an example demonstrating multi-resolution encoding in VP8. + * High-resolution input video is down-sampled to lower-resolutions. The + * encoder then encodes the video and outputs multiple bitstreams with + * different resolutions. + */ +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include "math.h" +#define VPX_CODEC_DISABLE_COMPAT 1 +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" +#include "vpx_ports/mem_ops.h" +#define interface (vpx_codec_vp8_cx()) +#define fourcc 0x30385056 + +#define IVF_FILE_HDR_SZ (32) +#define IVF_FRAME_HDR_SZ (12) + +/* + * The input video frame is downsampled several times to generate a multi-level + * hierarchical structure. NUM_ENCODERS is defined as the number of encoding + * levels required. For example, if the size of input video is 1280x720, + * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3 + * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and + * 320x180(level 2) respectively. + */ +#define NUM_ENCODERS 3 + +/* This example uses the scaler function in libyuv. */ +#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "third_party/libyuv/include/libyuv/scale.h" +#include "third_party/libyuv/include/libyuv/cpu_id.h" + +static double vp8_mse2psnr(double Samples, double Peak, double Mse) +{ + double psnr; + + if ((double)Mse > 0.0) + psnr = 10.0 * log10(Peak * Peak * Samples / Mse); + else + psnr = 60; // Limit to prevent / 0 + + if (psnr > 60) + psnr = 60; + + return psnr; +} + +static void die(const char *fmt, ...) 
{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + if(fmt[strlen(fmt)-1] != '\n') + printf("\n"); + exit(EXIT_FAILURE); +} + +static void die_codec(vpx_codec_ctx_t *ctx, const char *s) { + const char *detail = vpx_codec_error_detail(ctx); + + printf("%s: %s\n", s, vpx_codec_error(ctx)); + if(detail) + printf(" %s\n",detail); + exit(EXIT_FAILURE); +} + +int (*read_frame_p)(FILE *f, vpx_image_t *img); + +static int read_frame(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + + to_read = img->w*img->h*3/2; + nbytes = fread(img->planes[0], 1, to_read, f); + if(nbytes != to_read) { + res = 0; + if(nbytes > 0) + printf("Warning: Read partial frame. Check your width & height!\n"); + } + return res; +} + +static int read_frame_by_row(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + int plane; + + for (plane = 0; plane < 3; plane++) + { + unsigned char *ptr; + int w = (plane ? (1 + img->d_w) / 2 : img->d_w); + int h = (plane ? (1 + img->d_h) / 2 : img->d_h); + int r; + + /* Determine the correct plane based on the image format. The for-loop + * always counts in Y,U,V order, but this may not match the order of + * the data on disk. + */ + switch (plane) + { + case 1: + ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U]; + break; + case 2: + ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V]; + break; + default: + ptr = img->planes[plane]; + } + + for (r = 0; r < h; r++) + { + to_read = w; + + nbytes = fread(ptr, 1, to_read, f); + if(nbytes != to_read) { + res = 0; + if(nbytes > 0) + printf("Warning: Read partial frame. Check your width & height!\n"); + break; + } + + ptr += img->stride[plane]; + } + if (!res) + break; + } + + return res; +} + +static void write_ivf_file_header(FILE *outfile, + const vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + char header[32]; + + if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS) + return; + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header+4, 0); /* version */ + mem_put_le16(header+6, 32); /* headersize */ + mem_put_le32(header+8, fourcc); /* headersize */ + mem_put_le16(header+12, cfg->g_w); /* width */ + mem_put_le16(header+14, cfg->g_h); /* height */ + mem_put_le32(header+16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header+20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header+24, frame_cnt); /* length */ + mem_put_le32(header+28, 0); /* unused */ + + if(fwrite(header, 1, 32, outfile)); +} + +static void write_ivf_frame_header(FILE *outfile, + const vpx_codec_cx_pkt_t *pkt) +{ + char header[12]; + vpx_codec_pts_t pts; + + if(pkt->kind != VPX_CODEC_CX_FRAME_PKT) + return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header+4, pts&0xFFFFFFFF); + mem_put_le32(header+8, pts >> 32); + + if(fwrite(header, 1, 12, outfile)); +} + +int main(int argc, char **argv) +{ + FILE *infile, *outfile[NUM_ENCODERS]; + vpx_codec_ctx_t codec[NUM_ENCODERS]; + vpx_codec_enc_cfg_t cfg[NUM_ENCODERS]; + vpx_codec_pts_t frame_cnt = 0; + vpx_image_t raw[NUM_ENCODERS]; + vpx_codec_err_t res[NUM_ENCODERS]; + + int i; + long width; + long height; + int frame_avail; + int got_data; + int flags = 0; + + /*Currently, only realtime mode is supported in multi-resolution encoding.*/ + int arg_deadline = VPX_DL_REALTIME; + + /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you + don't need to know PSNR, which will skip PSNR calculation and save + encoding time. 
*/ + int show_psnr = 0; + uint64_t psnr_sse_total[NUM_ENCODERS] = {0}; + uint64_t psnr_samples_total[NUM_ENCODERS] = {0}; + double psnr_totals[NUM_ENCODERS][4] = {{0,0}}; + int psnr_count[NUM_ENCODERS] = {0}; + + /* Set the required target bitrates for each resolution level. */ + unsigned int target_bitrate[NUM_ENCODERS]={1400, 500, 100}; + /* Enter the frame rate of the input video */ + int framerate = 30; + /* Set down-sampling factor for each resolution level. + dsf[0] controls down sampling from level 0 to level 1; + dsf[1] controls down sampling from level 1 to level 2; + dsf[2] is not used. */ + vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}}; + + if(argc!= (5+NUM_ENCODERS)) + die("Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n", + argv[0]); + + printf("Using %s\n",vpx_codec_iface_name(interface)); + + width = strtol(argv[1], NULL, 0); + height = strtol(argv[2], NULL, 0); + + if(width < 16 || width%2 || height <16 || height%2) + die("Invalid resolution: %ldx%ld", width, height); + + /* Open input video file for encoding */ + if(!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading", argv[3]); + + /* Open output file for each encoder to output bitstreams */ + for (i=0; i< NUM_ENCODERS; i++) + { + if(!(outfile[i] = fopen(argv[i+4], "wb"))) + die("Failed to open %s for writing", argv[i+4]); + } + + show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0); + + /* Populate default encoder configuration */ + for (i=0; i< NUM_ENCODERS; i++) + { + res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0); + if(res[i]) { + printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i])); + return EXIT_FAILURE; + } + } + + /* + * Update the default configuration according to needs of the application. + */ + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].g_threads = 1; /* number of threads used */ + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 4; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 98; + cfg[0].rc_overshoot_pct = 100; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + //cfg[0].rc_dropframe_thresh = 10; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + /* Disable automatic keyframe placement */ + //cfg[0].kf_mode = VPX_KF_DISABLED; + cfg[0].kf_min_dist = cfg[0].kf_max_dist = 1000; + + cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + /* Other-resolution encoder settings */ + for (i=1; i< NUM_ENCODERS; i++) + { + memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t)); + + cfg[i].g_threads = 1; /* number of threads used */ + cfg[i].rc_target_bitrate = target_bitrate[i]; + + /* Note: Width & height of other-resolution encoders are calculated + * from the highest-resolution encoder's size and the corresponding + * down_sampling_factor. + */ + { + unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1; + unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1; + cfg[i].g_w = iw/dsf[i-1].num; + cfg[i].g_h = ih/dsf[i-1].num; + } + + /* Make width & height to be multiplier of 2. */ + // Should support odd size ??? 
+ if((cfg[i].g_w)%2)cfg[i].g_w++; + if((cfg[i].g_h)%2)cfg[i].g_h++; + } + + /* Allocate image for each encoder */ + for (i=0; i< NUM_ENCODERS; i++) + if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) + die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); + + if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w) + read_frame_p = read_frame; + else + read_frame_p = read_frame_by_row; + + for (i=0; i< NUM_ENCODERS; i++) + write_ivf_file_header(outfile[i], &cfg[i], 0); + + /* Initialize multi-encoder */ + if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS, + (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0])) + die_codec(&codec[0], "Failed to initialize encoder"); + + /* The extra encoding configuration parameters can be set as follows. */ + /* Set encoding speed */ + for ( i=0; i<NUM_ENCODERS; i++) + { + int speed = -6; + if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed)) + die_codec(&codec[i], "Failed to set cpu_used"); + } + /* Set static thresh for highest-resolution encoder. Set it to 1000 for + * better performance. */ + { + unsigned int static_thresh = 1000; + if(vpx_codec_control(&codec[0], VP8E_SET_STATIC_THRESHOLD, static_thresh)) + die_codec(&codec[0], "Failed to set static threshold"); + } + /* Set static thresh = 0 for other encoders for better quality */ + for ( i=1; i<NUM_ENCODERS; i++) + { + unsigned int static_thresh = 0; + if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh)) + die_codec(&codec[i], "Failed to set static threshold"); + } + + frame_avail = 1; + got_data = 0; + + while(frame_avail || got_data) + { + vpx_codec_iter_t iter[NUM_ENCODERS]={NULL}; + const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS]; + + flags = 0; + frame_avail = read_frame_p(infile, &raw[0]); + + if(frame_avail) + { + for ( i=1; i<NUM_ENCODERS; i++) + { + /*Scale the image down a number of times by downsampling factor*/ + /* FilterMode 1 or 2 give better psnr than FilterMode 0. */ + I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y], + raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U], + raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V], + raw[i-1].d_w, raw[i-1].d_h, + raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y], + raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U], + raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V], + raw[i].d_w, raw[i].d_h, 1); + } + } + + /* Encode each frame at multi-levels */ + if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL, + frame_cnt, 1, flags, arg_deadline)) + die_codec(&codec[0], "Failed to encode frame"); + + for (i=NUM_ENCODERS-1; i>=0 ; i--) + { + got_data = 0; + + while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) ) + { + got_data = 1; + switch(pkt[i]->kind) { + case VPX_CODEC_CX_FRAME_PKT: + write_ivf_frame_header(outfile[i], pkt[i]); + if(fwrite(pkt[i]->data.frame.buf, 1, pkt[i]->data.frame.sz, + outfile[i])); + break; + case VPX_CODEC_PSNR_PKT: + if (show_psnr) + { + int j; + + psnr_sse_total[i] += pkt[i]->data.psnr.sse[0]; + psnr_samples_total[i] += pkt[i]->data.psnr.samples[0]; + for (j = 0; j < 4; j++) + { + //fprintf(stderr, "%.3lf ", pkt[i]->data.psnr.psnr[j]); + psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j]; + } + psnr_count[i]++; + } + + break; + default: + break; + } + printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT + && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? 
"K":"."); + fflush(stdout); + } + } + frame_cnt++; + } + printf("\n"); + + fclose(infile); + + for (i=0; i< NUM_ENCODERS; i++) + { + printf("Processed %ld frames.\n",(long int)frame_cnt-1); + + /* Calculate PSNR and print it out */ + if ( (show_psnr) && (psnr_count[i]>0) ) + { + int j; + double ovpsnr = vp8_mse2psnr(psnr_samples_total[i], 255.0, + psnr_sse_total[i]); + + fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i); + + fprintf(stderr, " %.3lf", ovpsnr); + for (j = 0; j < 4; j++) + { + fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]); + } + } + + if(vpx_codec_destroy(&codec[i])) + die_codec(&codec[i], "Failed to destroy codec"); + + /* Try to rewrite the file header with the actual frame count */ + if(!fseek(outfile[i], 0, SEEK_SET)) + write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1); + fclose(outfile[i]); + + vpx_img_free(&raw[i]); + } + + return EXIT_SUCCESS; +} diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index a1ff1921e..0703d6a4f 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -56,9 +56,10 @@ * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_CODEC_INTERNAL_ABI_VERSION (3) /**<\hideinitializer*/ +#define VPX_CODEC_INTERNAL_ABI_VERSION (4) /**<\hideinitializer*/ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; +typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; /*!\brief init function pointer prototype * @@ -73,7 +74,8 @@ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; * \retval #VPX_CODEC_MEM_ERROR * Memory operation failed. */ -typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx); +typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data); /*!\brief destroy function pointer prototype * @@ -264,6 +266,10 @@ typedef vpx_fixed_buf_t * typedef vpx_image_t * (*vpx_codec_get_preview_frame_fn_t)(vpx_codec_alg_priv_t *ctx); +typedef vpx_codec_err_t +(*vpx_codec_enc_mr_get_mem_loc_fn_t)(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc); + /*!\brief usage configuration mapping * * This structure stores the mapping between usage identifiers and @@ -309,8 +315,9 @@ struct vpx_codec_iface vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ vpx_codec_enc_config_set_fn_t cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ - vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ + vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ vpx_codec_get_preview_frame_fn_t get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ + vpx_codec_enc_mr_get_mem_loc_fn_t mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ } enc; }; @@ -353,9 +360,21 @@ struct vpx_codec_priv unsigned int cx_data_pad_before; unsigned int cx_data_pad_after; vpx_codec_cx_pkt_t cx_data_pkt; + unsigned int total_encoders; } enc; }; +/* + * Multi-resolution encoding internal configuration + */ +struct vpx_codec_priv_enc_mr_cfg +{ + unsigned int mr_total_resolutions; + unsigned int mr_encoder_id; + struct vpx_rational mr_down_sampling_factor; + void* mr_low_res_mode_info; +}; + #undef VPX_CTRL_USE_TYPE #define VPX_CTRL_USE_TYPE(id, typ) \ static typ id##__value(va_list args) {return va_arg(args, typ);} \ diff --git a/vpx/src/vpx_decoder.c 
b/vpx/src/vpx_decoder.c index 5d31c2c49..59a783dd9 100644 --- a/vpx/src/vpx_decoder.c +++ b/vpx/src/vpx_decoder.c @@ -56,7 +56,7 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, if (!(flags & VPX_CODEC_USE_XMA)) { - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); if (res) { diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 5e86835ea..03ddc62b2 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -51,7 +51,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, ctx->priv = NULL; ctx->init_flags = flags; ctx->config.enc = cfg; - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); if (res) { @@ -66,6 +66,85 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } +vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + int num_enc, + vpx_codec_flags_t flags, + vpx_rational_t *dsf, + int ver) +{ + vpx_codec_err_t res = 0; + + if (ver != VPX_ENCODER_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1)) + res = VPX_CODEC_INVALID_PARAM; + else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_PSNR) + && !(iface->caps & VPX_CODEC_CAP_PSNR)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) + && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) + res = VPX_CODEC_INCAPABLE; + else + { + int i; + void *mem_loc = NULL; + + if(!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) + { + for (i = 0; i < num_enc; i++) + { + vpx_codec_priv_enc_mr_cfg_t mr_cfg; + + /* Validate down-sampling factor. */ + if(dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || + dsf->den > dsf->num) + { + res = VPX_CODEC_INVALID_PARAM; + break; + } + + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc-1-i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); + + if (res) + { + ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; + vpx_codec_destroy(ctx); + } + + if (ctx->priv) + ctx->priv->iface = ctx->iface; + + if (res) + break; + + ctx++; + cfg++; + dsf++; + } + } + } + + return SAVE_STATUS(ctx, res); +} vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, @@ -123,7 +202,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, vpx_enc_frame_flags_t flags, unsigned long deadline) { - vpx_codec_err_t res; + vpx_codec_err_t res = 0; if (!ctx || (img && !duration)) res = VPX_CODEC_INVALID_PARAM; @@ -136,9 +215,37 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, /* Execute in a normalized floating point environment, if the platform * requires it. 
*/ + unsigned int num_enc =ctx->priv->enc.total_encoders; + FLOATING_POINT_INIT(); - res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, - duration, flags, deadline); + + if (num_enc == 1) + res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, + duration, flags, deadline); + else + { + /* Multi-resolution encoding: + * Encode multi-levels in reverse order. For example, + * if mr_total_resolutions = 3, first encode level 2, + * then encode level 1, and finally encode level 0. + */ + int i; + + ctx += num_enc - 1; + if (img) img += num_enc - 1; + + for (i = num_enc-1; i >= 0; i--) + { + if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, + duration, flags, deadline))) + break; + + ctx--; + if (img) img--; + } + ctx++; + } + FLOATING_POINT_RESTORE(); } diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c index 7a4e27062..336b6e29d 100644 --- a/vpx/src/vpx_image.c +++ b/vpx/src/vpx_image.c @@ -13,10 +13,42 @@ #include <string.h> #include "vpx/vpx_image.h" +#define ADDRESS_STORAGE_SIZE sizeof(size_t) +/*returns an addr aligned to the byte boundary specified by align*/ +#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align)) + +/* Memalign code is copied from vpx_mem.c */ +static void *img_buf_memalign(size_t align, size_t size) +{ + void *addr, + * x = NULL; + + addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE); + + if (addr) + { + x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align); + /* save the actual malloc address */ + ((size_t *)x)[-1] = (size_t)addr; + } + + return x; +} + +static void img_buf_free(void *memblk) +{ + if (memblk) + { + void *addr = (void *)(((size_t *)memblk)[-1]); + free(addr); + } +} + static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, + unsigned int buf_align, unsigned int stride_align, unsigned char *img_data) { @@ -25,6 +57,14 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, int align; /* Treat align==0 like align==1 */ + if (!buf_align) + buf_align = 1; + + /* Validate alignment (must be power of 2) */ + if (buf_align & (buf_align - 1)) + goto fail; + + /* Treat align==0 like align==1 */ if (!stride_align) stride_align = 1; @@ -119,7 +159,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, if (!img_data) { - img->img_data = malloc((fmt & VPX_IMG_FMT_PLANAR) ? h * w * bps / 8 : h * s); + img->img_data = img_buf_memalign(buf_align, ((fmt & VPX_IMG_FMT_PLANAR)? + h * s * bps / 8 : h * s)); img->img_data_owner = 1; } @@ -150,9 +191,9 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, - unsigned int stride_align) + unsigned int align) { - return img_alloc_helper(img, fmt, d_w, d_h, stride_align, NULL); + return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL); } vpx_image_t *vpx_img_wrap(vpx_image_t *img, @@ -162,7 +203,9 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, unsigned int stride_align, unsigned char *img_data) { - return img_alloc_helper(img, fmt, d_w, d_h, stride_align, img_data); + /* By setting buf_align = 1, we don't change buffer alignment in this + * function. 
*/ + return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data); } int vpx_img_set_rect(vpx_image_t *img, @@ -254,7 +297,7 @@ void vpx_img_free(vpx_image_t *img) if (img) { if (img->img_data && img->img_data_owner) - free(img->img_data); + img_buf_free(img->img_data); if (img->self_allocd) free(img); diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 87ab20c75..885ca229f 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -634,7 +634,6 @@ extern "C" { * then ts_layer_id = (0,1,0,1,0,1,0,1). */ unsigned int ts_layer_id[MAX_PERIODICITY]; - } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ @@ -675,6 +674,48 @@ extern "C" { vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION) + /*!\brief Initialize multi-encoder instance + * + * Initializes multi-encoder context using the given interface. + * Applications should call the vpx_codec_enc_init_multi convenience macro + * instead of this function directly, to ensure that the ABI version number + * parameter is properly initialized. + * + * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags + * parameter), the storage pointed to by the cfg parameter must be + * kept readable and stable until all memory maps have been set. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] num_enc Total number of encoders. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] dsf Pointer to down-sampling factors. + * \param[in] ver ABI version number. Must be set to + * VPX_ENCODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ + vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + int num_enc, + vpx_codec_flags_t flags, + vpx_rational_t *dsf, + int ver); + + + /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \ + vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \ + VPX_ENCODER_ABI_VERSION) + + /*!\brief Get a default configuration * * Initializes a encoder configuration structure with default values. Supports @@ -780,7 +821,6 @@ extern "C" { vpx_enc_frame_flags_t flags, unsigned long deadline); - /*!\brief Set compressed data output buffer * * Sets the buffer that the codec should output the compressed data diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h index 8e08b3642..3e424470f 100644 --- a/vpx/vpx_image.h +++ b/vpx/vpx_image.h @@ -160,7 +160,8 @@ extern "C" { * \param[in] fmt Format for the image * \param[in] d_w Width of the image * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of each row in the image. + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image(stride). * * \return Returns a pointer to the initialized image descriptor. 
If the img * parameter is non-null, the value of the img parameter will be diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h index 608760f8b..218bca773 100644 --- a/vpx/vpx_integer.h +++ b/vpx/vpx_integer.h @@ -29,16 +29,8 @@ typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; #endif -#ifdef HAVE_ARMV6 -typedef unsigned int int_fast16_t; -#else -typedef signed short int_fast16_t; -#endif -typedef signed char int_fast8_t; -typedef unsigned char uint_fast8_t; - #ifndef _UINTPTR_T_DEFINED -typedef unsigned int uintptr_t; +typedef size_t uintptr_t; #endif #else @@ -32,7 +32,7 @@ #include "nestegg/include/nestegg/nestegg.h" #if CONFIG_OS_SUPPORT -#if defined(_WIN32) +#if defined(_MSC_VER) #include <io.h> #define snprintf _snprintf #define isatty _isatty @@ -47,9 +47,11 @@ typedef __int64 off_t; #define fseeko _fseeki64 #define ftello _ftelli64 #elif defined(_WIN32) -/* MinGW defines off_t, and uses f{seek,tell}o64 */ +/* MinGW defines off_t as long + and uses f{seek,tell}o64/off64_t for large files */ #define fseeko fseeko64 #define ftello ftello64 +#define off_t off64_t #endif #if defined(_MSC_VER) @@ -805,7 +807,7 @@ write_webm_file_footer(EbmlGlobal *glob, long hash) { EbmlLoc start; - int i; + unsigned int i; glob->cue_pos = ftello(glob->stream); Ebml_StartSubElement(glob, &start, Cues); @@ -1440,7 +1442,8 @@ static void show_rate_histogram(struct rate_hist *hist, show_histogram(hist->bucket, buckets, hist->total, scale); } -#define ARG_CTRL_CNT_MAX 10 +#define NELEMENTS(x) (sizeof(x)/sizeof(x[0])) +#define ARG_CTRL_CNT_MAX NELEMENTS(vp8_arg_ctrl_map) int main(int argc, const char **argv_) { @@ -1719,14 +1722,26 @@ int main(int argc, const char **argv_) { if (arg_match(&arg, ctrl_args[i], argi)) { + int j; match = 1; - if (arg_ctrl_cnt < ARG_CTRL_CNT_MAX) + /* Point either to the next free element or the first + * instance of this control. + */ + for(j=0; j<arg_ctrl_cnt; j++) + if(arg_ctrls[j][0] == ctrl_args_map[i]) + break; + + /* Update/insert */ + assert(j < ARG_CTRL_CNT_MAX); + if (j < ARG_CTRL_CNT_MAX) { - arg_ctrls[arg_ctrl_cnt][0] = ctrl_args_map[i]; - arg_ctrls[arg_ctrl_cnt][1] = arg_parse_enum_or_int(&arg); - arg_ctrl_cnt++; + arg_ctrls[j][0] = ctrl_args_map[i]; + arg_ctrls[j][1] = arg_parse_enum_or_int(&arg); + if(j == arg_ctrl_cnt) + arg_ctrl_cnt++; } + } }
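The update/insert pass above stops repeated codec-control arguments from overflowing vpxenc's fixed-size table: the last occurrence of a control ID overwrites the earlier entry instead of appending a duplicate. A standalone sketch of the same pattern, assuming a hypothetical (id, value) table rather than vpxenc's own arrays:

    #define CTRL_CNT_MAX 16

    /* Update an existing (id, value) entry or append a new one.
     * Returns 0 on success, -1 if the table is full. */
    static int set_ctrl(int ctrls[][2], int *cnt, int id, int value)
    {
        int j;

        /* Find the first entry with this id, or the next free slot. */
        for (j = 0; j < *cnt; j++)
            if (ctrls[j][0] == id)
                break;

        if (j >= CTRL_CNT_MAX)
            return -1;

        ctrls[j][0] = id;
        ctrls[j][1] = value;
        if (j == *cnt)
            (*cnt)++;   /* appended rather than updated */

        return 0;
    }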
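Elsewhere in this change, the multi-resolution encoder example derives each lower resolution from the level above by a rounded-up division by the down-sampling factor, then forces the result even. A small sketch of that arithmetic (hypothetical helper; with dsf = 2/1, a 1280x720 input yields 640x360, then 320x180):

    /* Scale one dimension down by num/den, rounding up and keeping it even. */
    static unsigned int scale_dim(unsigned int dim,
                                  unsigned int num, unsigned int den)
    {
        unsigned int d = (dim * den + num - 1) / num;  /* ceiling division */

        if (d & 1)
            d++;                                       /* keep dimensions even */

        return d;
    }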