-rw-r--r--  LICENSE | 9
-rw-r--r--  args.c | 2
-rwxr-xr-x  build/make/ads2gas.pl | 13
-rwxr-xr-x  build/make/ads2gas_apple.pl | 19
-rwxr-xr-x  build/make/configure.sh | 21
-rwxr-xr-x  configure | 7
-rw-r--r--  examples.mk | 11
-rw-r--r--  third_party/libyuv/README.webm | 17
-rw-r--r--  third_party/libyuv/include/libyuv/basic_types.h | 73
-rw-r--r--  third_party/libyuv/include/libyuv/cpu_id.h | 49
-rw-r--r--  third_party/libyuv/include/libyuv/scale.h | 70
-rw-r--r--  third_party/libyuv/source/cpu_id.c | 81
-rw-r--r--  third_party/libyuv/source/row.h | 264
-rw-r--r--  third_party/libyuv/source/scale.c | 3884
-rw-r--r--  usage.dox | 1
-rw-r--r--  usage_cx.dox | 3
-rw-r--r--  vp8/common/arm/arm_systemdependent.c | 15
-rw-r--r--  vp8/common/arm/armv6/dequant_idct_v6.asm (renamed from vp8/decoder/arm/armv6/dequant_idct_v6.asm) | 0
-rw-r--r--  vp8/common/arm/armv6/dequantize_v6.asm (renamed from vp8/decoder/arm/armv6/dequantize_v6.asm) | 0
-rw-r--r--  vp8/common/arm/armv6/idct_blk_v6.c (renamed from vp8/decoder/arm/armv6/idct_blk_v6.c) | 43
-rw-r--r--  vp8/common/arm/armv6/iwalsh_v6.asm | 124
-rw-r--r--  vp8/common/arm/dequantize_arm.c (renamed from vp8/decoder/arm/dequantize_arm.c) | 3
-rw-r--r--  vp8/common/arm/dequantize_arm.h (renamed from vp8/decoder/arm/dequantize_arm.h) | 28
-rw-r--r--  vp8/common/arm/idct_arm.h | 6
-rw-r--r--  vp8/common/arm/neon/dequant_idct_neon.asm (renamed from vp8/decoder/arm/neon/dequant_idct_neon.asm) | 0
-rw-r--r--  vp8/common/arm/neon/dequantizeb_neon.asm (renamed from vp8/decoder/arm/neon/dequantizeb_neon.asm) | 0
-rw-r--r--  vp8/common/arm/neon/idct_blk_neon.c (renamed from vp8/decoder/arm/neon/idct_blk_neon.c) | 37
-rw-r--r--  vp8/common/arm/neon/idct_dequant_0_2x_neon.asm (renamed from vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm) | 0
-rw-r--r--  vp8/common/arm/neon/idct_dequant_full_2x_neon.asm (renamed from vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm) | 0
-rw-r--r--  vp8/common/arm/neon/iwalsh_neon.asm | 37
-rw-r--r--  vp8/common/bigend.h | 32
-rw-r--r--  vp8/common/blockd.h | 15
-rw-r--r--  vp8/common/common.h | 2
-rw-r--r--  vp8/common/dequantize.c (renamed from vp8/decoder/dequantize.c) | 19
-rw-r--r--  vp8/common/dequantize.h (renamed from vp8/decoder/dequantize.h) | 27
-rw-r--r--  vp8/common/dma_desc.h | 125
-rw-r--r--  vp8/common/duck_io.h | 116
-rw-r--r--  vp8/common/findnearmv.h | 8
-rw-r--r--  vp8/common/g_common.h | 21
-rw-r--r--  vp8/common/generic/systemdependent.c | 9
-rw-r--r--  vp8/common/idct.h | 4
-rw-r--r--  vp8/common/idct_blk.c (renamed from vp8/decoder/idct_blk.c) | 27
-rw-r--r--  vp8/common/idctllm.c | 20
-rw-r--r--  vp8/common/invtrans.c | 70
-rw-r--r--  vp8/common/invtrans.h | 65
-rw-r--r--  vp8/common/littlend.h | 33
-rw-r--r--  vp8/common/onyx.h | 54
-rw-r--r--  vp8/common/onyxc_int.h | 2
-rw-r--r--  vp8/common/onyxd.h | 20
-rw-r--r--  vp8/common/ppc/systemdependent.c | 1
-rw-r--r--  vp8/common/reconinter.c | 77
-rw-r--r--  vp8/common/reconinter.h | 8
-rw-r--r--  vp8/common/type_aliases.h | 117
-rw-r--r--  vp8/common/x86/dequantize_mmx.asm (renamed from vp8/decoder/x86/dequantize_mmx.asm) | 201
-rw-r--r--  vp8/common/x86/dequantize_x86.h (renamed from vp8/decoder/x86/dequantize_x86.h) | 12
-rw-r--r--  vp8/common/x86/filter_x86.c | 35
-rw-r--r--  vp8/common/x86/filter_x86.h | 19
-rw-r--r--  vp8/common/x86/idct_blk_mmx.c (renamed from vp8/decoder/x86/idct_blk_mmx.c) | 41
-rw-r--r--  vp8/common/x86/idct_blk_sse2.c (renamed from vp8/decoder/x86/idct_blk_sse2.c) | 39
-rw-r--r--  vp8/common/x86/idct_x86.h | 4
-rw-r--r--  vp8/common/x86/iwalsh_mmx.asm | 219
-rw-r--r--  vp8/common/x86/iwalsh_sse2.asm | 148
-rw-r--r--  vp8/common/x86/loopfilter_sse2.asm | 199
-rw-r--r--  vp8/common/x86/subpixel_mmx.asm | 45
-rw-r--r--  vp8/common/x86/subpixel_sse2.asm | 16
-rw-r--r--  vp8/common/x86/subpixel_x86.h | 2
-rw-r--r--  vp8/common/x86/vp8_asm_stubs.c | 2
-rw-r--r--  vp8/common/x86/x86_systemdependent.c | 12
-rw-r--r--  vp8/decoder/arm/arm_dsystemdependent.c | 16
-rw-r--r--  vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm | 213
-rw-r--r--  vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm | 75
-rw-r--r--  vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm | 208
-rw-r--r--  vp8/decoder/decodframe.c | 249
-rw-r--r--  vp8/decoder/detokenize.c | 12
-rw-r--r--  vp8/decoder/error_concealment.c | 38
-rw-r--r--  vp8/decoder/generic/dsystemdependent.c | 10
-rw-r--r--  vp8/decoder/onyxd_if.c | 63
-rw-r--r--  vp8/decoder/onyxd_int.h | 10
-rw-r--r--  vp8/decoder/threading.c | 105
-rw-r--r--  vp8/decoder/x86/x86_dsystemdependent.c | 43
-rw-r--r--  vp8/encoder/arm/armv5te/boolhuff_armv5te.asm | 32
-rw-r--r--  vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm | 42
-rw-r--r--  vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm | 35
-rw-r--r--  vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm | 104
-rw-r--r--  vp8/encoder/arm/armv6/vp8_subtract_armv6.asm | 63
-rw-r--r--  vp8/encoder/arm/boolhuff_arm.c | 9
-rw-r--r--  vp8/encoder/arm/neon/subtract_neon.asm | 154
-rw-r--r--  vp8/encoder/arm/variance_arm.c | 2
-rw-r--r--  vp8/encoder/asm_enc_offsets.c | 4
-rw-r--r--  vp8/encoder/bitstream.c | 228
-rw-r--r--  vp8/encoder/bitstream.h | 18
-rw-r--r--  vp8/encoder/block.h | 5
-rw-r--r--  vp8/encoder/encodeframe.c | 131
-rw-r--r--  vp8/encoder/encodeintra.c | 33
-rw-r--r--  vp8/encoder/encodemb.c | 40
-rw-r--r--  vp8/encoder/encodemb.h | 7
-rw-r--r--  vp8/encoder/encodemv.c | 2
-rw-r--r--  vp8/encoder/ethreading.c | 12
-rw-r--r--  vp8/encoder/firstpass.c | 126
-rw-r--r--  vp8/encoder/lookahead.c | 6
-rw-r--r--  vp8/encoder/lookahead.h | 2
-rw-r--r--  vp8/encoder/mcomp.c | 30
-rw-r--r--  vp8/encoder/mr_dissim.c | 201
-rw-r--r--  vp8/encoder/mr_dissim.h (renamed from vp8/common/common_types.h) | 9
-rw-r--r--  vp8/encoder/onyx_if.c | 1049
-rw-r--r--  vp8/encoder/onyx_int.h | 63
-rw-r--r--  vp8/encoder/pickinter.c | 542
-rw-r--r--  vp8/encoder/pickinter.h | 6
-rw-r--r--  vp8/encoder/picklpf.c | 106
-rw-r--r--  vp8/encoder/quantize.c | 89
-rw-r--r--  vp8/encoder/ratectrl.c | 35
-rw-r--r--  vp8/encoder/rdopt.c | 266
-rw-r--r--  vp8/encoder/rdopt.h | 51
-rw-r--r--  vp8/encoder/temporal_filter.c | 5
-rw-r--r--  vp8/encoder/tokenize.c | 20
-rw-r--r--  vp8/encoder/x86/subtract_mmx.asm | 413
-rw-r--r--  vp8/encoder/x86/subtract_sse2.asm | 351
-rw-r--r--  vp8/encoder/x86/variance_mmx.c | 37
-rw-r--r--  vp8/encoder/x86/variance_sse2.c | 13
-rw-r--r--  vp8/vp8_common.mk | 30
-rw-r--r--  vp8/vp8_cx_iface.c | 92
-rw-r--r--  vp8/vp8_dx_iface.c | 10
-rw-r--r--  vp8/vp8cx.mk | 2
-rw-r--r--  vp8/vp8dx.mk | 7
-rw-r--r--  vp8/vp8dx_arm.mk | 17
-rw-r--r--  vp8_multi_resolution_encoder.c | 463
-rw-r--r--  vpx/internal/vpx_codec_internal.h | 25
-rw-r--r--  vpx/src/vpx_decoder.c | 2
-rw-r--r--  vpx/src/vpx_encoder.c | 115
-rw-r--r--  vpx/src/vpx_image.c | 53
-rw-r--r--  vpx/vpx_encoder.h | 44
-rw-r--r--  vpx/vpx_image.h | 3
-rw-r--r--  vpx/vpx_integer.h | 10
-rw-r--r--  vpxdec.c | 2
-rw-r--r--  vpxenc.c | 29
135 files changed, 8335 insertions, 4635 deletions
diff --git a/LICENSE b/LICENSE
index 7a6f99547..1ce44343c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2010, Google Inc. All rights reserved.
+Copyright (c) 2010, The WebM Project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -12,9 +12,10 @@ met:
the documentation and/or other materials provided with the
distribution.
- * Neither the name of Google nor the names of its contributors may
- be used to endorse or promote products derived from this software
- without specific prior written permission.
+ * Neither the name of Google, nor the WebM Project, nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
diff --git a/args.c b/args.c
index 7b2cc3a10..37ba77884 100644
--- a/args.c
+++ b/args.c
@@ -57,7 +57,7 @@ int arg_match(struct arg *arg_, const struct arg_def *def, char **argv)
}
else if (def->long_name)
{
- int name_len = strlen(def->long_name);
+ const size_t name_len = strlen(def->long_name);
if (strlen(arg.argv[0]) >= name_len + 2
&& arg.argv[0][1] == '-'
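A minimal sketch of why this change matters (illustrative only, not part of
the patch; matches_long_name is a hypothetical helper): strlen() returns
size_t, so storing the result in an int can truncate on LP64 targets and
provokes signed/unsigned comparison warnings in the length check above.

#include <string.h>

static int matches_long_name(const char *arg, const char *long_name)
{
    const size_t name_len = strlen(long_name);  /* was: int name_len */

    /* "--<name>" needs at least name_len + 2 characters */
    return strlen(arg) >= name_len + 2
           && arg[0] == '-' && arg[1] == '-'
           && !strncmp(arg + 2, long_name, name_len);
}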
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index 388133aa2..cea967f93 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -126,15 +126,14 @@ while (<STDIN>)
# ALIGN directive
s/ALIGN/.balign/g;
- # Strip ARM
- s/\sARM/@ ARM/g;
+ # ARM code
+ s/\sARM/.arm/g;
- # Strip REQUIRE8
- #s/\sREQUIRE8/@ REQUIRE8/g;
- s/\sREQUIRE8/@ /g; #EQU cause problem
+ # REQUIRE8: stack is required to be 8-byte aligned
+ s/\sREQUIRE8/.eabi_attribute Tag_ABI_align_needed, 1/g;
- # Strip PRESERVE8
- s/\sPRESERVE8/@ PRESERVE8/g;
+ # PRESERVE8: 8-byte stack alignment is preserved
+ s/\sPRESERVE8/.eabi_attribute Tag_ABI_align_preserved, 1/g;
# Use PROC and ENDP to give the symbols a .size directive.
# This makes them show up properly in debugging tools like gdb and valgrind.
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl
index 78f4a97f5..81280bf78 100755
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -30,6 +30,8 @@ my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8
my @incoming_array;
+my @imported_functions;
+
# Perl trim function to remove whitespace from the start and end of the string
sub trim($)
{
@@ -132,7 +134,18 @@ while (<STDIN>)
# Make function visible to linker, and make additional symbol with
# prepended underscore
s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
- s/IMPORT\s+\|([\$\w]*)\|/.globl $1/;
+
+ # Prepend imported functions with _
+ if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/)
+ {
+ $function = trim($1);
+ push(@imported_functions, $function);
+ }
+
+ foreach $function (@imported_functions)
+ {
+ s/$function/_$function/;
+ }
# No vertical bars required; make additional symbol with prepended
# underscore
@@ -157,8 +170,8 @@ while (<STDIN>)
s/\sPRESERVE8/@ PRESERVE8/g;
# Strip PROC and ENDPROC
- s/PROC/@/g;
- s/ENDP/@/g;
+ s/\bPROC\b/@/g;
+ s/\bENDP\b/@/g;
# EQU directive
s/(.*)EQU(.*)/.set $1, $2/;
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 1279f781a..0426f9220 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -561,6 +561,10 @@ process_common_toolchain() {
tgt_isa=x86_64
tgt_os=darwin10
;;
+ *darwin11*)
+ tgt_isa=x86_64
+ tgt_os=darwin11
+ ;;
*mingw32*|*cygwin*)
[ -z "$tgt_isa" ] && tgt_isa=x86
tgt_os=win32
@@ -617,6 +621,9 @@ process_common_toolchain() {
if [ -d "/Developer/SDKs/MacOSX10.6.sdk" ]; then
osx_sdk_dir="/Developer/SDKs/MacOSX10.6.sdk"
fi
+ if [ -d "/Developer/SDKs/MacOSX10.7.sdk" ]; then
+ osx_sdk_dir="/Developer/SDKs/MacOSX10.7.sdk"
+ fi
case ${toolchain} in
*-darwin8-*)
@@ -637,6 +644,12 @@ process_common_toolchain() {
add_ldflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-mmacosx-version-min=10.6"
;;
+ *-darwin11-*)
+ add_cflags "-isysroot ${osx_sdk_dir}"
+ add_cflags "-mmacosx-version-min=10.7"
+ add_ldflags "-isysroot ${osx_sdk_dir}"
+ add_ldflags "-mmacosx-version-min=10.7"
+ ;;
esac
# Handle Solaris variants. Solaris 10 needs -lposix4
@@ -732,7 +745,7 @@ process_common_toolchain() {
TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
CC=${TOOLCHAIN_PATH}/gcc
AR=${TOOLCHAIN_PATH}/ar
- LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1
+ LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-llvm-gcc-4.2
AS=${TOOLCHAIN_PATH}/as
STRIP=${TOOLCHAIN_PATH}/strip
NM=${TOOLCHAIN_PATH}/nm
@@ -746,13 +759,13 @@ process_common_toolchain() {
add_cflags -arch ${tgt_isa}
add_ldflags -arch_only ${tgt_isa}
- add_cflags "-isysroot ${SDK_PATH}/SDKs/iPhoneOS4.3.sdk"
+ add_cflags "-isysroot ${SDK_PATH}/SDKs/iPhoneOS5.0.sdk"
# This should be overridable
- alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.3.sdk
+ alt_libc=${SDK_PATH}/SDKs/iPhoneOS5.0.sdk
# Add the paths for the alternate libc
- for d in usr/include usr/include/gcc/darwin/4.2/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
+ for d in usr/include; do
try_dir="${alt_libc}/${d}"
[ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
done
diff --git a/configure b/configure
index e6fcc87da..6f20c6b77 100755
--- a/configure
+++ b/configure
@@ -35,7 +35,7 @@ Advanced options:
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
${toggle_mem_tracker} track memory usage
${toggle_postproc} postprocessing
- ${toggle_multithread} multithreaded encoding and decoding.
+ ${toggle_multithread} multithreaded encoding and decoding
${toggle_spatial_resampling} spatial sampling (scaling) support
${toggle_realtime_only} enable this option while building for real-time encoding
${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses
@@ -44,6 +44,7 @@ Advanced options:
${toggle_static} static library support
${toggle_small} favor smaller size over speed
${toggle_postproc_visualizer} macro block / block level visualizers
+ ${toggle_multi_res_encoding} enable multiple-resolution encoding
Codecs:
Codecs can be selectively enabled or disabled individually, or by family:
@@ -118,9 +119,11 @@ all_platforms="${all_platforms} x86-win32-vs8"
all_platforms="${all_platforms} x86-win32-vs9"
all_platforms="${all_platforms} x86_64-darwin9-gcc"
all_platforms="${all_platforms} x86_64-darwin10-gcc"
+all_platforms="${all_platforms} x86_64-darwin11-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
all_platforms="${all_platforms} x86_64-solaris-gcc"
+all_platforms="${all_platforms} x86_64-win64-gcc"
all_platforms="${all_platforms} x86_64-win64-vs8"
all_platforms="${all_platforms} x86_64-win64-vs9"
all_platforms="${all_platforms} universal-darwin8-gcc"
@@ -261,6 +264,7 @@ CONFIG_LIST="
postproc_visualizer
os_support
unit_tests
+ multi_res_encoding
"
CMDLINE_SELECT="
extra_warnings
@@ -303,6 +307,7 @@ CMDLINE_SELECT="
small
postproc_visualizer
unit_tests
+ multi_res_encoding
"
process_cmdline() {
diff --git a/examples.mk b/examples.mk
index 8088d3217..f6c904588 100644
--- a/examples.mk
+++ b/examples.mk
@@ -96,6 +96,17 @@ GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c
vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame
+# C file is provided, not generated automatically.
+GEN_EXAMPLES-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c
+vp8_multi_resolution_encoder.SRCS \
+ += third_party/libyuv/include/libyuv/basic_types.h \
+ third_party/libyuv/include/libyuv/cpu_id.h \
+ third_party/libyuv/include/libyuv/scale.h \
+ third_party/libyuv/source/row.h \
+ third_party/libyuv/source/scale.c \
+ third_party/libyuv/source/cpu_id.c
+vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de
+vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
# Handle extra library flags depending on codec configuration
diff --git a/third_party/libyuv/README.webm b/third_party/libyuv/README.webm
new file mode 100644
index 000000000..d3495caa1
--- /dev/null
+++ b/third_party/libyuv/README.webm
@@ -0,0 +1,17 @@
+Name: libyuv
+URL: http://code.google.com/p/libyuv/
+Version: 102
+License: BSD
+License File: LICENSE
+
+Description:
+libyuv is an open source project that includes YUV conversion and scaling
+functionality.
+
+The optimized scaler in libyuv is used in the multiple-resolution encoder
+example, which down-samples the original input video (e.g. 1280x720) a number
+of times in order to encode multiple-resolution bit streams.
+
+Local Modifications:
+Modified the original scaler code from C++ to C to fit in our current build
+system. This is a temporary solution and will be improved later.
\ No newline at end of file
diff --git a/third_party/libyuv/include/libyuv/basic_types.h b/third_party/libyuv/include/libyuv/basic_types.h
new file mode 100644
index 000000000..30504ce66
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/basic_types.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
+
+#include <stddef.h> // for NULL, size_t
+
+#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
+#include <stdint.h> // for uintptr_t
+#endif
+
+#ifndef INT_TYPES_DEFINED
+#define INT_TYPES_DEFINED
+#ifdef COMPILER_MSVC
+typedef unsigned __int64 uint64;
+typedef __int64 int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## I64
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UI64
+#endif
+#define INT64_F "I64"
+#else // COMPILER_MSVC
+#ifdef __LP64__
+typedef unsigned long uint64;
+typedef long int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UL
+#endif
+#define INT64_F "l"
+#else // __LP64__
+typedef unsigned long long uint64;
+typedef long long int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## LL
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## ULL
+#endif
+#define INT64_F "ll"
+#endif // __LP64__
+#endif // COMPILER_MSVC
+typedef unsigned int uint32;
+typedef int int32;
+typedef unsigned short uint16;
+typedef short int16;
+typedef unsigned char uint8;
+typedef char int8;
+#endif // INT_TYPES_DEFINED
+
+// Detect whether the compiler targets x86 or x64.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86)
+#define CPU_X86 1
+#endif
+
+#define ALIGNP(p, t) \
+ ((uint8*)((((uintptr_t)(p) + \
+ ((t)-1)) & ~((t)-1))))
+
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
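A minimal usage sketch for ALIGNP (illustrative only, not part of the patch;
alloc_aligned_row is a hypothetical helper): over-allocate a buffer, then use
ALIGNP to obtain a 16-byte-aligned pointer suitable for the aligned SIMD
loads/stores used by the row functions later in this change. The raw pointer
must be kept around for free().

#include <stdlib.h>
#include "third_party/libyuv/include/libyuv/basic_types.h"

static uint8* alloc_aligned_row(int width, uint8** raw_out)
{
    uint8* raw = (uint8*)malloc(width + 15);  /* worst-case padding */
    *raw_out = raw;                           /* caller frees this one */
    return raw ? ALIGNP(raw, 16) : NULL;      /* 16-byte-aligned view */
}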
diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h
new file mode 100644
index 000000000..4a53b5bef
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/cpu_id.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#define INCLUDE_LIBYUV_CPU_ID_H_
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// These flags are only valid on x86 processors
+static const int kCpuHasSSE2 = 1;
+static const int kCpuHasSSSE3 = 2;
+
+// These flags are only valid on ARM processors
+static const int kCpuHasNEON = 4;
+
+// Internal flag to indicate cpuid is initialized.
+static const int kCpuInitialized = 8;
+
+// Detect whether the CPU has SSE2, etc.
+// The test_flag parameter should be one of the kCpuHas constants above.
+// Returns non-zero if the instruction set is detected.
+static __inline int TestCpuFlag(int test_flag) {
+ extern int cpu_info_;
+ extern int InitCpuFlags();
+ return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
+}
+
+// For testing, allow CPU flags to be disabled.
+// e.g. MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// -1 to enable all cpu specific optimizations.
+// 0 to disable all cpu specific optimizations.
+void MaskCpuFlags(int enable_flags);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CPU_ID_H_
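A minimal usage sketch for this header (illustrative only, not part of the
patch): dispatch to an optimized code path based on the detected CPU
features, which is the pattern scale.c follows below.

#include "third_party/libyuv/include/libyuv/cpu_id.h"

static void pick_row_function(void)
{
    if (TestCpuFlag(kCpuHasSSSE3)) {
        /* use the SSSE3 row functions */
    } else if (TestCpuFlag(kCpuHasNEON)) {
        /* use the NEON row functions */
    } else {
        /* fall back to the portable C row functions */
    }
}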
diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h
new file mode 100644
index 000000000..21fe360ce
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/scale.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering
+typedef enum {
+ kFilterNone = 0, // Point sample; Fastest
+ kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
+ kFilterBox = 2 // Highest quality
+} FilterMode;
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce even better
+// image quality, at further expense of speed.
+// Returns 0 if successful.
+
+int I420Scale(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ FilterMode filtering);
+
+// Legacy API. Deprecated
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+ int src_stride_y, int src_stride_u, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, uint8* dst_u, uint8* dst_v,
+ int dst_stride_y, int dst_stride_u, int dst_stride_v,
+ int dst_width, int dst_height,
+ int interpolate);
+
+// Legacy API. Deprecated
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+ uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+ int interpolate);
+
+// For testing, allow disabling of optimizations.
+void SetUseReferenceImpl(int use);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_H_
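A minimal usage sketch for I420Scale (illustrative only, not part of the
patch): halve a tightly packed 1280x720 I420 frame to 640x360 with box
filtering. For I420, the chroma strides are half the luma stride, and each
parameter pair is a plane pointer followed by its stride.

#include "third_party/libyuv/include/libyuv/scale.h"

static int halve_i420(const uint8* src_y, const uint8* src_u,
                      const uint8* src_v, uint8* dst_y,
                      uint8* dst_u, uint8* dst_v)
{
    return I420Scale(src_y, 1280, src_u, 640, src_v, 640,
                     1280, 720,              /* source dimensions */
                     dst_y, 640, dst_u, 320, dst_v, 320,
                     640, 360,               /* destination dimensions */
                     kFilterBox);            /* highest quality */
}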
diff --git a/third_party/libyuv/source/cpu_id.c b/third_party/libyuv/source/cpu_id.c
new file mode 100644
index 000000000..fccf3dd44
--- /dev/null
+++ b/third_party/libyuv/source/cpu_id.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#ifdef __ANDROID__
+#include <cpu-features.h>
+#endif
+
+#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86
+
+// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
+#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
+static inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile (
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type)
+ );
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile (
+ "cpuid \n"
+ : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type)
+ );
+}
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// CPU detect function for SIMD instruction sets.
+int cpu_info_ = 0;
+
+int InitCpuFlags() {
+#ifdef CPU_X86
+ int cpu_info[4];
+ __cpuid(cpu_info, 1);
+ cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
+ (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
+ kCpuInitialized;
+#elif defined(__ANDROID__) && defined(__ARM_NEON__)
+ uint64_t features = android_getCpuFeatures();
+ cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
+ kCpuInitialized;
+#elif defined(__ARM_NEON__)
+ // gcc -mfpu=neon defines __ARM_NEON__
+ // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
+ // to disable Neon on devices that do not have it.
+ cpu_info_ = kCpuHasNEON | kCpuInitialized;
+#else
+ cpu_info_ = kCpuInitialized;
+#endif
+ return cpu_info_;
+}
+
+void MaskCpuFlags(int enable_flags) {
+ InitCpuFlags();
+ cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
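A minimal sketch (illustrative only, not part of the patch) of the two
testing hooks these files expose, useful for comparing the optimized paths
against the portable ones:

#include "third_party/libyuv/include/libyuv/cpu_id.h"
#include "third_party/libyuv/include/libyuv/scale.h"

static void force_portable_paths(void)
{
    MaskCpuFlags(0);         /* disable all CPU-specific optimizations */
    SetUseReferenceImpl(1);  /* scale.c: use the reference ScalePlaneBox() */
}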
diff --git a/third_party/libyuv/source/row.h b/third_party/libyuv/source/row.h
new file mode 100644
index 000000000..eabe18094
--- /dev/null
+++ b/third_party/libyuv/source/row.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBYUV_SOURCE_ROW_H_
+#define LIBYUV_SOURCE_ROW_H_
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+
+#define kMaxStride (2048 * 4)
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+
+#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
+#define YUV_DISABLE_ASM
+#endif
+
+#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#define HAS_FASTCONVERTYUVTOARGBROW_NEON
+void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+#define HAS_FASTCONVERTYUVTOBGRAROW_NEON
+void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+#define HAS_FASTCONVERTYUVTOABGRROW_NEON
+void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+#endif
+
+// The following are available on all x86 platforms
+#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
+ !defined(YUV_DISABLE_ASM)
+#define HAS_ABGRTOARGBROW_SSSE3
+#define HAS_BGRATOARGBROW_SSSE3
+#define HAS_BG24TOARGBROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOUVROW_SSSE3
+#define HAS_RAWTOUVROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_FASTCONVERTYTOARGBROW_SSE2
+#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
+#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
+#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
+#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
+#define HAS_REVERSE_ROW_SSSE3
+#endif
+
+// The following are available on Neon platforms
+#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#define HAS_REVERSE_ROW_NEON
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+#endif
+#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
+#define HASRGB24TOYROW_SSSE3
+#endif
+#ifdef HASRGB24TOYROW_SSSE3
+void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+#endif
+#ifdef HAS_REVERSE_ROW_SSSE3
+void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
+#endif
+#ifdef HAS_REVERSE_ROW_NEON
+void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
+#endif
+void ReverseRow_C(const uint8* src, uint8* dst, int width);
+
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+#ifdef HAS_BG24TOARGBROW_SSSE3
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
+void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
+#endif
+void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
+void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
+void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+#endif
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+typedef __declspec(align(16)) signed char vec8[16];
+typedef __declspec(align(16)) unsigned char uvec8[16];
+typedef __declspec(align(16)) signed short vec16[8];
+#else // __GNUC__
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+typedef signed char __attribute__((vector_size(16))) vec8;
+typedef unsigned char __attribute__((vector_size(16))) uvec8;
+typedef signed short __attribute__((vector_size(16))) vec16;
+#endif
+
+//extern "C"
+SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
+//extern "C"
+SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
+//extern "C"
+SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
+
+void FastConvertYUVToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUVToBGRARow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUVToABGRRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYToARGBRow_C(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width);
+
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
+void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width);
+#endif
+
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
+void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+#endif
+
+#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
+void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width);
+
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // LIBYUV_SOURCE_ROW_H_
diff --git a/third_party/libyuv/source/scale.c b/third_party/libyuv/source/scale.c
new file mode 100644
index 000000000..930a7ae09
--- /dev/null
+++ b/third_party/libyuv/source/scale.c
@@ -0,0 +1,3884 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "third_party/libyuv/source/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+/*
+ * Note: Defining YUV_DISABLE_ASM allows using the C version.
+ */
+//#define YUV_DISABLE_ASM
+
+#if defined(_MSC_VER)
+#define ALIGN16(var) __declspec(align(16)) var
+#else
+#define ALIGN16(var) var __attribute__((aligned(16)))
+#endif
+
+// Note: A Neon reference manual
+// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
+// Note: Some SSE2 reference manuals
+// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
+
+// Set the following flag to true to revert to only
+// using the reference implementation ScalePlaneBox(), and
+// NOT the optimized versions. Useful for debugging and
+// when comparing the quality of the resulting YUV planes
+// as produced by the optimized and non-optimized versions.
+
+static int use_reference_impl_ = 0;
+
+void SetUseReferenceImpl(int use) {
+ use_reference_impl_ = use;
+}
+
+// ScaleRowDown2Int also used by planar functions
+
+/**
+ * NEON downscalers with interpolation.
+ *
+ * Provided by Fritz Koenig
+ *
+ */
+
+#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#define HAS_SCALEROWDOWN2_NEON
+void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1
+ "vst1.u8 {q0}, [%1]! \n" // store even pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bhi 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "add %1, %0 \n" // change the stride to row 2 pointer
+ "1: \n"
+ "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment
+ "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.u8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "bhi 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define HAS_SCALEROWDOWN4_NEON
+static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ "vld2.u8 {d0, d1}, [%0]! \n"
+ "vtrn.u8 d1, d0 \n"
+ "vshrn.u16 d0, q0, #8 \n"
+ "vst1.u32 {d0[1]}, [%1]! \n"
+
+ "subs %2, #4 \n"
+ "bhi 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc"
+ );
+}
+
+static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "add r4, %0, %3 \n"
+ "add r5, r4, %3 \n"
+ "add %3, r5, %3 \n"
+ "1: \n"
+ "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data
+ "vld1.u8 {q1}, [r4]! \n"
+ "vld1.u8 {q2}, [r5]! \n"
+ "vld1.u8 {q3}, [%3]! \n"
+
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+
+ "vpaddl.u16 q0, q0 \n"
+
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+
+ "vmovn.u16 d0, q0 \n"
+ "vst1.u32 {d0[0]}, [%1]! \n"
+
+ "subs %2, #4 \n"
+ "bhi 1b \n"
+
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stride) // %3
+ : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+ );
+}
+
+#define HAS_SCALEROWDOWN34_NEON
+// Downscale from 4 to 3 pixels. Use the NEON multi-lane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vmov d2, d3 \n" // order needs to be d0, d1, d2
+ "vst3.u8 {d0, d1, d2}, [%1]! \n"
+ "subs %2, #24 \n"
+ "bhi 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc"
+ );
+}
+
+static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ "vst3.u8 {d0, d1, d2}, [%1]! \n"
+
+ "subs %2, #24 \n"
+ "bhi 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ );
+}
+
+static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ "vst3.u8 {d0, d1, d2}, [%1]! \n"
+
+ "subs %2, #24 \n"
+ "bhi 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ );
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+const uint8 shuf38[16] __attribute__ ((aligned(16))) =
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
+ { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+static void ScaleRowDown38_NEON(const uint8* src_ptr, int /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.u8 {q3}, [%3] \n"
+ "1: \n"
+ "vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.u8 {d4}, [%1]! \n"
+ "vst1.u32 {d5[0]}, [%1]! \n"
+ "subs %2, #12 \n"
+ "bhi 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(shuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ );
+}
+
+// 32x3 -> 12x1
+static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.u16 {q13}, [%4] \n"
+ "vld1.u8 {q14}, [%5] \n"
+ "vld1.u8 {q15}, [%6] \n"
+ "add r4, %0, %3, lsl #1 \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.u8 {d3}, [%1]! \n"
+ "vst1.u32 {d4[0]}, [%1]! \n"
+ "subs %2, #12 \n"
+ "bhi 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(mult38_div6), // %4
+ "r"(shuf38_2), // %5
+ "r"(mult38_div9) // %6
+ : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q13", "q14", "q15", "memory", "cc"
+ );
+}
+
+// 32x2 -> 12x1
+static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.u16 {q13}, [%4] \n"
+ "vld1.u8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.u8 {d3}, [%1]! \n"
+ "vst1.u32 {d4[0]}, [%1]! \n"
+ "subs %2, #12 \n"
+ "bhi 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(mult38_div6), // %4
+ "r"(shuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ );
+}
+
+/**
+ * SSE2 downscalers with interpolation.
+ *
+ * Provided by Frank Barchard (fbarchard@google.com)
+ *
+ */
+
+// Constants for SSE2 code
+#elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \
+ !defined(YUV_DISABLE_ASM)
+#if defined(_MSC_VER)
+#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
+#elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
+#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+#else
+#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
+#endif
+
+#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \
+ defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".globl _" #name " \n" \
+"_" #name ": \n"
+#else
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".global " #name " \n" \
+#name ": \n"
+#endif
+
+
+// Offsets for source bytes 0 to 9
+//extern "C"
+TALIGN16(const uint8, shuf0[16]) =
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+//extern "C"
+TALIGN16(const uint8, shuf1[16]) =
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+//extern "C"
+TALIGN16(const uint8, shuf2[16]) =
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+//extern "C"
+TALIGN16(const uint8, shuf01[16]) =
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+//extern "C"
+TALIGN16(const uint8, shuf11[16]) =
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+//extern "C"
+TALIGN16(const uint8, shuf21[16]) =
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+//extern "C"
+TALIGN16(const uint8, madd01[16]) =
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+//extern "C"
+TALIGN16(const uint8, madd11[16]) =
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+//extern "C"
+TALIGN16(const uint8, madd21[16]) =
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Coefficients for source bytes 21 to 31
+//extern "C"
+TALIGN16(const int16, round34[8]) =
+ { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+//extern "C"
+TALIGN16(const uint8, shuf38a[16]) =
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+//extern "C"
+TALIGN16(const uint8, shuf38b[16]) =
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+//extern "C"
+TALIGN16(const uint8, shufac0[16]) =
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+//extern "C"
+TALIGN16(const uint8, shufac3[16]) =
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+//extern "C"
+TALIGN16(const uint16, scaleac3[8]) =
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+//extern "C"
+TALIGN16(const uint8, shufab0[16]) =
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+//extern "C"
+TALIGN16(const uint8, shufab1[16]) =
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+//extern "C"
+TALIGN16(const uint8, shufab2[16]) =
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+//extern "C"
+TALIGN16(const uint16, scaleab2[8]) =
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+#endif
+
+#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+
+#define HAS_SCALEROWDOWN2_SSE2
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked)
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ ja wloop
+
+ ret
+ }
+}
+// Blends 32x2 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked)
+void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ ja wloop
+
+ pop esi
+ ret
+ }
+}
+
+#define HAS_SCALEROWDOWN4_SSE2
+// Point samples 32 pixels to 8 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ // src_stride ignored
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
+ psrld xmm5, 24
+
+ wloop:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + 16]
+ lea esi, [esi + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+ sub ecx, 8
+ ja wloop
+
+ popad
+ ret
+ }
+}
+
+// Blends 32x4 rectangle to 8x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ mov ebx, [esp + 32 + 8] // src_stride
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
+ psrlw xmm7, 8
+ lea edx, [ebx + ebx * 2] // src_stride * 3
+
+ wloop:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + 16]
+ movdqa xmm2, [esi + ebx]
+ movdqa xmm3, [esi + ebx + 16]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, [esi + ebx * 2]
+ movdqa xmm3, [esi + ebx * 2 + 16]
+ movdqa xmm4, [esi + edx]
+ movdqa xmm5, [esi + edx + 16]
+ lea esi, [esi + 32]
+ pavgb xmm2, xmm4
+ pavgb xmm3, xmm5
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm7
+ pand xmm3, xmm7
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
+ psrlw xmm0, 8
+ pand xmm2, xmm7
+ pavgw xmm0, xmm2
+ packuswb xmm0, xmm0
+
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+ sub ecx, 8
+ ja wloop
+
+ popad
+ ret
+ }
+}
+
+#define HAS_SCALEROWDOWN8_SSE2
+// Point samples 32 pixels to 4 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
+__declspec(naked)
+static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ // src_stride ignored
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
+ psrlq xmm5, 56
+
+ wloop:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + 16]
+ lea esi, [esi + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 32->16
+ packuswb xmm0, xmm0 // 16->8
+ packuswb xmm0, xmm0 // 8->4
+ movd dword ptr [edi], xmm0
+ lea edi, [edi + 4]
+ sub ecx, 4
+ ja wloop
+
+ popad
+ ret
+ }
+}
+
+// Blends 32x8 rectangle to 4x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
+__declspec(naked)
+static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ mov ebx, [esp + 32 + 8] // src_stride
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ lea edx, [ebx + ebx * 2] // src_stride * 3
+ pxor xmm7, xmm7
+
+ wloop:
+ movdqa xmm0, [esi] // average 8 rows to 1
+ movdqa xmm1, [esi + 16]
+ movdqa xmm2, [esi + ebx]
+ movdqa xmm3, [esi + ebx + 16]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ movdqa xmm2, [esi + ebx * 2]
+ movdqa xmm3, [esi + ebx * 2 + 16]
+ movdqa xmm4, [esi + edx]
+ movdqa xmm5, [esi + edx + 16]
+ lea ebp, [esi + ebx * 4]
+ lea esi, [esi + 32]
+ pavgb xmm2, xmm4
+ pavgb xmm3, xmm5
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, [ebp]
+ movdqa xmm3, [ebp + 16]
+ movdqa xmm4, [ebp + ebx]
+ movdqa xmm5, [ebp + ebx + 16]
+ pavgb xmm2, xmm4
+ pavgb xmm3, xmm5
+ movdqa xmm4, [ebp + ebx * 2]
+ movdqa xmm5, [ebp + ebx * 2 + 16]
+ movdqa xmm6, [ebp + edx]
+ pavgb xmm4, xmm6
+ movdqa xmm6, [ebp + edx + 16]
+ pavgb xmm5, xmm6
+ pavgb xmm2, xmm4
+ pavgb xmm3, xmm5
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+
+ psadbw xmm0, xmm7 // average 32 pixels to 4
+ psadbw xmm1, xmm7
+ pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01
+ pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx
+ por xmm0, xmm1 // -> 3201
+ psrlw xmm0, 3
+ packuswb xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword ptr [edi], xmm0
+
+ lea edi, [edi + 4]
+ sub ecx, 4
+ ja wloop
+
+ popad
+ ret
+ }
+}
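+
+// Hedged C sketch of the psadbw step above (hypothetical helper, not used
+// elsewhere): psadbw against a zero register sums each group of 8 bytes
+// into one word, and psrlw by 3 divides that sum by 8.
+static uint8 AverageOf8_Sketch(const uint8* s) {
+  int sum = 0;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    sum += s[i];  // psadbw xmm0, xmm7 with xmm7 == 0
+  }
+  return (uint8)(sum >> 3);  // psrlw xmm0, 3
+}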
+
+#define HAS_SCALEROWDOWN34_SSSE3
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values; each output group of 8 bytes is
+// shuffled out of a 16 byte window of the 32 input bytes.
+
+// Note that movdqa+palignr may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ // src_stride ignored
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ movdqa xmm3, _shuf0
+ movdqa xmm4, _shuf1
+ movdqa xmm5, _shuf2
+
+ wloop:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + 16]
+ lea esi, [esi + 32]
+ movdqa xmm2, xmm1
+ palignr xmm1, xmm0, 8
+ pshufb xmm0, xmm3
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edi], xmm0
+ movq qword ptr [edi + 8], xmm1
+ movq qword ptr [edi + 16], xmm2
+ lea edi, [edi + 24]
+ sub ecx, 24
+ ja wloop
+
+ popad
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 24x1
+// Produces three 8 byte values; each output group of 8 bytes is
+// shuffled out of a 16 byte window of the 32 input bytes.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 round34
+
+// Note that movdqa+palignr may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ mov ebx, [esp + 32 + 8] // src_stride
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ movdqa xmm2, _shuf01
+ movdqa xmm3, _shuf11
+ movdqa xmm4, _shuf21
+ movdqa xmm5, _madd01
+ movdqa xmm6, _madd11
+ movdqa xmm7, _round34
+
+ wloop:
+ movdqa xmm0, [esi] // pixels 0..7
+ movdqa xmm1, [esi+ebx]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edi], xmm0
+ movdqu xmm0, [esi+8] // pixels 8..15
+ movdqu xmm1, [esi+ebx+8]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edi+8], xmm0
+ movdqa xmm0, [esi+16] // pixels 16..23
+ movdqa xmm1, [esi+ebx+16]
+ lea esi, [esi+32]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, _madd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edi+16], xmm0
+ lea edi, [edi+24]
+ sub ecx, 24
+ ja wloop
+
+ popad
+ ret
+ }
+}
+
+// Note that movdqa+palignr may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ mov ebx, [esp + 32 + 8] // src_stride
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ movdqa xmm2, _shuf01
+ movdqa xmm3, _shuf11
+ movdqa xmm4, _shuf21
+ movdqa xmm5, _madd01
+ movdqa xmm6, _madd11
+ movdqa xmm7, _round34
+
+ wloop:
+ movdqa xmm0, [esi] // pixels 0..7
+ movdqa xmm1, [esi+ebx]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edi], xmm0
+ movdqu xmm0, [esi+8] // pixels 8..15
+ movdqu xmm1, [esi+ebx+8]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edi+8], xmm0
+ movdqa xmm0, [esi+16] // pixels 16..23
+ movdqa xmm1, [esi+ebx+16]
+ lea esi, [esi+32]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, _madd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edi+16], xmm0
+ lea edi, [edi+24]
+ sub ecx, 24
+ ja wloop
+
+ popad
+ ret
+ }
+}
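+
+// Sketch of the row weighting above (a simplification that keeps the
+// per-step pavgb rounding): ScaleRowDown34_1_Int_SSSE3 blends the two rows
+// 1:1 with a single pavgb, while ScaleRowDown34_0_Int_SSSE3 runs pavgb
+// twice to weight row 0 three times as heavily.
+static uint8 BlendRows31_Sketch(uint8 r0, uint8 r1) {
+  uint8 t = (uint8)((r1 + r0 + 1) >> 1);  // pavgb xmm1, xmm0
+  return (uint8)((r0 + t + 1) >> 1);      // pavgb xmm0, xmm1 => ~(3 * r0 + r1) / 4
+}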
+
+#define HAS_SCALEROWDOWN38_SSSE3
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked)
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ mov edx, [esp + 32 + 8] // src_stride
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ movdqa xmm4, _shuf38a
+ movdqa xmm5, _shuf38b
+
+ xloop:
+ movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5
+ movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea esi, [esi + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
+ paddusb xmm0, xmm1
+
+ movq qword ptr [edi], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0
+ movd [edi + 8], xmm1
+ lea edi, [edi + 12]
+ sub ecx, 12
+ ja xloop
+
+ popad
+ ret
+ }
+}
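+
+// Per-byte C sketch of the pshufb instruction used above (illustrative
+// only): an index byte with the high bit set zeroes that lane. Assuming
+// _shuf38a and _shuf38b zero the lanes they do not fill, as their use with
+// paddusb suggests, the add above simply merges two disjoint sets of
+// point-sampled bytes.
+static void PShufB_Sketch(uint8* dst, const uint8* src, const uint8* mask) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    dst[i] = (uint8)((mask[i] & 0x80) ? 0 : src[mask[i] & 15]);
+  }
+}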
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked)
+static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ mov edx, [esp + 32 + 8] // src_stride
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ movdqa xmm4, _shufac0
+ movdqa xmm5, _shufac3
+ movdqa xmm6, _scaleac3
+ pxor xmm7, xmm7
+
+ xloop:
+ movdqa xmm0, [esi] // sum up 3 rows into xmm0/1
+ movdqa xmm2, [esi + edx]
+ movhlps xmm1, xmm0
+ movhlps xmm3, xmm2
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ paddusw xmm0, xmm2
+ paddusw xmm1, xmm3
+ movdqa xmm2, [esi + edx * 2]
+ lea esi, [esi + 16]
+ movhlps xmm3, xmm2
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ paddusw xmm0, xmm2
+ paddusw xmm1, xmm3
+
+ movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2
+ psrldq xmm0, 2
+ paddusw xmm2, xmm0
+ psrldq xmm0, 2
+ paddusw xmm2, xmm0
+ pshufb xmm2, xmm4
+
+ movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2
+ psrldq xmm1, 2
+ paddusw xmm3, xmm1
+ psrldq xmm1, 2
+ paddusw xmm3, xmm1
+ pshufb xmm3, xmm5
+ paddusw xmm2, xmm3
+
+ pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6
+ packuswb xmm2, xmm2
+
+ movd [edi], xmm2 // write 6 pixels
+ pextrw eax, xmm2, 2
+ mov [edi + 4], ax
+ lea edi, [edi + 6]
+ sub ecx, 6
+ ja xloop
+
+ popad
+ ret
+ }
+}
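+
+// The pmulhuw above divides by a constant via a 16.16 fixed-point
+// reciprocal, the same trick the C fallbacks later in this file use.
+// Sketch, assuming the _scaleac3 words hold 65536 / 9 and 65536 / 6:
+static uint8 DivideBy9_Sketch(int sum_of_nine) {
+  return (uint8)((sum_of_nine * (65536 / 9)) >> 16);  // pmulhuw: (x * recip) >> 16
+}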
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked)
+static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ mov edx, [esp + 32 + 8] // src_stride
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // dst_width
+ movdqa xmm4, _shufab0
+ movdqa xmm5, _shufab1
+ movdqa xmm6, _shufab2
+ movdqa xmm7, _scaleab2
+
+ xloop:
+ movdqa xmm2, [esi] // average 2 rows into xmm2
+ pavgb xmm2, [esi + edx]
+ lea esi, [esi + 16]
+
+ movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0
+ pshufb xmm0, xmm4
+ movdqa xmm1, xmm2
+ pshufb xmm1, xmm5
+ paddusw xmm0, xmm1
+ pshufb xmm2, xmm6
+ paddusw xmm0, xmm2
+
+ pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2
+ packuswb xmm0, xmm0
+
+ movd [edi], xmm0 // write 6 pixels
+ pextrw eax, xmm0, 2
+ mov [edi + 4], ax
+ lea edi, [edi + 6]
+ sub ecx, 6
+ ja xloop
+
+ popad
+ ret
+ }
+}
+
+#define HAS_SCALEADDROWS_SSE2
+
+// Reads 16xN bytes and produces 16 shorts at a time. The first row is
+// loaded outside the inner loop, so this appears to assume src_height >= 2.
+__declspec(naked)
+static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+ uint16* dst_ptr, int src_width,
+ int src_height) {
+ __asm {
+ pushad
+ mov esi, [esp + 32 + 4] // src_ptr
+ mov edx, [esp + 32 + 8] // src_stride
+ mov edi, [esp + 32 + 12] // dst_ptr
+ mov ecx, [esp + 32 + 16] // src_width
+ mov ebx, [esp + 32 + 20] // src_height
+ pxor xmm5, xmm5
+ dec ebx
+
+ xloop:
+ // first row
+ movdqa xmm2, [esi]
+ lea eax, [esi + edx]
+ movhlps xmm3, xmm2
+ mov ebp, ebx
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+ // sum remaining rows
+ yloop:
+ movdqa xmm0, [eax] // read 16 pixels
+ lea eax, [eax + edx] // advance to next row
+ movhlps xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ paddusw xmm2, xmm0 // sum 16 words
+ paddusw xmm3, xmm1
+ sub ebp, 1
+ ja yloop
+
+ movdqa [edi], xmm2
+ movdqa [edi + 16], xmm3
+ lea edi, [edi + 32]
+ lea esi, [esi + 16]
+
+ sub ecx, 16
+ ja xloop
+
+ popad
+ ret
+ }
+}
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
+#define HAS_SCALEFILTERROWS_SSE2
+__declspec(naked)
+static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ cmp eax, 0
+ je xloop1
+ cmp eax, 128
+ je xloop2
+
+ movd xmm6, eax // xmm6 = y fraction
+ punpcklwd xmm6, xmm6
+ pshufd xmm6, xmm6, 0
+ neg eax // xmm5 = 256 - y fraction
+ add eax, 256
+ movd xmm5, eax
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+ pxor xmm7, xmm7
+
+ xloop:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ lea esi, [esi + 16]
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm2, xmm7
+ punpckhbw xmm1, xmm7
+ punpckhbw xmm3, xmm7
+ pmullw xmm0, xmm5 // scale row 0
+ pmullw xmm1, xmm5
+ pmullw xmm2, xmm6 // scale row 1
+ pmullw xmm3, xmm6
+ paddusw xmm0, xmm2 // sum rows
+ paddusw xmm1, xmm3
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 16
+ ja xloop
+
+ mov al, [edi - 1]
+ mov [edi], al
+ pop edi
+ pop esi
+ ret
+
+ xloop1:
+ movdqa xmm0, [esi]
+ lea esi, [esi + 16]
+ movdqa [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 16
+ ja xloop1
+
+ mov al, [edi - 1]
+ mov [edi], al
+ pop edi
+ pop esi
+ ret
+
+ xloop2:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ lea esi, [esi + 16]
+ pavgb xmm0, xmm2
+ movdqa [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 16
+ ja xloop2
+
+ mov al, [edi - 1]
+ mov [edi], al
+ pop edi
+ pop esi
+ ret
+ }
+}
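+
+// C sketch of the SSE2 blend above (it mirrors ScaleFilterRows_C later in
+// this file): each output byte is an 8.8 fixed-point mix of the two rows,
+// with fractions 0 and 128 special-cased as a plain copy and a pavgb.
+static uint8 FilterRowsPixel_Sketch(uint8 r0, uint8 r1, int source_y_fraction) {
+  int f1 = source_y_fraction;  // 0..255, weight of row 1
+  int f0 = 256 - f1;           // weight of row 0
+  return (uint8)((r0 * f0 + r1 * f1) >> 8);  // pmullw/paddusw/psrlw 8
+}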
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
+#define HAS_SCALEFILTERROWS_SSSE3
+__declspec(naked)
+static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ cmp eax, 0
+ je xloop1
+ cmp eax, 128
+ je xloop2
+
+ shr eax, 1
+ mov ah, al
+ neg al
+ add al, 128
+ movd xmm5, eax
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+
+ xloop:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ lea esi, [esi + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ movdqa [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 16
+ ja xloop
+
+ mov al, [edi - 1]
+ mov [edi], al
+ pop edi
+ pop esi
+ ret
+
+ xloop1:
+ movdqa xmm0, [esi]
+ lea esi, [esi + 16]
+ movdqa [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 16
+ ja xloop1
+
+ mov al, [edi - 1]
+ mov [edi], al
+ pop edi
+ pop esi
+ ret
+
+ xloop2:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ lea esi, [esi + 16]
+ pavgb xmm0, xmm2
+ movdqa [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 16
+ ja xloop2
+
+ mov al, [edi - 1]
+ mov [edi], al
+ pop edi
+ pop esi
+ ret
+
+ }
+}
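+
+// C sketch of the intended per-pixel math of the SSSE3 weight packing
+// above: the fraction is halved so both weights fit in a byte pair for
+// pmaddubsw, which computes r0 * (128 - f) + r1 * f per pixel; psrlw by 7
+// then normalizes.
+static uint8 FilterRowsPixelSSSE3_Sketch(uint8 r0, uint8 r1,
+                                         int source_y_fraction) {
+  int f = source_y_fraction >> 1;  // shr eax, 1 => 0..127
+  return (uint8)((r0 * (128 - f) + r1 * f) >> 7);  // pmaddubsw + psrlw 7
+}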
+
+// Note that movdqa+palignr may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked)
+static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width) {
+ __asm {
+ mov edx, [esp + 4] // dst_ptr
+ mov eax, [esp + 8] // src_ptr
+ mov ecx, [esp + 12] // dst_width
+ movdqa xmm1, _round34
+ movdqa xmm2, _shuf01
+ movdqa xmm3, _shuf11
+ movdqa xmm4, _shuf21
+ movdqa xmm5, _madd01
+ movdqa xmm6, _madd11
+ movdqa xmm7, _madd21
+
+ wloop:
+ movdqa xmm0, [eax] // pixels 0..7
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm1
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax+8] // pixels 8..15
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm1
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx+8], xmm0
+ movdqa xmm0, [eax+16] // pixels 16..23
+ lea eax, [eax+32]
+ pshufb xmm0, xmm4
+ pmaddubsw xmm0, xmm7
+ paddsw xmm0, xmm1
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx+16], xmm0
+ lea edx, [edx+24]
+ sub ecx, 24
+ ja wloop
+ ret
+ }
+}
+
+#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
+
+// GCC versions of the row functions are verbatim conversions from Visual C.
+// They were generated by disassembling the Visual C object file with objdump:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+#define HAS_SCALEROWDOWN2_SSE2
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+);
+}
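+
+// In the GCC blocks above and below, %0..%n refer to the operands listed
+// after the asm body: "+r" marks a read-write register operand, plain "r"
+// a read-only one, and the final clobber list ("memory", "cc", ...) tells
+// the compiler the asm writes memory and the flags.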
+
+static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%3,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc"
+);
+}
+
+#define HAS_SCALEROWDOWN4_SSE2
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+);
+}
+
+static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t temp = 0;
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0x8,%%xmm7 \n"
+ "lea (%4,%4,2),%3 \n"
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa (%0,%4,2),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,2),%%xmm3 \n"
+ "movdqa (%0,%3,1),%%xmm4 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm5 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pand %%xmm7,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(temp) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__x86_64__)
+ , "xmm6", "xmm7"
+#endif
+);
+}
+
+#define HAS_SCALEROWDOWN8_SSE2
+static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlq $0x38,%%xmm5 \n"
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%1) \n"
+ "lea 0x4(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+);
+}
+
+#if defined(__i386__)
+void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ asm(
+ DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
+ "pusha \n"
+ "mov 0x24(%esp),%esi \n"
+ "mov 0x28(%esp),%ebx \n"
+ "mov 0x2c(%esp),%edi \n"
+ "mov 0x30(%esp),%ecx \n"
+ "lea (%ebx,%ebx,2),%edx \n"
+ "pxor %xmm7,%xmm7 \n"
+
+"1:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa 0x10(%esi),%xmm1 \n"
+ "movdqa (%esi,%ebx,1),%xmm2 \n"
+ "movdqa 0x10(%esi,%ebx,1),%xmm3 \n"
+ "pavgb %xmm2,%xmm0 \n"
+ "pavgb %xmm3,%xmm1 \n"
+ "movdqa (%esi,%ebx,2),%xmm2 \n"
+ "movdqa 0x10(%esi,%ebx,2),%xmm3 \n"
+ "movdqa (%esi,%edx,1),%xmm4 \n"
+ "movdqa 0x10(%esi,%edx,1),%xmm5 \n"
+ "lea (%esi,%ebx,4),%ebp \n"
+ "lea 0x20(%esi),%esi \n"
+ "pavgb %xmm4,%xmm2 \n"
+ "pavgb %xmm5,%xmm3 \n"
+ "pavgb %xmm2,%xmm0 \n"
+ "pavgb %xmm3,%xmm1 \n"
+ "movdqa 0x0(%ebp),%xmm2 \n"
+ "movdqa 0x10(%ebp),%xmm3 \n"
+ "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n"
+ "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n"
+ "pavgb %xmm4,%xmm2 \n"
+ "pavgb %xmm5,%xmm3 \n"
+ "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n"
+ "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n"
+ "movdqa 0x0(%ebp,%edx,1),%xmm6 \n"
+ "pavgb %xmm6,%xmm4 \n"
+ "movdqa 0x10(%ebp,%edx,1),%xmm6 \n"
+ "pavgb %xmm6,%xmm5 \n"
+ "pavgb %xmm4,%xmm2 \n"
+ "pavgb %xmm5,%xmm3 \n"
+ "pavgb %xmm2,%xmm0 \n"
+ "pavgb %xmm3,%xmm1 \n"
+ "psadbw %xmm7,%xmm0 \n"
+ "psadbw %xmm7,%xmm1 \n"
+ "pshufd $0xd8,%xmm0,%xmm0 \n"
+ "pshufd $0x8d,%xmm1,%xmm1 \n"
+ "por %xmm1,%xmm0 \n"
+ "psrlw $0x3,%xmm0 \n"
+ "packuswb %xmm0,%xmm0 \n"
+ "packuswb %xmm0,%xmm0 \n"
+ "movd %xmm0,(%edi) \n"
+ "lea 0x4(%edi),%edi \n"
+ "sub $0x4,%ecx \n"
+ "ja 1b \n"
+ "popa \n"
+ "ret \n"
+);
+
+// -fPIC is used for the magiccam plugin; the functions below reference the
+// shuffle constants by absolute address, so they are compiled out for
+// position-independent builds.
+#if !defined(__PIC__)
+#define HAS_SCALEROWDOWN34_SSSE3
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ asm(
+ DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
+ "pusha \n"
+ "mov 0x24(%esp),%esi \n"
+ "mov 0x2c(%esp),%edi \n"
+ "mov 0x30(%esp),%ecx \n"
+ "movdqa _shuf0,%xmm3 \n"
+ "movdqa _shuf1,%xmm4 \n"
+ "movdqa _shuf2,%xmm5 \n"
+
+"1:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa 0x10(%esi),%xmm2 \n"
+ "lea 0x20(%esi),%esi \n"
+ "movdqa %xmm2,%xmm1 \n"
+ "palignr $0x8,%xmm0,%xmm1 \n"
+ "pshufb %xmm3,%xmm0 \n"
+ "pshufb %xmm4,%xmm1 \n"
+ "pshufb %xmm5,%xmm2 \n"
+ "movq %xmm0,(%edi) \n"
+ "movq %xmm1,0x8(%edi) \n"
+ "movq %xmm2,0x10(%edi) \n"
+ "lea 0x18(%edi),%edi \n"
+ "sub $0x18,%ecx \n"
+ "ja 1b \n"
+ "popa \n"
+ "ret \n"
+);
+
+void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ asm(
+ DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
+ "pusha \n"
+ "mov 0x24(%esp),%esi \n"
+ "mov 0x28(%esp),%ebp \n"
+ "mov 0x2c(%esp),%edi \n"
+ "mov 0x30(%esp),%ecx \n"
+ "movdqa _shuf01,%xmm2 \n"
+ "movdqa _shuf11,%xmm3 \n"
+ "movdqa _shuf21,%xmm4 \n"
+ "movdqa _madd01,%xmm5 \n"
+ "movdqa _madd11,%xmm6 \n"
+ "movdqa _round34,%xmm7 \n"
+
+"1:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa (%esi,%ebp),%xmm1 \n"
+ "pavgb %xmm1,%xmm0 \n"
+ "pshufb %xmm2,%xmm0 \n"
+ "pmaddubsw %xmm5,%xmm0 \n"
+ "paddsw %xmm7,%xmm0 \n"
+ "psrlw $0x2,%xmm0 \n"
+ "packuswb %xmm0,%xmm0 \n"
+ "movq %xmm0,(%edi) \n"
+ "movdqu 0x8(%esi),%xmm0 \n"
+ "movdqu 0x8(%esi,%ebp),%xmm1 \n"
+ "pavgb %xmm1,%xmm0 \n"
+ "pshufb %xmm3,%xmm0 \n"
+ "pmaddubsw %xmm6,%xmm0 \n"
+ "paddsw %xmm7,%xmm0 \n"
+ "psrlw $0x2,%xmm0 \n"
+ "packuswb %xmm0,%xmm0 \n"
+ "movq %xmm0,0x8(%edi) \n"
+ "movdqa 0x10(%esi),%xmm0 \n"
+ "movdqa 0x10(%esi,%ebp),%xmm1 \n"
+ "lea 0x20(%esi),%esi \n"
+ "pavgb %xmm1,%xmm0 \n"
+ "pshufb %xmm4,%xmm0 \n"
+ "movdqa _madd21,%xmm1 \n"
+ "pmaddubsw %xmm1,%xmm0 \n"
+ "paddsw %xmm7,%xmm0 \n"
+ "psrlw $0x2,%xmm0 \n"
+ "packuswb %xmm0,%xmm0 \n"
+ "movq %xmm0,0x10(%edi) \n"
+ "lea 0x18(%edi),%edi \n"
+ "sub $0x18,%ecx \n"
+ "ja 1b \n"
+
+ "popa \n"
+ "ret \n"
+);
+
+void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ asm(
+ DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
+ "pusha \n"
+ "mov 0x24(%esp),%esi \n"
+ "mov 0x28(%esp),%ebp \n"
+ "mov 0x2c(%esp),%edi \n"
+ "mov 0x30(%esp),%ecx \n"
+ "movdqa _shuf01,%xmm2 \n"
+ "movdqa _shuf11,%xmm3 \n"
+ "movdqa _shuf21,%xmm4 \n"
+ "movdqa _madd01,%xmm5 \n"
+ "movdqa _madd11,%xmm6 \n"
+ "movdqa _round34,%xmm7 \n"
+
+"1:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa (%esi,%ebp,1),%xmm1 \n"
+ "pavgb %xmm0,%xmm1 \n"
+ "pavgb %xmm1,%xmm0 \n"
+ "pshufb %xmm2,%xmm0 \n"
+ "pmaddubsw %xmm5,%xmm0 \n"
+ "paddsw %xmm7,%xmm0 \n"
+ "psrlw $0x2,%xmm0 \n"
+ "packuswb %xmm0,%xmm0 \n"
+ "movq %xmm0,(%edi) \n"
+ "movdqu 0x8(%esi),%xmm0 \n"
+ "movdqu 0x8(%esi,%ebp,1),%xmm1 \n"
+ "pavgb %xmm0,%xmm1 \n"
+ "pavgb %xmm1,%xmm0 \n"
+ "pshufb %xmm3,%xmm0 \n"
+ "pmaddubsw %xmm6,%xmm0 \n"
+ "paddsw %xmm7,%xmm0 \n"
+ "psrlw $0x2,%xmm0 \n"
+ "packuswb %xmm0,%xmm0 \n"
+ "movq %xmm0,0x8(%edi) \n"
+ "movdqa 0x10(%esi),%xmm0 \n"
+ "movdqa 0x10(%esi,%ebp,1),%xmm1 \n"
+ "lea 0x20(%esi),%esi \n"
+ "pavgb %xmm0,%xmm1 \n"
+ "pavgb %xmm1,%xmm0 \n"
+ "pshufb %xmm4,%xmm0 \n"
+ "movdqa _madd21,%xmm1 \n"
+ "pmaddubsw %xmm1,%xmm0 \n"
+ "paddsw %xmm7,%xmm0 \n"
+ "psrlw $0x2,%xmm0 \n"
+ "packuswb %xmm0,%xmm0 \n"
+ "movq %xmm0,0x10(%edi) \n"
+ "lea 0x18(%edi),%edi \n"
+ "sub $0x18,%ecx \n"
+ "ja 1b \n"
+ "popa \n"
+ "ret \n"
+);
+
+#define HAS_SCALEROWDOWN38_SSSE3
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ asm(
+ DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
+ "pusha \n"
+ "mov 0x24(%esp),%esi \n"
+ "mov 0x28(%esp),%edx \n"
+ "mov 0x2c(%esp),%edi \n"
+ "mov 0x30(%esp),%ecx \n"
+ "movdqa _shuf38a ,%xmm4 \n"
+ "movdqa _shuf38b ,%xmm5 \n"
+
+"1:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa 0x10(%esi),%xmm1 \n"
+ "lea 0x20(%esi),%esi \n"
+ "pshufb %xmm4,%xmm0 \n"
+ "pshufb %xmm5,%xmm1 \n"
+ "paddusb %xmm1,%xmm0 \n"
+ "movq %xmm0,(%edi) \n"
+ "movhlps %xmm0,%xmm1 \n"
+ "movd %xmm1,0x8(%edi) \n"
+ "lea 0xc(%edi),%edi \n"
+ "sub $0xc,%ecx \n"
+ "ja 1b \n"
+ "popa \n"
+ "ret \n"
+);
+
+void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ asm(
+ DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
+ "pusha \n"
+ "mov 0x24(%esp),%esi \n"
+ "mov 0x28(%esp),%edx \n"
+ "mov 0x2c(%esp),%edi \n"
+ "mov 0x30(%esp),%ecx \n"
+ "movdqa _shufac0,%xmm4 \n"
+ "movdqa _shufac3,%xmm5 \n"
+ "movdqa _scaleac3,%xmm6 \n"
+ "pxor %xmm7,%xmm7 \n"
+
+"1:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa (%esi,%edx,1),%xmm2 \n"
+ "movhlps %xmm0,%xmm1 \n"
+ "movhlps %xmm2,%xmm3 \n"
+ "punpcklbw %xmm7,%xmm0 \n"
+ "punpcklbw %xmm7,%xmm1 \n"
+ "punpcklbw %xmm7,%xmm2 \n"
+ "punpcklbw %xmm7,%xmm3 \n"
+ "paddusw %xmm2,%xmm0 \n"
+ "paddusw %xmm3,%xmm1 \n"
+ "movdqa (%esi,%edx,2),%xmm2 \n"
+ "lea 0x10(%esi),%esi \n"
+ "movhlps %xmm2,%xmm3 \n"
+ "punpcklbw %xmm7,%xmm2 \n"
+ "punpcklbw %xmm7,%xmm3 \n"
+ "paddusw %xmm2,%xmm0 \n"
+ "paddusw %xmm3,%xmm1 \n"
+ "movdqa %xmm0,%xmm2 \n"
+ "psrldq $0x2,%xmm0 \n"
+ "paddusw %xmm0,%xmm2 \n"
+ "psrldq $0x2,%xmm0 \n"
+ "paddusw %xmm0,%xmm2 \n"
+ "pshufb %xmm4,%xmm2 \n"
+ "movdqa %xmm1,%xmm3 \n"
+ "psrldq $0x2,%xmm1 \n"
+ "paddusw %xmm1,%xmm3 \n"
+ "psrldq $0x2,%xmm1 \n"
+ "paddusw %xmm1,%xmm3 \n"
+ "pshufb %xmm5,%xmm3 \n"
+ "paddusw %xmm3,%xmm2 \n"
+ "pmulhuw %xmm6,%xmm2 \n"
+ "packuswb %xmm2,%xmm2 \n"
+ "movd %xmm2,(%edi) \n"
+ "pextrw $0x2,%xmm2,%eax \n"
+ "mov %ax,0x4(%edi) \n"
+ "lea 0x6(%edi),%edi \n"
+ "sub $0x6,%ecx \n"
+ "ja 1b \n"
+ "popa \n"
+ "ret \n"
+);
+
+void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ asm(
+ DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
+ "pusha \n"
+ "mov 0x24(%esp),%esi \n"
+ "mov 0x28(%esp),%edx \n"
+ "mov 0x2c(%esp),%edi \n"
+ "mov 0x30(%esp),%ecx \n"
+ "movdqa _shufab0,%xmm4 \n"
+ "movdqa _shufab1,%xmm5 \n"
+ "movdqa _shufab2,%xmm6 \n"
+ "movdqa _scaleab2,%xmm7 \n"
+
+"1:"
+ "movdqa (%esi),%xmm2 \n"
+ "pavgb (%esi,%edx,1),%xmm2 \n"
+ "lea 0x10(%esi),%esi \n"
+ "movdqa %xmm2,%xmm0 \n"
+ "pshufb %xmm4,%xmm0 \n"
+ "movdqa %xmm2,%xmm1 \n"
+ "pshufb %xmm5,%xmm1 \n"
+ "paddusw %xmm1,%xmm0 \n"
+ "pshufb %xmm6,%xmm2 \n"
+ "paddusw %xmm2,%xmm0 \n"
+ "pmulhuw %xmm7,%xmm0 \n"
+ "packuswb %xmm0,%xmm0 \n"
+ "movd %xmm0,(%edi) \n"
+ "pextrw $0x2,%xmm0,%eax \n"
+ "mov %ax,0x4(%edi) \n"
+ "lea 0x6(%edi),%edi \n"
+ "sub $0x6,%ecx \n"
+ "ja 1b \n"
+ "popa \n"
+ "ret \n"
+);
+#endif // __PIC__
+
+#define HAS_SCALEADDROWS_SSE2
+void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+ uint16* dst_ptr, int src_width,
+ int src_height);
+ asm(
+ DECLARE_FUNCTION(ScaleAddRows_SSE2)
+ "pusha \n"
+ "mov 0x24(%esp),%esi \n"
+ "mov 0x28(%esp),%edx \n"
+ "mov 0x2c(%esp),%edi \n"
+ "mov 0x30(%esp),%ecx \n"
+ "mov 0x34(%esp),%ebx \n"
+ "pxor %xmm5,%xmm5 \n"
+
+"1:"
+ "movdqa (%esi),%xmm2 \n"
+ "lea (%esi,%edx,1),%eax \n"
+ "movhlps %xmm2,%xmm3 \n"
+ "lea -0x1(%ebx),%ebp \n"
+ "punpcklbw %xmm5,%xmm2 \n"
+ "punpcklbw %xmm5,%xmm3 \n"
+
+"2:"
+ "movdqa (%eax),%xmm0 \n"
+ "lea (%eax,%edx,1),%eax \n"
+ "movhlps %xmm0,%xmm1 \n"
+ "punpcklbw %xmm5,%xmm0 \n"
+ "punpcklbw %xmm5,%xmm1 \n"
+ "paddusw %xmm0,%xmm2 \n"
+ "paddusw %xmm1,%xmm3 \n"
+ "sub $0x1,%ebp \n"
+ "ja 2b \n"
+
+ "movdqa %xmm2,(%edi) \n"
+ "movdqa %xmm3,0x10(%edi) \n"
+ "lea 0x20(%edi),%edi \n"
+ "lea 0x10(%esi),%esi \n"
+ "sub $0x10,%ecx \n"
+ "ja 1b \n"
+ "popa \n"
+ "ret \n"
+);
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
+#define HAS_SCALEFILTERROWS_SSE2
+void ScaleFilterRows_SSE2(uint8* dst_ptr,
+ const uint8* src_ptr, int src_stride,
+ int dst_width, int source_y_fraction);
+ asm(
+ DECLARE_FUNCTION(ScaleFilterRows_SSE2)
+ "push %esi \n"
+ "push %edi \n"
+ "mov 0xc(%esp),%edi \n"
+ "mov 0x10(%esp),%esi \n"
+ "mov 0x14(%esp),%edx \n"
+ "mov 0x18(%esp),%ecx \n"
+ "mov 0x1c(%esp),%eax \n"
+ "cmp $0x0,%eax \n"
+ "je 2f \n"
+ "cmp $0x80,%eax \n"
+ "je 3f \n"
+ "movd %eax,%xmm6 \n"
+ "punpcklwd %xmm6,%xmm6 \n"
+ "pshufd $0x0,%xmm6,%xmm6 \n"
+ "neg %eax \n"
+ "add $0x100,%eax \n"
+ "movd %eax,%xmm5 \n"
+ "punpcklwd %xmm5,%xmm5 \n"
+ "pshufd $0x0,%xmm5,%xmm5 \n"
+ "pxor %xmm7,%xmm7 \n"
+
+"1:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa (%esi,%edx,1),%xmm2 \n"
+ "lea 0x10(%esi),%esi \n"
+ "movdqa %xmm0,%xmm1 \n"
+ "movdqa %xmm2,%xmm3 \n"
+ "punpcklbw %xmm7,%xmm0 \n"
+ "punpcklbw %xmm7,%xmm2 \n"
+ "punpckhbw %xmm7,%xmm1 \n"
+ "punpckhbw %xmm7,%xmm3 \n"
+ "pmullw %xmm5,%xmm0 \n"
+ "pmullw %xmm5,%xmm1 \n"
+ "pmullw %xmm6,%xmm2 \n"
+ "pmullw %xmm6,%xmm3 \n"
+ "paddusw %xmm2,%xmm0 \n"
+ "paddusw %xmm3,%xmm1 \n"
+ "psrlw $0x8,%xmm0 \n"
+ "psrlw $0x8,%xmm1 \n"
+ "packuswb %xmm1,%xmm0 \n"
+ "movdqa %xmm0,(%edi) \n"
+ "lea 0x10(%edi),%edi \n"
+ "sub $0x10,%ecx \n"
+ "ja 1b \n"
+ "mov -0x1(%edi),%al \n"
+ "mov %al,(%edi) \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "ret \n"
+
+"2:"
+ "movdqa (%esi),%xmm0 \n"
+ "lea 0x10(%esi),%esi \n"
+ "movdqa %xmm0,(%edi) \n"
+ "lea 0x10(%edi),%edi \n"
+ "sub $0x10,%ecx \n"
+ "ja 2b \n"
+
+ "mov -0x1(%edi),%al \n"
+ "mov %al,(%edi) \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "ret \n"
+
+"3:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa (%esi,%edx,1),%xmm2 \n"
+ "lea 0x10(%esi),%esi \n"
+ "pavgb %xmm2,%xmm0 \n"
+ "movdqa %xmm0,(%edi) \n"
+ "lea 0x10(%edi),%edi \n"
+ "sub $0x10,%ecx \n"
+ "ja 3b \n"
+
+ "mov -0x1(%edi),%al \n"
+ "mov %al,(%edi) \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "ret \n"
+);
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
+#define HAS_SCALEFILTERROWS_SSSE3
+void ScaleFilterRows_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr, int src_stride,
+ int dst_width, int source_y_fraction);
+ asm(
+ DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
+ "push %esi \n"
+ "push %edi \n"
+ "mov 0xc(%esp),%edi \n"
+ "mov 0x10(%esp),%esi \n"
+ "mov 0x14(%esp),%edx \n"
+ "mov 0x18(%esp),%ecx \n"
+ "mov 0x1c(%esp),%eax \n"
+ "cmp $0x0,%eax \n"
+ "je 2f \n"
+ "cmp $0x80,%eax \n"
+ "je 3f \n"
+ "shr %eax \n"
+ "mov %al,%ah \n"
+ "neg %al \n"
+ "add $0x80,%al \n"
+ "movd %eax,%xmm5 \n"
+ "punpcklwd %xmm5,%xmm5 \n"
+ "pshufd $0x0,%xmm5,%xmm5 \n"
+
+"1:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa (%esi,%edx,1),%xmm2 \n"
+ "lea 0x10(%esi),%esi \n"
+ "movdqa %xmm0,%xmm1 \n"
+ "punpcklbw %xmm2,%xmm0 \n"
+ "punpckhbw %xmm2,%xmm1 \n"
+ "pmaddubsw %xmm5,%xmm0 \n"
+ "pmaddubsw %xmm5,%xmm1 \n"
+ "psrlw $0x7,%xmm0 \n"
+ "psrlw $0x7,%xmm1 \n"
+ "packuswb %xmm1,%xmm0 \n"
+ "movdqa %xmm0,(%edi) \n"
+ "lea 0x10(%edi),%edi \n"
+ "sub $0x10,%ecx \n"
+ "ja 1b \n"
+ "mov -0x1(%edi),%al \n"
+ "mov %al,(%edi) \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "ret \n"
+
+"2:"
+ "movdqa (%esi),%xmm0 \n"
+ "lea 0x10(%esi),%esi \n"
+ "movdqa %xmm0,(%edi) \n"
+ "lea 0x10(%edi),%edi \n"
+ "sub $0x10,%ecx \n"
+ "ja 2b \n"
+ "mov -0x1(%edi),%al \n"
+ "mov %al,(%edi) \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "ret \n"
+
+"3:"
+ "movdqa (%esi),%xmm0 \n"
+ "movdqa (%esi,%edx,1),%xmm2 \n"
+ "lea 0x10(%esi),%esi \n"
+ "pavgb %xmm2,%xmm0 \n"
+ "movdqa %xmm0,(%edi) \n"
+ "lea 0x10(%edi),%edi \n"
+ "sub $0x10,%ecx \n"
+ "ja 3b \n"
+ "mov -0x1(%edi),%al \n"
+ "mov %al,(%edi) \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "ret \n"
+);
+
+#elif defined(__x86_64__)
+static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "lea (%3,%3,2),%%r10 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%3,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa (%0,%3,2),%%xmm2 \n"
+ "movdqa 0x10(%0,%3,2),%%xmm3 \n"
+ "movdqa (%0,%%r10,1),%%xmm4 \n"
+ "movdqa 0x10(%0,%%r10,1),%%xmm5 \n"
+ "lea (%0,%3,4),%%r11 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa 0x0(%%r11),%%xmm2 \n"
+ "movdqa 0x10(%%r11),%%xmm3 \n"
+ "movdqa 0x0(%%r11,%3,1),%%xmm4 \n"
+ "movdqa 0x10(%%r11,%3,1),%%xmm5 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "movdqa 0x0(%%r11,%3,2),%%xmm4 \n"
+ "movdqa 0x10(%%r11,%3,2),%%xmm5 \n"
+ "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n"
+ "pavgb %%xmm6,%%xmm4 \n"
+ "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n"
+ "pavgb %%xmm6,%%xmm5 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psadbw %%xmm7,%%xmm0 \n"
+ "psadbw %%xmm7,%%xmm1 \n"
+ "pshufd $0xd8,%%xmm0,%%xmm0 \n"
+ "pshufd $0x8d,%%xmm1,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "psrlw $0x3,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%1) \n"
+ "lea 0x4(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "r10", "r11", "xmm6", "xmm7"
+);
+}
+
+#define HAS_SCALEROWDOWN34_SSSE3
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa (%3),%%xmm3 \n"
+ "movdqa (%4),%%xmm4 \n"
+ "movdqa (%5),%%xmm5 \n"
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(_shuf0), // %3
+ "r"(_shuf1), // %4
+ "r"(_shuf2) // %5
+ : "memory", "cc"
+);
+}
+
+static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa (%4),%%xmm2 \n" // _shuf01
+ "movdqa (%5),%%xmm3 \n" // _shuf11
+ "movdqa (%6),%%xmm4 \n" // _shuf21
+ "movdqa (%7),%%xmm5 \n" // _madd01
+ "movdqa (%8),%%xmm6 \n" // _madd11
+ "movdqa (%9),%%xmm7 \n" // _round34
+ "movdqa (%10),%%xmm8 \n" // _madd21
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pshufb %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "paddsw %%xmm7,%%xmm0 \n"
+ "psrlw $0x2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqu 0x8(%0),%%xmm0 \n"
+ "movdqu 0x8(%0,%3),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm6,%%xmm0 \n"
+ "paddsw %%xmm7,%%xmm0 \n"
+ "psrlw $0x2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x8(%1) \n"
+ "movdqa 0x10(%0),%%xmm0 \n"
+ "movdqa 0x10(%0,%3),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm8,%%xmm0 \n"
+ "paddsw %%xmm7,%%xmm0 \n"
+ "psrlw $0x2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"(_shuf01), // %4
+ "r"(_shuf11), // %5
+ "r"(_shuf21), // %6
+ "r"(_madd01), // %7
+ "r"(_madd11), // %8
+ "r"(_round34), // %9
+ "r"(_madd21) // %10
+ : "memory", "cc", "xmm6", "xmm7", "xmm8"
+);
+}
+
+static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa (%4),%%xmm2 \n" // _shuf01
+ "movdqa (%5),%%xmm3 \n" // _shuf11
+ "movdqa (%6),%%xmm4 \n" // _shuf21
+ "movdqa (%7),%%xmm5 \n" // _madd01
+ "movdqa (%8),%%xmm6 \n" // _madd11
+ "movdqa (%9),%%xmm7 \n" // _round34
+ "movdqa (%10),%%xmm8 \n" // _madd21
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3,1),%%xmm1 \n"
+ "pavgb %%xmm0,%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pshufb %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "paddsw %%xmm7,%%xmm0 \n"
+ "psrlw $0x2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqu 0x8(%0),%%xmm0 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm1 \n"
+ "pavgb %%xmm0,%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm6,%%xmm0 \n"
+ "paddsw %%xmm7,%%xmm0 \n"
+ "psrlw $0x2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x8(%1) \n"
+ "movdqa 0x10(%0),%%xmm0 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm0,%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm8,%%xmm0 \n"
+ "paddsw %%xmm7,%%xmm0 \n"
+ "psrlw $0x2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"(_shuf01), // %4
+ "r"(_shuf11), // %5
+ "r"(_shuf21), // %6
+ "r"(_madd01), // %7
+ "r"(_madd11), // %8
+ "r"(_round34), // %9
+ "r"(_madd21) // %10
+ : "memory", "cc", "xmm6", "xmm7", "xmm8"
+);
+}
+
+#define HAS_SCALEROWDOWN38_SSSE3
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa (%3),%%xmm4 \n"
+ "movdqa (%4),%%xmm5 \n"
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(_shuf38a), // %3
+ "r"(_shuf38b) // %4
+ : "memory", "cc"
+);
+}
+
+static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa (%4),%%xmm4 \n"
+ "movdqa (%5),%%xmm5 \n"
+ "movdqa (%6),%%xmm6 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+"1:"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3,1),%%xmm2 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm7,%%xmm0 \n"
+ "punpcklbw %%xmm7,%%xmm1 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqa (%0,%3,2),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm2 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm3 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm3 \n"
+ "pshufb %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,(%1) \n"
+ "pextrw $0x2,%%xmm2,%%eax \n"
+ "mov %%ax,0x4(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"(_shufac0), // %4
+ "r"(_shufac3), // %5
+ "r"(_scaleac3) // %6
+ : "memory", "cc", "rax", "xmm6", "xmm7"
+);
+}
+
+static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa (%4),%%xmm4 \n"
+ "movdqa (%5),%%xmm5 \n"
+ "movdqa (%6),%%xmm6 \n"
+ "movdqa (%7),%%xmm7 \n"
+"1:"
+ "movdqa (%0),%%xmm2 \n"
+ "pavgb (%0,%3,1),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%1) \n"
+ "pextrw $0x2,%%xmm0,%%eax \n"
+ "mov %%ax,0x4(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"(_shufab0), // %4
+ "r"(_shufab1), // %5
+ "r"(_shufab2), // %6
+ "r"(_scaleab2) // %7
+ : "memory", "cc", "rax", "xmm6", "xmm7"
+);
+}
+
+#define HAS_SCALEADDROWS_SSE2
+static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+ uint16* dst_ptr, int src_width,
+ int src_height) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+"1:"
+ "movdqa (%0),%%xmm2 \n"
+ "lea (%0,%4,1),%%r10 \n"
+ "movhlps %%xmm2,%%xmm3 \n"
+ "lea -0x1(%3),%%r11 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+
+"2:"
+ "movdqa (%%r10),%%xmm0 \n"
+ "lea (%%r10,%4,1),%%r10 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "paddusw %%xmm0,%%xmm2 \n"
+ "paddusw %%xmm1,%%xmm3 \n"
+ "sub $0x1,%%r11 \n"
+ "ja 2b \n"
+
+ "movdqa %%xmm2,(%1) \n"
+ "movdqa %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width), // %2
+ "+r"(src_height) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "r10", "r11"
+);
+}
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
+#define HAS_SCALEFILTERROWS_SSE2
+static void ScaleFilterRows_SSE2(uint8* dst_ptr,
+ const uint8* src_ptr, int src_stride,
+ int dst_width, int source_y_fraction) {
+ if (source_y_fraction == 0) {
+ asm volatile (
+ "1:"
+ "movdqa (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ "mov -0x1(%0),%%al \n"
+ "mov %%al,(%0) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "rax"
+ );
+ return;
+ } else if (source_y_fraction == 128) {
+ asm volatile (
+ "1:"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%3,1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ "mov -0x1(%0),%%al \n"
+ "mov %%al,(%0) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "rax"
+ );
+ return;
+ } else {
+ asm volatile (
+ "mov %3,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "neg %%eax \n"
+ "add $0x100,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "1:"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm7,%%xmm0 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "punpckhbw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm5,%%xmm0 \n"
+ "pmullw %%xmm5,%%xmm1 \n"
+ "pmullw %%xmm6,%%xmm2 \n"
+ "pmullw %%xmm6,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ "mov -0x1(%0),%%al \n"
+ "mov %%al,(%0) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "rax", "xmm6", "xmm7"
+ );
+ }
+ return;
+}
+
+// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
+#define HAS_SCALEFILTERROWS_SSSE3
+static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr, int src_stride,
+ int dst_width, int source_y_fraction) {
+ if (source_y_fraction == 0) {
+ asm volatile (
+ "1:"
+ "movdqa (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ "mov -0x1(%0),%%al \n"
+ "mov %%al,(%0) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "rax"
+ );
+ return;
+ } else if (source_y_fraction == 128) {
+ asm volatile (
+ "1:"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%3,1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ "mov -0x1(%0),%%al \n"
+ "mov %%al,(%0) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "rax"
+ );
+ return;
+ } else {
+ asm volatile (
+ "mov %3,%%eax \n"
+ "shr %%eax \n"
+ "mov %%al,%%ah \n"
+ "neg %%al \n"
+ "add $0x80,%%al \n"
+ "movd %%eax,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "1:"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ "mov -0x1(%0),%%al \n"
+ "mov %%al,(%0) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "rax"
+ );
+ }
+ return;
+}
+#endif
+#endif
+
+// CPU agnostic row functions
+static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ *dst++ = *src_ptr;
+ src_ptr += 2;
+ }
+}
+
+static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ *dst++ = (src_ptr[0] + src_ptr[1] +
+ src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
+ src_ptr += 2;
+ }
+}
+
+static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ *dst++ = *src_ptr;
+ src_ptr += 4;
+ }
+}
+
+static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
+ src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
+ src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
+ src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
+ src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
+ src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
+ 8) >> 4;
+ src_ptr += 4;
+ }
+}
+
+// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
+// Keeping the total buffer under 4096 bytes avoids a stack probe, saving 4% cpu.
+// The following two lines fail to compile on Windows: in C, a static const
+// int is not a constant expression, so it cannot size the arrays below.
+//static const int kMaxOutputWidth = 640;
+//static const int kMaxRow12 = 1280; // kMaxOutputWidth * 2
+#define kMaxOutputWidth 640
+#define kMaxRow12 1280
+
+static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ *dst++ = *src_ptr;
+ src_ptr += 8;
+ }
+}
+
+// Note: calling code checks that dst_width is at most kMaxOutputWidth and
+// uses ScaleRowDown8_C instead when it is not.
+static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
+ uint8* dst, int dst_width) {
+ ALIGN16(uint8 src_row[kMaxRow12 * 2]);
+ assert(dst_width <= kMaxOutputWidth);
+ ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
+ ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
+ src_row + kMaxOutputWidth,
+ dst_width * 2);
+ ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
+}
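+
+// The 8x8 box above is composed from existing kernels: two ScaleRowDown4Int_C
+// passes reduce rows 0..3 and rows 4..7 into two 2x-wide rows inside src_row,
+// and a final ScaleRowDown2Int_C pass reduces those two rows to the output.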
+
+static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride,
+ uint8* dst, int dst_width) {
+ uint8* dend;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ dend = dst + dst_width;
+ do {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];
+ dst += 3;
+ src_ptr += 4;
+ } while (dst < dend);
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
+ uint8* d, int dst_width) {
+ uint8* dend;
+ const uint8* s;
+ const uint8* t;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ dend = d + dst_width;
+ s = src_ptr;
+ t = src_ptr + src_stride;
+ do {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ } while (d < dend);
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
+ uint8* d, int dst_width) {
+ uint8* dend;
+ const uint8* s;
+ const uint8* t;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ dend = d + dst_width;
+ s = src_ptr;
+ t = src_ptr + src_stride;
+ do {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ } while (d < dend);
+}
+
+#if defined(HAS_SCALEFILTERROWS_SSE2)
+// Filter row to 3/4
+static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width) {
+ uint8* dend;
+ const uint8* s;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ dend = dst_ptr + dst_width;
+ s = src_ptr;
+ do {
+ dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ dst_ptr += 3;
+ s += 4;
+ } while (dst_ptr < dend);
+}
+#endif
+
+static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int dx) {
+ int x = 0;
+ int j;
+ for (j = 0; j < dst_width; ++j) {
+ int xi = x >> 16;
+ int xf1 = x & 0xffff;
+ int xf0 = 65536 - xf1;
+
+ *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
+ x += dx;
+ }
+}
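+
+// Hypothetical usage sketch for the 16.16 fixed-point step above (the real
+// callers appear later in the file; shown only to illustrate the convention):
+//   int dx = (src_width << 16) / dst_width;  // source step per output pixel
+//   ScaleFilterCols_C(dst_ptr, src_ptr, dst_width, dx);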
+
+// As with kMaxOutputWidth above, a static const int does not compile as an
+// array size on Windows, so a #define is used.
+//static const int kMaxInputWidth = 2560;
+#define kMaxInputWidth 2560
+#if defined(HAS_SCALEFILTERROWS_SSE2)
+#define HAS_SCALEROWDOWN34_SSE2
+// Filter rows 0 and 1 together, 3 : 1
+static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ ALIGN16(uint8 row[kMaxInputWidth]);
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
+ ScaleFilterCols34_C(dst_ptr, row, dst_width);
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ ALIGN16(uint8 row[kMaxInputWidth]);
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
+ ScaleFilterCols34_C(dst_ptr, row, dst_width);
+}
+#endif
+
+static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ assert(dst_width % 3 == 0);
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+// 8x3 -> 3x1
+static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
+ src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
+ src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
+ (65536 / 9) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
+ src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
+ src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
+ (65536 / 9) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
+ src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
+ (65536 / 6) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+// 8x2 -> 3x1
+static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width) {
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
+ src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
+ src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
+ (65536 / 4) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+// C version 8x2 -> 8x1
+static void ScaleFilterRows_C(uint8* dst_ptr,
+ const uint8* src_ptr, int src_stride,
+ int dst_width, int source_y_fraction) {
+ int y1_fraction;
+ int y0_fraction;
+ const uint8* src_ptr1;
+ uint8* end;
+ assert(dst_width > 0);
+ y1_fraction = source_y_fraction;
+ y0_fraction = 256 - y1_fraction;
+ src_ptr1 = src_ptr + src_stride;
+ end = dst_ptr + dst_width;
+ do {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+ dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
+ dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
+ dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
+ dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
+ dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
+ dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
+ src_ptr += 8;
+ src_ptr1 += 8;
+ dst_ptr += 8;
+ } while (dst_ptr < end);
+ dst_ptr[0] = dst_ptr[-1];
+}
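+
+// Note: the trailing dst_ptr[0] = dst_ptr[-1] above duplicates the last
+// output pixel one byte past dst_width, matching the asm versions'
+// "mov al, [edi - 1]" / "mov [edi], al"; callers presumably size the row
+// buffer to allow this extra write.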
+
+void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ int x, y;
+ assert(src_width > 0);
+ assert(src_height > 0);
+ for (x = 0; x < src_width; ++x) {
+ const uint8* s = src_ptr + x;
+ int sum = 0;
+ for (y = 0; y < src_height; ++y) {
+ sum += s[0];
+ s += src_stride;
+ }
+ dst_ptr[x] = sum;
+ }
+}
+
+/**
+ * Scale plane, 1/2
+ *
+ * This is an optimized version for scaling down a plane to 1/2 of
+ * its original size.
+ *
+ */
+static void ScalePlaneDown2(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
+ } else
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 16) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
+ } else
+#endif
+ {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
+ }
+
+ {
+ int y;
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += (src_stride << 1);
+ dst_ptr += dst_stride;
+ }
+ }
+}
+
+/**
+ * Scale plane, 1/4
+ *
+ * This is an optimized version for scaling down a plane to 1/4 of
+ * its original size.
+ */
+static void ScalePlaneDown4(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ assert(IS_ALIGNED(src_width, 4));
+ assert(IS_ALIGNED(src_height, 4));
+
+#if defined(HAS_SCALEROWDOWN4_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(dst_width, 4)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
+ } else
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
+ } else
+#endif
+ {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
+ }
+
+ {
+ int y;
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += (src_stride << 2);
+ dst_ptr += dst_stride;
+ }
+ }
+}
+
+/**
+ * Scale plane, 1/8
+ *
+ * This is an optimized version for scaling down a plane to 1/8
+ * of its original size.
+ *
+ */
+static void ScalePlaneDown8(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ assert(IS_ALIGNED(src_width, 8));
+ assert(IS_ALIGNED(src_height, 8));
+
+#if defined(HAS_SCALEROWDOWN8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
+ } else
+#endif
+ {
+ ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
+ ScaleRowDown8Int_C : ScaleRowDown8_C;
+ }
+
+ {
+ int y;
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += (src_stride << 3);
+ dst_ptr += dst_stride;
+ }
+ }
+}
+
+/**
+ * Scale plane down, 3/4
+ *
+ * Provided by Frank Barchard (fbarchard@google.com)
+ *
+ */
+static void ScalePlaneDown34(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ assert(dst_width % 3 == 0);
+#if defined(HAS_SCALEROWDOWN34_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
+ }
+ } else
+#endif
+
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
+ }
+ } else
+#endif
+#if defined(HAS_SCALEROWDOWN34_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_stride, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
+ filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
+ } else
+#endif
+ {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_C;
+ ScaleRowDown34_1 = ScaleRowDown34_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
+ }
+ }
+ {
+ int src_row = 0;
+ int y;
+ for (y = 0; y < dst_height; ++y) {
+ switch (src_row) {
+ case 0:
+ ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+ break;
+
+ case 1:
+ ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
+ break;
+
+ case 2:
+ ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
+ dst_ptr, dst_width);
+ break;
+ }
+ ++src_row;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ if (src_row >= 3) {
+ src_ptr += src_stride;
+ src_row = 0;
+ }
+ }
+  }
+}
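+
+/* Worked example of the row cycle above: every 4 source rows yield 3
+ * destination rows (the 3/4 ratio).  With the filtered variants:
+ *   dst row 0: ScaleRowDown34_0(src row 0, +stride)  blends rows 0 and 1
+ *   dst row 1: ScaleRowDown34_1(src row 1, +stride)  blends rows 1 and 2
+ *   dst row 2: ScaleRowDown34_0(src row 3, -stride)  blends rows 3 and 2
+ * after which src_ptr has advanced 4 rows and the cycle restarts. */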
+
+/**
+ * Scale plane, 3/8
+ *
+ * This is an optimized version for scaling down a plane to 3/8
+ * of its original size.
+ *
+ * Reduces 16x3 to 6x1
+ */
+static void ScalePlaneDown38(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
+ uint8* dst_ptr, int dst_width);
+ assert(dst_width % 3 == 0);
+#if defined(HAS_SCALEROWDOWN38_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ (dst_width % 12 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
+ }
+ } else
+#endif
+
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_stride, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
+ }
+ } else
+#endif
+ {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_C;
+ ScaleRowDown38_2 = ScaleRowDown38_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
+ }
+ }
+ {
+ int src_row = 0;
+ int y;
+ for (y = 0; y < dst_height; ++y) {
+ switch (src_row) {
+ case 0:
+ case 1:
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ ++src_row;
+ break;
+
+ case 2:
+ ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ src_row = 0;
+ break;
+ }
+ dst_ptr += dst_stride;
+ }
+  }
+}
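+
+/* Row cycle above: 8 source rows produce 3 destination rows (the 3/8
+ * ratio) -- two passes of ScaleRowDown38_3 consuming 3 rows each,
+ * then one pass of ScaleRowDown38_2 consuming 2. */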
+
+__inline static uint32 SumBox(int iboxwidth, int iboxheight,
+ int src_stride, const uint8* src_ptr) {
+ int x, y;
+ uint32 sum;
+ assert(iboxwidth > 0);
+ assert(iboxheight > 0);
+ sum = 0u;
+ for (y = 0; y < iboxheight; ++y) {
+ for (x = 0; x < iboxwidth; ++x) {
+ sum += src_ptr[x];
+ }
+ src_ptr += src_stride;
+ }
+ return sum;
+}
+
+static void ScalePlaneBoxRow(int dst_width, int boxheight,
+ int dx, int src_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ int x = 0;
+ int i;
+ for (i = 0; i < dst_width; ++i) {
+ int ix = x >> 16;
+ int boxwidth;
+ x += dx;
+ boxwidth = (x >> 16) - ix;
+ *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
+ (boxwidth * boxheight);
+ }
+}
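+
+/* Worked example of the 16.16 stepping above: scaling 640 -> 480 gives
+ * dx = (640 << 16) / 480 = 0x15555 (~1.333), so x >> 16 visits source
+ * columns 0, 1, 2, 3, 5, ... and boxwidth runs 1, 1, 1, 2, repeating --
+ * every source column lands in exactly one box. */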
+
+__inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+ uint32 sum;
+ int x;
+ assert(iboxwidth > 0);
+ sum = 0u;
+ for (x = 0; x < iboxwidth; ++x) {
+ sum += src_ptr[x];
+ }
+ return sum;
+}
+
+static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
+ const uint16* src_ptr, uint8* dst_ptr) {
+ int scaletbl[2];
+ int minboxwidth = (dx >> 16);
+ scaletbl[0] = 65536 / (minboxwidth * boxheight);
+ scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
+ {
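+    /* boxwidth below is either minboxwidth or minboxwidth + 1, so biasing
+     * the table pointer lets scaleptr[boxwidth] select the matching
+     * 16.16 reciprocal without a branch. */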
+ int *scaleptr = scaletbl - minboxwidth;
+ int x = 0;
+ int i;
+ for (i = 0; i < dst_width; ++i) {
+ int ix = x >> 16;
+ int boxwidth;
+ x += dx;
+ boxwidth = (x >> 16) - ix;
+ *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+ }
+ }
+}
+
+static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
+ const uint16* src_ptr, uint8* dst_ptr) {
+ int boxwidth = (dx >> 16);
+ int scaleval = 65536 / (boxwidth * boxheight);
+ int x = 0;
+ int i;
+ for (i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+ x += boxwidth;
+ }
+}
+
+/**
+ * Scale plane down to any dimensions, with interpolation
+ * (box filter).
+ *
+ * Uses the same fixed-point (16.16) stepping as SimpleScale to
+ * walk the source, producing each destination pixel by averaging
+ * a box of source pixels.
+ */
+static void ScalePlaneBox(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ int dx, dy;
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ dy = (src_height << 16) / dst_height;
+ dx = (src_width << 16) / dst_width;
+ if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
+ dst_height * 2 > src_height) {
+ uint8* dst = dst_ptr;
+ int y = 0;
+ int j;
+ for (j = 0; j < dst_height; ++j) {
+ int iy = y >> 16;
+ const uint8* const src = src_ptr + iy * src_stride;
+ int boxheight;
+ y += dy;
+ if (y > (src_height << 16)) {
+ y = (src_height << 16);
+ }
+ boxheight = (y >> 16) - iy;
+ ScalePlaneBoxRow(dst_width, boxheight,
+ dx, src_stride,
+ src, dst);
+
+ dst += dst_stride;
+ }
+ } else {
+ ALIGN16(uint16 row[kMaxInputWidth]);
+ void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
+ uint16* dst_ptr, int src_width, int src_height);
+ void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
+ const uint16* src_ptr, uint8* dst_ptr);
+#if defined(HAS_SCALEADDROWS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
+ IS_ALIGNED(src_width, 16)) {
+ ScaleAddRows = ScaleAddRows_SSE2;
+ } else
+#endif
+ {
+ ScaleAddRows = ScaleAddRows_C;
+ }
+ if (dx & 0xffff) {
+ ScaleAddCols = ScaleAddCols2_C;
+ } else {
+ ScaleAddCols = ScaleAddCols1_C;
+ }
+
+ {
+ int y = 0;
+ int j;
+ for (j = 0; j < dst_height; ++j) {
+ int iy = y >> 16;
+ const uint8* const src = src_ptr + iy * src_stride;
+ int boxheight;
+ y += dy;
+ if (y > (src_height << 16)) {
+ y = (src_height << 16);
+ }
+ boxheight = (y >> 16) - iy;
+ ScaleAddRows(src, src_stride, row, src_width, boxheight);
+ ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
+ dst_ptr += dst_stride;
+ }
+ }
+ }
+}
+
+/**
+ * Scale plane to/from any dimensions, with interpolation.
+ */
+static void ScalePlaneBilinearSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ int i, j;
+ uint8* dst = dst_ptr;
+ int dx = (src_width << 16) / dst_width;
+ int dy = (src_height << 16) / dst_height;
+ int maxx = ((src_width - 1) << 16) - 1;
+ int maxy = ((src_height - 1) << 16) - 1;
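+  /* 32768 is 0.5 in 16.16 fixed point: start half a pixel in when
+   * reducing (or half a step back when enlarging) so that sampling is
+   * centered; negative positions are clamped to 0 below. */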
+ int y = (dst_height < src_height) ? 32768 :
+ (src_height << 16) / dst_height - 32768;
+ for (i = 0; i < dst_height; ++i) {
+ int cy = (y < 0) ? 0 : y;
+ int yi = cy >> 16;
+ int yf = cy & 0xffff;
+ const uint8* const src = src_ptr + yi * src_stride;
+ int x = (dst_width < src_width) ? 32768 :
+ (src_width << 16) / dst_width - 32768;
+ for (j = 0; j < dst_width; ++j) {
+ int cx = (x < 0) ? 0 : x;
+ int xi = cx >> 16;
+ int xf = cx & 0xffff;
+ int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
+ int r1 = (src[xi + src_stride] * (65536 - xf) +
+ src[xi + src_stride + 1] * xf) >> 16;
+ *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
+ x += dx;
+ if (x > maxx)
+ x = maxx;
+ }
+ dst += dst_stride - dst_width;
+ y += dy;
+ if (y > maxy)
+ y = maxy;
+ }
+}
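+
+/* The pair of lerps above is standard bilinear weighting.  With
+ * horizontal fraction xf and vertical fraction yf (1.0 == 65536):
+ *   r0  = src[y    ][x] * (1 - xf) + src[y    ][x + 1] * xf
+ *   r1  = src[y + 1][x] * (1 - xf) + src[y + 1][x + 1] * xf
+ *   out = r0 * (1 - yf) + r1 * yf */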
+
+/**
+ * Scale plane to/from any dimensions, with bilinear
+ * interpolation.
+ */
+static void ScalePlaneBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ int dy;
+ int dx;
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ dy = (src_height << 16) / dst_height;
+ dx = (src_width << 16) / dst_width;
+ if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
+ ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+
+ } else {
+ ALIGN16(uint8 row[kMaxInputWidth + 1]);
+ void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
+ int src_stride,
+ int dst_width, int source_y_fraction);
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int dx);
+#if defined(HAS_SCALEFILTERROWS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
+ IS_ALIGNED(src_width, 16)) {
+ ScaleFilterRows = ScaleFilterRows_SSSE3;
+ } else
+#endif
+#if defined(HAS_SCALEFILTERROWS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
+ IS_ALIGNED(src_width, 16)) {
+ ScaleFilterRows = ScaleFilterRows_SSE2;
+ } else
+#endif
+ {
+ ScaleFilterRows = ScaleFilterRows_C;
+ }
+ ScaleFilterCols = ScaleFilterCols_C;
+
+ {
+ int y = 0;
+      int maxy = ((src_height - 1) << 16) - 1;  // clamp so the filter reads at most the last 2 rows.
+ int j;
+ for (j = 0; j < dst_height; ++j) {
+ int iy = y >> 16;
+ int fy = (y >> 8) & 255;
+ const uint8* const src = src_ptr + iy * src_stride;
+ ScaleFilterRows(row, src, src_stride, src_width, fy);
+ ScaleFilterCols(dst_ptr, row, dst_width, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > maxy) {
+ y = maxy;
+ }
+ }
+ }
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions, without interpolation.
+ * Fixed-point math is used for performance: the upper 16 bits
+ * of x and dx are the integer part of the source position and
+ * the lower 16 bits are the fractional part.
+ */
+static void ScalePlaneSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ uint8* dst = dst_ptr;
+ int dx = (src_width << 16) / dst_width;
+ int y;
+ for (y = 0; y < dst_height; ++y) {
+ const uint8* const src = src_ptr + (y * src_height / dst_height) *
+ src_stride;
+ // TODO(fbarchard): Round X coordinate by setting x=0x8000.
+ int x = 0;
+ int i;
+ for (i = 0; i < dst_width; ++i) {
+ *dst++ = src[x >> 16];
+ x += dx;
+ }
+ dst += dst_stride - dst_width;
+ }
+}
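+
+/* Example of the nearest sampling above: src_width = 320, dst_width = 200
+ * gives dx = (320 << 16) / 200 = 0x19999 (1.6), so x >> 16 visits source
+ * columns 0, 1, 3, 4, 6, ... -- pixels are dropped, never blended. */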
+
+/**
+ * Scale plane to/from any dimensions.
+ */
+static void ScalePlaneAnySize(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ if (!filtering) {
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ } else {
+ // fall back to non-optimized version
+ ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ }
+}
+
+/**
+ * Scale plane down, any size
+ *
+ * This is an optimized version for scaling down a plane to any size.
+ * The current implementation is ~10 times faster than the
+ * reference implementation for, e.g., XGA->LowResPAL.
+ *
+ */
+static void ScalePlaneDown(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ if (!filtering) {
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
+ // between 1/2x and 1x use bilinear
+ ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ } else {
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ }
+}
+
+/**
+ * Copy plane, no scaling
+ *
+ * This simply copies the given plane without scaling.
+ * The current implementation is ~115 times faster
+ * than the reference implementation.
+ *
+ */
+static void CopyPlane(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ if (src_stride == src_width && dst_stride == dst_width) {
+ // All contiguous, so can use REALLY fast path.
+ memcpy(dst_ptr, src_ptr, src_width * src_height);
+ } else {
+ // Not all contiguous; must copy scanlines individually
+ const uint8* src = src_ptr;
+ uint8* dst = dst_ptr;
+ int i;
+ for (i = 0; i < src_height; ++i) {
+ memcpy(dst, src, src_width);
+ dst += dst_stride;
+ src += src_stride;
+ }
+ }
+}
+
+static void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ FilterMode filtering, int use_ref) {
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
+ } else if (dst_width <= src_width && dst_height <= src_height) {
+ // Scale down.
+ if (use_ref) {
+ // For testing, allow the optimized versions to be disabled.
+ ScalePlaneDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ } else if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+  } else if (8 * dst_width == 3 * src_width &&
+             dst_height == ((src_height * 3 + 7) / 8)) {
+    // optimized, 3/8; height rounded up for odd-sized chroma.
+ ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
+ // optimized, 1/4
+ ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
+ // optimized, 1/8
+ ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ } else {
+ // Arbitrary downsample
+ ScalePlaneDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ }
+ } else {
+ // Arbitrary scale up and/or down.
+ ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ }
+}
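+
+/* Dispatch summary for the tests above (src -> dst):
+ *   same size              -> CopyPlane
+ *   3/4 in both dimensions -> ScalePlaneDown34
+ *   1/2                    -> ScalePlaneDown2
+ *   3/8 (height rounded)   -> ScalePlaneDown38
+ *   1/4                    -> ScalePlaneDown4
+ *   1/8                    -> ScalePlaneDown8
+ *   other downscales       -> ScalePlaneDown
+ *   any upscale            -> ScalePlaneAnySize */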
+
+/**
+ * Scale an I420 frame.
+ *
+ * Dispatches each of the three planes to a scaling function
+ * suited to the requested resolutions.
+ */
+
+int I420Scale(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ int halfheight;
+ src_height = -src_height;
+ halfheight = (src_height + 1) >> 1;
+ src_y = src_y + (src_height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ {
+ int src_halfwidth = (src_width + 1) >> 1;
+ int src_halfheight = (src_height + 1) >> 1;
+ int dst_halfwidth = (dst_width + 1) >> 1;
+ int dst_halfheight = (dst_height + 1) >> 1;
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height,
+ dst_y, dst_stride_y, dst_width, dst_height,
+ filtering, use_reference_impl_);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering, use_reference_impl_);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering, use_reference_impl_);
+ }
+ return 0;
+}
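+
+/* Minimal caller sketch (hypothetical buffers; illustration only):
+ * shrink a 640x480 I420 frame to 320x240 with box filtering.  Strides
+ * here equal the plane widths; real callers may pad. */
+#if 0
+  uint8 src[640 * 480 * 3 / 2];
+  uint8 dst[320 * 240 * 3 / 2];
+  I420Scale(src, 640,                      /* Y plane, stride */
+            src + 640 * 480, 320,          /* U */
+            src + 640 * 480 * 5 / 4, 320,  /* V */
+            640, 480,
+            dst, 320,
+            dst + 320 * 240, 160,
+            dst + 320 * 240 * 5 / 4, 160,
+            320, 240, kFilterBox);
+#endif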
+
+// Deprecated API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+ int src_stride_y, int src_stride_u, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, uint8* dst_u, uint8* dst_v,
+ int dst_stride_y, int dst_stride_u, int dst_stride_v,
+ int dst_width, int dst_height,
+ int interpolate) {
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ int halfheight;
+ src_height = -src_height;
+ halfheight = (src_height + 1) >> 1;
+ src_y = src_y + (src_height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ {
+ int src_halfwidth = (src_width + 1) >> 1;
+ int src_halfheight = (src_height + 1) >> 1;
+ int dst_halfwidth = (dst_width + 1) >> 1;
+ int dst_halfheight = (dst_height + 1) >> 1;
+ FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height,
+ dst_y, dst_stride_y, dst_width, dst_height,
+ filtering, use_reference_impl_);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering, use_reference_impl_);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering, use_reference_impl_);
+ }
+ return 0;
+}
+
+// Deprecated API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+ uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+ int interpolate) {
+ if (!src || src_width <= 0 || src_height <= 0 ||
+ !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
+ dst_yoffset >= dst_height) {
+ return -1;
+ }
+  dst_yoffset = dst_yoffset & ~1;  // chroma requires the offset to be a multiple of 2.
+ {
+ int src_halfwidth = (src_width + 1) >> 1;
+ int src_halfheight = (src_height + 1) >> 1;
+ int dst_halfwidth = (dst_width + 1) >> 1;
+ int dst_halfheight = (dst_height + 1) >> 1;
+ int aheight = dst_height - dst_yoffset * 2; // actual output height
+ const uint8* const src_y = src;
+ const uint8* const src_u = src + src_width * src_height;
+ const uint8* const src_v = src + src_width * src_height +
+ src_halfwidth * src_halfheight;
+ uint8* dst_y = dst + dst_yoffset * dst_width;
+ uint8* dst_u = dst + dst_width * dst_height +
+ (dst_yoffset >> 1) * dst_halfwidth;
+ uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+ (dst_yoffset >> 1) * dst_halfwidth;
+ return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
+ src_width, src_height, dst_y, dst_u, dst_v, dst_width,
+ dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
+ }
+}
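+
+/* Offset math above: dst_yoffset luma rows of padding are left above and
+ * below the scaled image (aheight = dst_height - 2 * dst_yoffset), and
+ * the chroma planes shift by dst_yoffset / 2 rows -- hence the & ~1
+ * rounding so that the chroma offset stays integral. */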
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/usage.dox b/usage.dox
index 0db080b00..9370e428f 100644
--- a/usage.dox
+++ b/usage.dox
@@ -82,6 +82,7 @@
The available initialization methods are:
\if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif
+ \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif
\if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif
diff --git a/usage_cx.dox b/usage_cx.dox
index 980a03461..62f3e450b 100644
--- a/usage_cx.dox
+++ b/usage_cx.dox
@@ -1,6 +1,6 @@
/*! \page usage_encode Encode
- The vpx_codec_encode() function is at the core of the decode loop. It
+ The vpx_codec_encode() function is at the core of the encode loop. It
processes raw images passed by the application, producing packets of
compressed data. The <code>deadline</code> parameter controls the amount
of time in microseconds the encoder should spend working on the frame. For
@@ -10,5 +10,4 @@
\ref samples
-
*/
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index b5f194d3d..89a2be825 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -11,7 +11,6 @@
#include "vpx_config.h"
#include "vpx_ports/arm.h"
-#include "vp8/common/g_common.h"
#include "vp8/common/pragmas.h"
#include "vp8/common/subpixel.h"
#include "vp8/common/loopfilter.h"
@@ -46,7 +45,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
@@ -64,6 +62,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6;
rtcd->recon.intra4x4_predict = vp8_intra4x4_predict_armv6;
+
+ rtcd->dequant.block = vp8_dequantize_b_v6;
+ rtcd->dequant.idct_add = vp8_dequant_idct_add_v6;
+ rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
+ rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
+
}
#endif
@@ -80,7 +84,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
@@ -99,6 +102,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
vp8_build_intra_predictors_mby_neon;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s_neon;
+
+ rtcd->dequant.block = vp8_dequantize_b_neon;
+ rtcd->dequant.idct_add = vp8_dequant_idct_add_neon;
+ rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
+ rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
+
}
#endif
diff --git a/vp8/decoder/arm/armv6/dequant_idct_v6.asm b/vp8/common/arm/armv6/dequant_idct_v6.asm
index 2510ad838..2510ad838 100644
--- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm
+++ b/vp8/common/arm/armv6/dequant_idct_v6.asm
diff --git a/vp8/decoder/arm/armv6/dequantize_v6.asm b/vp8/common/arm/armv6/dequantize_v6.asm
index 72f7e0ee5..72f7e0ee5 100644
--- a/vp8/decoder/arm/armv6/dequantize_v6.asm
+++ b/vp8/common/arm/armv6/dequantize_v6.asm
diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/common/arm/armv6/idct_blk_v6.c
index 686bb737f..9108929f5 100644
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ b/vp8/common/arm/armv6/idct_blk_v6.c
@@ -10,50 +10,9 @@
#include "vpx_config.h"
#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"
-void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
- unsigned char *dst, int stride,
- char *eobs, short *dc)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]);
- else if (eobs[0] == 1)
- vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride);
-
- if (eobs[1] > 1)
- {
- vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]);
- }
- else if (eobs[1] == 1)
- vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride);
-
- if (eobs[2] > 1)
- {
- vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]);
- }
- else if (eobs[2] == 1)
- vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride);
-
- if (eobs[3] > 1)
- {
- vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]);
- }
- else if (eobs[3] == 1)
- vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride);
-
- q += 64;
- dc += 4;
- dst += 4*stride;
- eobs += 4;
- }
-}
-
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
unsigned char *dst,
int stride, char *eobs)
diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm
index 463bff0f5..31ef09cad 100644
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -9,7 +9,6 @@
;
EXPORT |vp8_short_inv_walsh4x4_v6|
- EXPORT |vp8_short_inv_walsh4x4_1_v6|
ARM
REQUIRE8
@@ -17,19 +16,19 @@
AREA |.text|, CODE, READONLY ; name this block of code
-;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
+;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_v6| PROC
- stmdb sp!, {r4 - r11, lr}
+ stmdb sp!, {r4 - r12, lr}
- ldr r2, [r0], #4 ; [1 | 0]
- ldr r3, [r0], #4 ; [3 | 2]
- ldr r4, [r0], #4 ; [5 | 4]
- ldr r5, [r0], #4 ; [7 | 6]
- ldr r6, [r0], #4 ; [9 | 8]
- ldr r7, [r0], #4 ; [11 | 10]
- ldr r8, [r0], #4 ; [13 | 12]
- ldr r9, [r0] ; [15 | 14]
+ ldr r2, [r0, #0] ; [1 | 0]
+ ldr r3, [r0, #4] ; [3 | 2]
+ ldr r4, [r0, #8] ; [5 | 4]
+ ldr r5, [r0, #12] ; [7 | 6]
+ ldr r6, [r0, #16] ; [9 | 8]
+ ldr r7, [r0, #20] ; [11 | 10]
+ ldr r8, [r0, #24] ; [13 | 12]
+ ldr r9, [r0, #28] ; [15 | 14]
qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
@@ -69,24 +68,27 @@
qadd16 r4, r4, r10 ; [b2+3|c2+3]
qadd16 r5, r5, r10 ; [a2+3|d2+3]
- asr r12, r2, #3 ; [1 | x]
- pkhtb r12, r12, r3, asr #19; [1 | 0]
- lsl lr, r3, #16 ; [~3 | x]
- lsl r2, r2, #16 ; [~2 | x]
- asr lr, lr, #3 ; [3 | x]
- pkhtb lr, lr, r2, asr #19 ; [3 | 2]
-
- asr r2, r4, #3 ; [5 | x]
- pkhtb r2, r2, r5, asr #19 ; [5 | 4]
- lsl r3, r5, #16 ; [~7 | x]
- lsl r4, r4, #16 ; [~6 | x]
- asr r3, r3, #3 ; [7 | x]
- pkhtb r3, r3, r4, asr #19 ; [7 | 6]
-
- str r12, [r1], #4
- str lr, [r1], #4
- str r2, [r1], #4
- str r3, [r1], #4
+ asr r12, r3, #19 ; [0]
+ strh r12, [r1], #32
+ asr lr, r2, #19 ; [1]
+ strh lr, [r1], #32
+ sxth r2, r2
+ sxth r3, r3
+ asr r2, r2, #3 ; [2]
+ strh r2, [r1], #32
+ asr r3, r3, #3 ; [3]
+ strh r3, [r1], #32
+
+ asr r12, r5, #19 ; [4]
+ strh r12, [r1], #32
+ asr lr, r4, #19 ; [5]
+ strh lr, [r1], #32
+ sxth r4, r4
+ sxth r5, r5
+ asr r4, r4, #3 ; [6]
+ strh r4, [r1], #32
+ asr r5, r5, #3 ; [7]
+ strh r5, [r1], #32
qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
@@ -103,50 +105,32 @@
qadd16 r8, r8, r10 ; [b2+3|c2+3]
qadd16 r9, r9, r10 ; [a2+3|d2+3]
- asr r2, r6, #3 ; [9 | x]
- pkhtb r2, r2, r7, asr #19 ; [9 | 8]
- lsl r3, r7, #16 ; [~11| x]
- lsl r4, r6, #16 ; [~10| x]
- asr r3, r3, #3 ; [11 | x]
- pkhtb r3, r3, r4, asr #19 ; [11 | 10]
-
- asr r4, r8, #3 ; [13 | x]
- pkhtb r4, r4, r9, asr #19 ; [13 | 12]
- lsl r5, r9, #16 ; [~15| x]
- lsl r6, r8, #16 ; [~14| x]
- asr r5, r5, #3 ; [15 | x]
- pkhtb r5, r5, r6, asr #19 ; [15 | 14]
-
- str r2, [r1], #4
- str r3, [r1], #4
- str r4, [r1], #4
- str r5, [r1]
-
- ldmia sp!, {r4 - r11, pc}
+ asr r12, r7, #19 ; [8]
+ strh r12, [r1], #32
+ asr lr, r6, #19 ; [9]
+ strh lr, [r1], #32
+ sxth r6, r6
+ sxth r7, r7
+ asr r6, r6, #3 ; [10]
+ strh r6, [r1], #32
+ asr r7, r7, #3 ; [11]
+ strh r7, [r1], #32
+
+ asr r12, r9, #19 ; [12]
+ strh r12, [r1], #32
+ asr lr, r8, #19 ; [13]
+ strh lr, [r1], #32
+ sxth r8, r8
+ sxth r9, r9
+ asr r8, r8, #3 ; [14]
+ strh r8, [r1], #32
+ asr r9, r9, #3 ; [15]
+ strh r9, [r1], #32
+
+ ldmia sp!, {r4 - r12, pc}
ENDP ; |vp8_short_inv_walsh4x4_v6|
-;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_v6| PROC
-
- ldrsh r2, [r0] ; [0]
- add r2, r2, #3 ; [0] + 3
- asr r2, r2, #3 ; a1 ([0]+3) >> 3
- lsl r2, r2, #16 ; [a1 | x]
- orr r2, r2, r2, lsr #16 ; [a1 | a1]
-
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1]
-
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_v6|
-
; Constant Pool
c0x00030003 DCD 0x00030003
END
diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/common/arm/dequantize_arm.c
index 2918e0512..20a8ac4fc 100644
--- a/vp8/decoder/arm/dequantize_arm.c
+++ b/vp8/common/arm/dequantize_arm.c
@@ -10,9 +10,8 @@
#include "vpx_config.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"
#include "vp8/common/idct.h"
-#include "vpx_mem/vpx_mem.h"
#if HAVE_ARMV7
extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/common/arm/dequantize_arm.h
index c020c8530..0b4d8fe89 100644
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/common/arm/dequantize_arm.h
@@ -15,8 +15,6 @@
#if HAVE_ARMV6
extern prototype_dequant_block(vp8_dequantize_b_v6);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
@@ -24,19 +22,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_v6
-#undef vp8_dequant_idct_add
+#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
-#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
-
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6
-
-#undef vp8_dequant_idct_add_y_block
+#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
-#undef vp8_dequant_idct_add_uv_block
+#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
#endif
#endif
@@ -44,8 +36,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
#if HAVE_ARMV7
extern prototype_dequant_block(vp8_dequantize_b_neon);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
@@ -54,19 +44,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_neon
-#undef vp8_dequant_idct_add
+#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
-#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
-
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon
-
-#undef vp8_dequant_idct_add_y_block
+#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
-#undef vp8_dequant_idct_add_uv_block
+#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
#endif
diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h
index c710c2eb0..68c0cad11 100644
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
-
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
#endif
@@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
-
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
#endif
diff --git a/vp8/decoder/arm/neon/dequant_idct_neon.asm b/vp8/common/arm/neon/dequant_idct_neon.asm
index 602cce676..602cce676 100644
--- a/vp8/decoder/arm/neon/dequant_idct_neon.asm
+++ b/vp8/common/arm/neon/dequant_idct_neon.asm
diff --git a/vp8/decoder/arm/neon/dequantizeb_neon.asm b/vp8/common/arm/neon/dequantizeb_neon.asm
index c8e0c31f2..c8e0c31f2 100644
--- a/vp8/decoder/arm/neon/dequantizeb_neon.asm
+++ b/vp8/common/arm/neon/dequantizeb_neon.asm
diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/common/arm/neon/idct_blk_neon.c
index 086293114..cc55843d5 100644
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/common/arm/neon/idct_blk_neon.c
@@ -10,51 +10,16 @@
#include "vpx_config.h"
#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"
/* place these declarations here because we don't want to maintain them
* outside of this scope
*/
-void idct_dequant_dc_full_2x_neon(short *input, short *dq,
- unsigned char *dst,
- int stride, short *dc);
-void idct_dequant_dc_0_2x_neon(short *input, short *dq,
- unsigned char *dst,
- int stride, short *dc);
void idct_dequant_full_2x_neon(short *q, short *dq,
unsigned char *dst, int stride);
void idct_dequant_0_2x_neon(short *q, short dq,
unsigned char *dst, int stride);
-void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,
- unsigned char *dst,
- int stride, char *eobs, short *dc)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (((short *)(eobs))[0])
- {
- if (((short *)eobs)[0] & 0xfefe)
- idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc);
- else
- idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc);
- }
-
- if (((short *)(eobs))[1])
- {
- if (((short *)eobs)[1] & 0xfefe)
- idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2);
- else
- idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2);
- }
- q += 64;
- dc += 4;
- dst += 4*stride;
- eobs += 4;
- }
-}
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
unsigned char *dst,
diff --git a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
index 6c29c5586..6c29c5586 100644
--- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
diff --git a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
index d5dce63f6..d5dce63f6 100644
--- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm
index 01c79d937..e8ea2a619 100644
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ b/vp8/common/arm/neon/iwalsh_neon.asm
@@ -8,7 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_neon|
- EXPORT |vp8_short_inv_walsh4x4_1_neon|
ARM
REQUIRE8
@@ -16,7 +15,7 @@
AREA |.text|, CODE, READONLY ; name this block of code
-;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
+;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
@@ -59,22 +58,30 @@
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
- vst4.i16 {d0,d1,d2,d3}, [r1@128]
+ mov r2, #64
+ add r3, r1, #32
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_neon|
+ vst1.i16 d0[0], [r1],r2
+ vst1.i16 d1[0], [r3],r2
+ vst1.i16 d2[0], [r1],r2
+ vst1.i16 d3[0], [r3],r2
+
+ vst1.i16 d0[1], [r1],r2
+ vst1.i16 d1[1], [r3],r2
+ vst1.i16 d2[1], [r1],r2
+ vst1.i16 d3[1], [r3],r2
+ vst1.i16 d0[2], [r1],r2
+ vst1.i16 d1[2], [r3],r2
+ vst1.i16 d2[2], [r1],r2
+ vst1.i16 d3[2], [r3],r2
+
+ vst1.i16 d0[3], [r1],r2
+ vst1.i16 d1[3], [r3],r2
+ vst1.i16 d2[3], [r1]
+ vst1.i16 d3[3], [r3]
-;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_neon| PROC
- ldrsh r2, [r0] ; load input[0]
- add r3, r2, #3 ; add 3
- add r2, r1, #16 ; base for last 8 output
- asr r0, r3, #3 ; right shift 3
- vdup.16 q0, r0 ; load and duplicate
- vst1.16 {q0}, [r1@128] ; write back 8
- vst1.16 {q0}, [r2@128] ; write back last 8
bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_neon|
+ ENDP ; |vp8_short_inv_walsh4x4_neon|
END
diff --git a/vp8/common/bigend.h b/vp8/common/bigend.h
deleted file mode 100644
index 6ac3f8b5a..000000000
--- a/vp8/common/bigend.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef _bigend_h
-#define _bigend_h
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#define invert2(x) ( (((x)>>8)&0x00ff) | (((x)<<8)&0xff00) )
-#define invert4(x) ( ((invert2(x)&0x0000ffff)<<16) | (invert2((x>>16))&0x0000ffff) )
-
-#define high_byte(x) (unsigned char)x
-#define mid2Byte(x) (unsigned char)(x >> 8)
-#define mid1Byte(x) (unsigned char)(x >> 16)
-#define low_byte(x) (unsigned char)(x >> 24)
-
-#define SWAPENDS 1
-
-#if defined(__cplusplus)
-}
-#endif
-#endif
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index a90c1c0b6..99b731c78 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -21,9 +21,6 @@ void vpx_log(const char *format, ...);
#include "subpixel.h"
#include "vpx_ports/mem.h"
-#define TRUE 1
-#define FALSE 0
-
/*#define DCPRED 1*/
#define DCPREDSIMTHRESH 0
#define DCPREDCNTTHRESH 3
@@ -170,6 +167,18 @@ typedef struct
union b_mode_info bmi[16];
} MODE_INFO;
+#if CONFIG_MULTI_RES_ENCODING
+/* Information that must be stored for the higher-resolution encoder */
+typedef struct
+{
+ MB_PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame;
+ int_mv mv;
+ //union b_mode_info bmi[16];
+ int dissim; // dissimilarity level of the macroblock
+} LOWER_RES_INFO;
+#endif
+
typedef struct
{
short *qcoeff;
diff --git a/vp8/common/common.h b/vp8/common/common.h
index 9a93da991..2cc1c544c 100644
--- a/vp8/common/common.h
+++ b/vp8/common/common.h
@@ -18,8 +18,6 @@
#include "vpx_mem/vpx_mem.h"
-#include "common_types.h"
-
/* Only need this for fixed-size arrays, for structs just assign. */
#define vp8_copy( Dest, Src) { \
diff --git a/vp8/decoder/dequantize.c b/vp8/common/dequantize.c
index 0861965eb..4a48a3192 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/common/dequantize.c
@@ -42,22 +42,3 @@ void vp8_dequant_idct_add_c(short *input, short *dq,
vpx_memset(input, 0, 32);
}
-
-void vp8_dequant_dc_idct_add_c(short *input, short *dq,
- unsigned char *dest, int stride,
- int Dc)
-{
- int i;
-
- input[0] = (short)Dc;
-
- for (i = 1; i < 16; i++)
- {
- input[i] = dq[i] * input[i];
- }
-
- vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
-
- vpx_memset(input, 0, 32);
-
-}
diff --git a/vp8/decoder/dequantize.h b/vp8/common/dequantize.h
index 019b7f6d1..f66cf2bac 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/common/dequantize.h
@@ -21,17 +21,6 @@
unsigned char *output, \
int stride)
-#define prototype_dequant_dc_idct_add(sym) \
- void sym(short *input, short *dq, \
- unsigned char *dst, \
- int stride, \
- int dc)
-
-#define prototype_dequant_dc_idct_add_y_block(sym) \
- void sym(short *q, short *dq, \
- unsigned char *dst, \
- int stride, char *eobs, short *dc)
-
#define prototype_dequant_idct_add_y_block(sym) \
void sym(short *q, short *dq, \
unsigned char *dst, \
@@ -60,16 +49,6 @@ extern prototype_dequant_block(vp8_dequant_block);
#endif
extern prototype_dequant_idct_add(vp8_dequant_idct_add);
-#ifndef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
-#endif
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
-
-#ifndef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c
-#endif
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block);
-
#ifndef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
#endif
@@ -85,10 +64,6 @@ typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
-typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
-
-typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t));
-
typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
@@ -97,8 +72,6 @@ typedef struct
{
vp8_dequant_block_fn_t block;
vp8_dequant_idct_add_fn_t idct_add;
- vp8_dequant_dc_idct_add_fn_t dc_idct_add;
- vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
vp8_dequant_idct_add_y_block_fn_t idct_add_y_block;
vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block;
} vp8_dequant_rtcd_vtable_t;
diff --git a/vp8/common/dma_desc.h b/vp8/common/dma_desc.h
deleted file mode 100644
index b923da6e0..000000000
--- a/vp8/common/dma_desc.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef _dma_desc_h
-#define _dma_desc_h
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-
-#define NDSIZE_LG 0x00000900 // Next Descriptor Size
-#define NDSIZE_SM 0x00000800 // Next Descriptor Size
-#define NDSIZE_7 0x00000700 // Next Descriptor Size
-#define NDSIZE_6 0x00000600 // Next Descriptor Size
-#define NDSIZE_5 0x00000500 // Next Descriptor Size
-#define NDSIZE_4 0x00000400 // Next Descriptor Size
-#define NDSIZE_3 0x00000300 // Next Descriptor Size
-#define NDSIZE_2 0x00000200 // Next Descriptor Size
-#define NDSIZE_1 0x00000100 // Next Descriptor Size
-
-#define FLOW_STOP 0x0000
-#define FLOW_AUTO 0x1000
-#define FLOW_DESC_AR 0x4000
-#define FLOW_DESC_SM 0x6000
-#define FLOW_DESC_LG 0x7000
-
- typedef struct
- {
- unsigned int ndp;
- //unsigned short ndpl;
- //unsigned short ndph;
- unsigned int sa;
- //unsigned short sal;
- //unsigned short sah;
-
- unsigned short dmacfg;
- unsigned short xcnt;
- unsigned short xmod;
- unsigned short ycnt;
- unsigned short ymod;
-
- } LARGE_DESC;
-
- typedef struct
- {
- unsigned short ndpl;
- unsigned short sal;
- unsigned short sah;
- unsigned short dmacfg;
- unsigned short xcnt;
- unsigned short xmod;
- unsigned short ycnt;
- unsigned short ymod;
- } SMALL_DESC;
-
- typedef struct
- {
- unsigned short sal;
- unsigned short sah;
- unsigned short dmacfg;
- unsigned short xcnt;
- unsigned short xmod;
- unsigned short ycnt;
- unsigned short ymod;
- } ARRAY_DESC_7;
-
- typedef struct
- {
- unsigned short sal;
- unsigned short sah;
- unsigned short dmacfg;
- unsigned short xcnt;
- unsigned short xmod;
- unsigned short ycnt;
- } ARRAY_DESC_6;
-
- typedef struct
- {
- unsigned short sal;
- unsigned short sah;
- unsigned short dmacfg;
- unsigned short xcnt;
- unsigned short xmod;
- } ARRAY_DESC_5;
-
- typedef struct
- {
- unsigned short sal;
- unsigned short sah;
- unsigned short dmacfg;
- unsigned short xcnt;
- } ARRAY_DESC_4;
-
- typedef struct
- {
- unsigned short sal;
- unsigned short sah;
- unsigned short dmacfg;
- } ARRAY_DESC_3;
-
- typedef struct
- {
- unsigned short sal;
- unsigned short sah;
- } ARRAY_DESC_2;
-
- typedef struct
- {
- unsigned short sal;
- } ARRAY_DESC_1;
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif //_dma_desc_h
diff --git a/vp8/common/duck_io.h b/vp8/common/duck_io.h
deleted file mode 100644
index 43daa65bc..000000000
--- a/vp8/common/duck_io.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef _duck_io_h
-#define _duck_io_h
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#if defined (_WIN32)
- typedef __int64 int64_t;
-#elif defined(__MWERKS__)
- typedef long long int64_t;
-#elif defined(__APPLE__) || defined(__POWERPC)
-#include <ppc/types.h>
-#else
- typedef long long int64_t;
-#endif
-
- typedef struct
- {
- int64_t offset; // offset to start from
- int blocking; // non-zero for blocking
- } re_open_t;
-
-
- typedef enum
- {
- SAL_ERR_MAX = -10,
- SAL_ERROR = -11, // Default error
- SAL_ERR_WSASTARTUP = -12,
- SAL_ERR_SOCKET_CREATE = -13,
- SAL_ERR_RESOLVING_HOSTNAME = -14,
- SAL_ERR_SERVER_CONNECTION = -15,
- SAL_ERR_SENDING_DATA = -16,
- SAL_ERR_RECEIVING_DATA = -17,
- SAL_ERR_404_FILE_NOT_FOUND = -18,
- SAL_ERR_PARSING_HTTP_HEADER = -19,
- SAL_ERR_PARSING_CONTENT_LEN = -20,
- SAL_ERR_CONNECTION_TIMEOUT = -21,
- SAL_ERR_FILE_OPEN_FAILED = -22,
- SAL_ERR_MIN = -23
- } SAL_ERR; /* EMH 1-15-03 */
-
-
- typedef struct sal_err_map_temp
- {
- SAL_ERR code;
- const char *decode;
-
- } sal_err_map_t;
-
-
- static char *sal_err_text(SAL_ERR e)
- {
- int t;
- const sal_err_map_t g_sal_err_map[] =
- {
- { SAL_ERR_WSASTARTUP, "Error with WSAStartup" },
- { SAL_ERR_SOCKET_CREATE, "Error creating socket" },
- { SAL_ERR_RESOLVING_HOSTNAME, "Error resolving hostname" },
- { SAL_ERR_SERVER_CONNECTION, "Error connecting to server" },
- { SAL_ERR_SENDING_DATA, "Error sending data" },
- { SAL_ERR_RECEIVING_DATA, "Error receiving data" },
- { SAL_ERR_404_FILE_NOT_FOUND, "Error file not found " },
- { SAL_ERR_PARSING_HTTP_HEADER, "Error parsing http header" },
- { SAL_ERR_PARSING_CONTENT_LEN, "Error parsing content length" },
- { SAL_ERR_CONNECTION_TIMEOUT, "Error Connection timed out" },
- { SAL_ERR_FILE_OPEN_FAILED, "Error opening file" }
- };
-
- for (t = 0; t < sizeof(g_sal_err_map) / sizeof(sal_err_map_t); t++)
- {
- if (e == g_sal_err_map[t].code)
- return (char *) g_sal_err_map[t].decode;
- }
-
- return 0;
- }
-
-
-
-
-
-
-
- int duck_open(const char *fname, unsigned long user_data);
-
- void duck_close(int ghndl);
-
- int duck_read(int ghndl, unsigned char *buf, int nbytes);
-
- int64_t duck_seek(int g_hndl, int64_t offs, int origin);
-
- int duck_read_finished(int han, int flag); /* FWG 7-9-99 */
-
- int duck_name(int handle, char name[], size_t max_len); /* EMH 9-23-03 */
-
- int duck_read_blocking(int handle, unsigned char *buffer, int bytes); /* EMH 9-23-03 */
-
- int64_t duck_available_data(int handle); /* EMH 10-23-03 */
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h
index 01909b937..a3443d765 100644
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -60,10 +60,10 @@ static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
int mb_to_bottom_edge)
{
unsigned int need_to_clamp;
- need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0;
- need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0;
- need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 1 : 0;
- need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0;
+ need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
+ need_to_clamp |= (mv->as_mv.col > mb_to_right_edge);
+ need_to_clamp |= (mv->as_mv.row < mb_to_top_edge);
+ need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge);
return need_to_clamp;
}
diff --git a/vp8/common/g_common.h b/vp8/common/g_common.h
deleted file mode 100644
index 5f523980b..000000000
--- a/vp8/common/g_common.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-extern void (*vp8_clear_system_state)(void);
-extern void (*vp8_plane_add_noise)(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int DPitch, int q);
-extern void (*de_interlace)
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int Width,
- int Height,
- int Stride
-);
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 9641d8c1e..01d76206d 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -10,7 +10,6 @@
#include "vpx_config.h"
-#include "vp8/common/g_common.h"
#include "vp8/common/subpixel.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/recon.h"
@@ -70,6 +69,14 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
#if CONFIG_RUNTIME_CPU_DETECT
VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
+
+ rtcd->dequant.block = vp8_dequantize_b_c;
+ rtcd->dequant.idct_add = vp8_dequant_idct_add_c;
+ rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c;
+ rtcd->dequant.idct_add_uv_block =
+ vp8_dequant_idct_add_uv_block_c;
+
+
rtcd->idct.idct16 = vp8_short_idct4x4llm_c;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c;
diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index 411a1b472..7371f85ff 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -37,6 +37,10 @@
#define vp8_idct_idct16 vp8_short_idct4x4llm_c
#endif
extern prototype_idct(vp8_idct_idct16);
+/* add this prototype to prevent compiler warning about implicit
+ * declaration of vp8_short_idct4x4llm_c function in dequantize.c
+ * when building, for example, neon optimized version */
+extern prototype_idct(vp8_short_idct4x4llm_c);
#ifndef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c
diff --git a/vp8/decoder/idct_blk.c b/vp8/common/idct_blk.c
index 1c16b92a9..249fad4ea 100644
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/common/idct_blk.c
@@ -12,39 +12,12 @@
#include "vp8/common/idct.h"
#include "dequantize.h"
-void vp8_dequant_dc_idct_add_c(short *input, short *dq,
- unsigned char *dest, int stride,
- int Dc);
void vp8_dequant_idct_add_c(short *input, short *dq,
unsigned char *dest, int stride);
void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
int pred_stride, unsigned char *dst_ptr,
int dst_stride);
-void vp8_dequant_dc_idct_add_y_block_c
- (short *q, short *dq,
- unsigned char *dst, int stride, char *eobs, short *dc)
-{
- int i, j;
-
- for (i = 0; i < 4; i++)
- {
- for (j = 0; j < 4; j++)
- {
- if (*eobs++ > 1)
- vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]);
- else
- vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride);
-
- q += 16;
- dst += 4;
- dc ++;
- }
-
- dst += 4*stride - 16;
- }
-}
-
void vp8_dequant_idct_add_y_block_c
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index 49496abef..47af52f04 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -137,8 +137,9 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
}
-void vp8_short_inv_walsh4x4_c(short *input, short *output)
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
{
+ short output[16];
int i;
int a1, b1, c1, d1;
int a2, b2, c2, d2;
@@ -183,22 +184,21 @@ void vp8_short_inv_walsh4x4_c(short *input, short *output)
ip += 4;
op += 4;
}
+
+ for(i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = output[i];
+ }
}
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output)
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
{
int i;
int a1;
- short *op = output;
a1 = ((input[0] + 3) >> 3);
-
- for (i = 0; i < 4; i++)
+ for(i = 0; i < 16; i++)
{
- op[0] = a1;
- op[1] = a1;
- op[2] = a1;
- op[3] = a1;
- op += 4;
+ mb_dqcoeff[i * 16] = a1;
}
}
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
deleted file mode 100644
index 478cb329f..000000000
--- a/vp8/common/invtrans.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "invtrans.h"
-
-
-void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
- int pitch)
-{
- if (*b->eob > 1)
- {
- IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch,
- *(b->base_dst) + b->dst, b->dst_stride);
- }
- else
- {
- IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch,
- *(b->base_dst) + b->dst, b->dst_stride);
- }
-
-}
-
-static void recon_dcblock(MACROBLOCKD *x)
-{
- BLOCKD *b = &x->block[24];
- int i;
-
- for (i = 0; i < 16; i++)
- {
- x->block[i].dqcoeff[0] = b->diff[i];
- }
-
-}
-
-void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- int i;
-
- if(x->mode_info_context->mbmi.mode != SPLITMV)
- {
- /* do 2nd order transform on the dc block */
- IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
-
- recon_dcblock(x);
- }
-
- for (i = 0; i < 16; i++)
- {
- vp8_inverse_transform_b(rtcd, &x->block[i], 16);
- }
-
-}
-void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- int i;
-
- for (i = 16; i < 24; i++)
- {
- vp8_inverse_transform_b(rtcd, &x->block[i], 8);
- }
-
-}
diff --git a/vp8/common/invtrans.h b/vp8/common/invtrans.h
index d14573b91..2bcbeeccf 100644
--- a/vp8/common/invtrans.h
+++ b/vp8/common/invtrans.h
@@ -15,9 +15,66 @@
#include "vpx_config.h"
#include "idct.h"
#include "blockd.h"
-extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);
-extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
-extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
-extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+#include "onyxc_int.h"
+#if CONFIG_MULTITHREAD
+#include "vpx_mem/vpx_mem.h"
+#endif
+
+static void eob_adjust(char *eobs, short *diff)
+{
+    /* EOB adjust: the idct can only be skipped if both the dc and the eob are zero */
+ int js;
+ for(js = 0; js < 16; js++)
+ {
+ if((eobs[js] == 0) && (diff[0] != 0))
+ eobs[js]++;
+ diff+=16;
+ }
+}
+
+static void vp8_inverse_transform_mby(MACROBLOCKD *xd,
+ const VP8_COMMON_RTCD *rtcd)
+{
+ short *DQC = xd->block[0].dequant;
+ /* save the dc dequant constant in case it is overridden */
+ short dc_dequant_temp = DQC[0];
+
+#if CONFIG_MULTITHREAD
+ DECLARE_ALIGNED(16, short, local_dequant[16]);
+#endif
+
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ IDCT_INVOKE(&rtcd->idct, iwalsh16)
+ (&xd->block[24].dqcoeff[0], xd->qcoeff);
+ }
+ else
+ {
+ IDCT_INVOKE(&rtcd->idct, iwalsh1)
+ (&xd->block[24].dqcoeff[0], xd->qcoeff);
+ }
+ eob_adjust(xd->eobs, xd->qcoeff);
+
+#if CONFIG_MULTITHREAD
+ DQC = local_dequant;
+
+ vpx_memcpy(DQC, xd->block[0].dequant,
+ sizeof(local_dequant));
+#endif
+
+ /* override the dc dequant constant */
+ DQC[0] = 1;
+ }
+ DEQUANT_INVOKE (&rtcd->dequant, idct_add_y_block)
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+
+ /* restore the dc dequant constant */
+ DQC[0] = dc_dequant_temp;
+}
#endif
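
The DQC[0] = 1 override works because the second-order inverse WHT has already written fully dequantized DC values into qcoeff; forcing the DC factor to 1 keeps idct_add_y_block from scaling them a second time, while the AC coefficients still pick up their normal factors. The CONFIG_MULTITHREAD copy exists because block[0].dequant is shared between threads, so the override must not be made in place. A rough sketch of the per-block dequant step this relies on (illustrative only, not the actual SIMD kernel):

    /* Assumed per-4x4-block dequant step: with DQC[0] == 1 the
     * already-dequantized DC passes through unchanged, while
     * indices 1..15 are scaled by the usual AC factor. */
    static void dequant_block_sketch(const short *q, const short *DQC,
                                     short *dq)
    {
        int j;
        for (j = 0; j < 16; j++)
            dq[j] = q[j] * DQC[j];
    }
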
diff --git a/vp8/common/littlend.h b/vp8/common/littlend.h
deleted file mode 100644
index 99df1164c..000000000
--- a/vp8/common/littlend.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef _littlend_h
-#define _littlend_h
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#define invert2(x) (x)
-#define invert4(x) (x)
-
-#define low_byte(x) (unsigned char)x
-#define mid1Byte(x) (unsigned char)(x >> 8)
-#define mid2Byte(x) (unsigned char)(x >> 16)
-#define high_byte(x) (unsigned char)(x >> 24)
-
-#define SWAPENDS 0
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 28cbaed98..d17a32b82 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -17,13 +17,14 @@ extern "C"
{
#endif
+#include "vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
#include "vpx_scale/yv12config.h"
-#include "type_aliases.h"
#include "ppflags.h"
- typedef int *VP8_PTR;
+
+ struct VP8_COMP;
/* Create/destroy static data structures. */
@@ -104,7 +105,7 @@ extern "C"
int Version; // 4 bitstream versions are defined: 0 is best quality/slowest decode, 3 is lowest quality/fastest decode
int Width; // width of data passed to the compressor
int Height; // height of data passed to the compressor
- double frame_rate; // set to passed in framerate
+ struct vpx_rational timebase;
int target_bandwidth; // bandwidth to be used in kilobits per second
int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0
@@ -207,32 +208,45 @@ extern "C"
unsigned int periodicity;
unsigned int layer_id[MAX_PERIODICITY];
+#if CONFIG_MULTI_RES_ENCODING
+ /* Total number of resolutions encoded */
+ unsigned int mr_total_resolutions;
+
+ /* Current encoder ID */
+ unsigned int mr_encoder_id;
+
+ /* Down-sampling factor */
+ vpx_rational_t mr_down_sampling_factor;
+
+ /* Memory location to store low-resolution encoder's mode info */
+ void* mr_low_res_mode_info;
+#endif
} VP8_CONFIG;
void vp8_initialize();
- VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf);
- void vp8_remove_compressor(VP8_PTR *comp);
+ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf);
+ void vp8_remove_compressor(struct VP8_COMP* *comp);
- void vp8_init_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
- void vp8_change_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
+ void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
+ void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
// receive a frame's worth of data. caller can assume that a copy of this frame is made
// and not just a copy of the pointer.
- int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
- int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
- int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
-
- int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
- int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);
- int vp8_get_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
- int vp8_set_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
- int vp8_update_entropy(VP8_PTR comp, int update);
- int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
- int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols);
- int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
- int vp8_get_quantizer(VP8_PTR c);
+ int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
+ int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
+ int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
+
+ int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags);
+ int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags);
+ int vp8_get_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ int vp8_set_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ int vp8_update_entropy(struct VP8_COMP* comp, int update);
+ int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
+ int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols);
+ int vp8_set_internal_size(struct VP8_COMP* comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+ int vp8_get_quantizer(struct VP8_COMP* c);
#ifdef __cplusplus
}
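
Swapping the old VP8_PTR typedef for a forward-declared struct VP8_COMP * turns an anything-goes handle into a type-checked opaque pointer while still hiding the struct layout from callers. A small illustration of the difference (the demo function and the cross-handle misuse are hypothetical):

    struct VP8_COMP;   /* layout stays private to the encoder  */
    struct VP8D_COMP;  /* decoder handle; see onyxd.h below    */

    void encoder_api(struct VP8_COMP *comp);

    void demo(struct VP8D_COMP *decoder)
    {
        /* encoder_api(decoder); -- mixing handles is now a compile error;
         * with the old integer-pointer typedef it converted silently. */
        (void)decoder;
    }
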
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 936fa9f23..f733ff774 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -22,6 +22,7 @@
#if CONFIG_POSTPROC
#include "postproc.h"
#endif
+#include "dequantize.h"
/*#ifdef PACKET_TESTING*/
#include "header.h"
@@ -73,6 +74,7 @@ typedef enum
typedef struct VP8_COMMON_RTCD
{
#if CONFIG_RUNTIME_CPU_DETECT
+ vp8_dequant_rtcd_vtable_t dequant;
vp8_idct_rtcd_vtable_t idct;
vp8_recon_rtcd_vtable_t recon;
vp8_subpix_rtcd_vtable_t subpix;
diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h
index 43fa00bd3..35a8b6e55 100644
--- a/vp8/common/onyxd.h
+++ b/vp8/common/onyxd.h
@@ -18,13 +18,13 @@
extern "C"
{
#endif
-#include "type_aliases.h"
#include "vpx_scale/yv12config.h"
#include "ppflags.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_codec.h"
- typedef void *VP8D_PTR;
+ struct VP8D_COMP;
+
typedef struct
{
int Width;
@@ -49,19 +49,19 @@ extern "C"
void vp8dx_initialize(void);
- void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x);
+ void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x);
- int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);
+ int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst);
- int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, int64_t time_stamp);
- int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
+ int vp8dx_receive_compressed_data(struct VP8D_COMP* comp, unsigned long size, const unsigned char *dest, int64_t time_stamp);
+ int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
- vpx_codec_err_t vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
- vpx_codec_err_t vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
- VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
+ struct VP8D_COMP* vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
- void vp8dx_remove_decompressor(VP8D_PTR comp);
+ void vp8dx_remove_decompressor(struct VP8D_COMP* comp);
#ifdef __cplusplus
}
diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c
index 1f5d79068..7046a63e8 100644
--- a/vp8/common/ppc/systemdependent.c
+++ b/vp8/common/ppc/systemdependent.c
@@ -9,7 +9,6 @@
*/
-#include "g_common.h"
#include "subpixel.h"
#include "loopfilter.h"
#include "recon.h"
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index 24c09a353..6c7af41d4 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -334,11 +334,12 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
/*encoder only*/
-void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ int dst_ystride)
{
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = x->predictor;
int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
int pre_stride = x->block[0].pre_stride;
@@ -348,11 +349,13 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
if ((mv_row | mv_col) & 7)
{
- x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,
+ dst_y, dst_ystride);
}
else
{
- RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
+ RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y,
+ dst_ystride);
}
}
@@ -596,69 +599,3 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
build_inter4x4_predictors_mb(xd);
}
}
-/* encoder only*/
-static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x)
-{
- int i;
-
- if (x->mode_info_context->mbmi.partitioning < 3)
- {
- x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
- x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
- x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
- x->block[10].bmi = x->mode_info_context->bmi[10];
-
- build_inter_predictors4b(x, &x->block[ 0], x->block[ 0].predictor, 16);
- build_inter_predictors4b(x, &x->block[ 2], x->block[ 2].predictor, 16);
- build_inter_predictors4b(x, &x->block[ 8], x->block[ 8].predictor, 16);
- build_inter_predictors4b(x, &x->block[10], x->block[10].predictor, 16);
- }
- else
- {
- for (i = 0; i < 16; i += 2)
- {
- BLOCKD *d0 = &x->block[i];
- BLOCKD *d1 = &x->block[i+1];
-
- x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
- x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
-
- if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- build_inter_predictors2b(x, d0, d0->predictor, 16);
- else
- {
- build_inter_predictors_b(d0, d0->predictor, 16, x->subpixel_predict);
- build_inter_predictors_b(d1, d1->predictor, 16, x->subpixel_predict);
- }
-
- }
-
- }
-
- for (i = 16; i < 24; i += 2)
- {
- BLOCKD *d0 = &x->block[i];
- BLOCKD *d1 = &x->block[i+1];
-
- if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- build_inter_predictors2b(x, d0, d0->predictor, 8);
- else
- {
- build_inter_predictors_b(d0, d0->predictor, 8, x->subpixel_predict);
- build_inter_predictors_b(d1, d1->predictor, 8, x->subpixel_predict);
- }
- }
-}
-void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd)
-{
- if (xd->mode_info_context->mbmi.mode != SPLITMV)
- {
- vp8_build_inter16x16_predictors_mb(xd, xd->predictor, &xd->predictor[256],
- &xd->predictor[320], 16, 8);
- }
- else
- {
- build_4x4uvmvs(xd);
- build_inter4x4_predictors_mb_e(xd);
- }
-}
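
With the new dst_y/dst_ystride arguments, the encoder-only 16x16 luma path can render into an arbitrary buffer instead of the fixed 16-pixel-wide xd->predictor scratch area, which is also why the now-redundant *_mb_e variants above could be deleted. A hypothetical call site (buffer choice assumed, not taken from this patch):

    /* Predict straight into the reconstruction buffer... */
    vp8_build_inter16x16_predictors_mby(xd, xd->dst.y_buffer, xd->dst.y_stride);

    /* ...or reproduce the old behaviour as a special case. */
    vp8_build_inter16x16_predictors_mby(xd, xd->predictor, 16);
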
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
index 86f9d5ae3..f57ff73c5 100644
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -21,11 +21,13 @@ extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
int dst_uvstride);
-extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ int dst_ystride);
+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
+ vp8_subpix_fn_t sppf);
extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd);
#endif
diff --git a/vp8/common/type_aliases.h b/vp8/common/type_aliases.h
deleted file mode 100644
index 22b531a76..000000000
--- a/vp8/common/type_aliases.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-* Module Title : type_aliases.h
-*
-* Description : Standard type aliases
-*
-****************************************************************************/
-#ifndef __INC_TYPE_ALIASES_H
-#define __INC_TYPE_ALIASES_H
-
-/****************************************************************************
-* Macros
-****************************************************************************/
-#define EXPORT
-#define IMPORT extern /* Used to declare imported data & routines */
-#define PRIVATE static /* Used to declare & define module-local data */
-#define LOCAL static /* Used to define all persistent routine-local data */
-#define STD_IN_PATH 0 /* Standard input path */
-#define STD_OUT_PATH 1 /* Standard output path */
-#define STD_ERR_PATH 2 /* Standard error path */
-#define STD_IN_FILE stdin /* Standard input file pointer */
-#define STD_OUT_FILE stdout /* Standard output file pointer */
-#define STD_ERR_FILE stderr /* Standard error file pointer */
-#define max_int 0x7FFFFFFF
-
-#define __export
-#define _export
-
-#define CCONV
-
-#ifndef NULL
-#ifdef __cplusplus
-#define NULL 0
-#else
-#define NULL ((void *)0)
-#endif
-#endif
-
-#ifndef FALSE
-#define FALSE 0
-#endif
-
-#ifndef TRUE
-#define TRUE 1
-#endif
-
-/****************************************************************************
-* Typedefs
-****************************************************************************/
-#ifndef TYPE_INT8
-#define TYPE_INT8
-typedef signed char INT8;
-#endif
-
-#ifndef TYPE_INT16
-/*#define TYPE_INT16*/
-typedef signed short INT16;
-#endif
-
-#ifndef TYPE_INT32
-/*#define TYPE_INT32*/
-typedef signed int INT32;
-#endif
-
-#ifndef TYPE_UINT8
-/*#define TYPE_UINT8*/
-typedef unsigned char UINT8;
-#endif
-
-#ifndef TYPE_UINT32
-/*#define TYPE_UINT32*/
-typedef unsigned int UINT32;
-#endif
-
-#ifndef TYPE_UINT16
-/*#define TYPE_UINT16*/
-typedef unsigned short UINT16;
-#endif
-
-#ifndef TYPE_BOOL
-/*#define TYPE_BOOL*/
-typedef int BOOL;
-#endif
-
-typedef unsigned char BOOLEAN;
-
-#ifdef _MSC_VER
-typedef __int64 INT64;
-#else
-
-#ifndef TYPE_INT64
-#ifdef _TMS320C6X
-/* for now we only have 40bits */
-typedef long INT64;
-#else
-typedef long long INT64;
-#endif
-#endif
-
-#endif
-
-/* Floating point */
-typedef double FLOAT64;
-typedef float FLOAT32;
-
-#endif
diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/common/x86/dequantize_mmx.asm
index 648bde4c5..de9eba89f 100644
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/common/x86/dequantize_mmx.asm
@@ -246,207 +246,6 @@ sym(vp8_dequant_idct_add_mmx):
pop rbp
ret
-
-;void dequant_dc_idct_add_mmx(
-;short *input, 0
-;short *dq, 1
-;unsigned char *dest, 2
-;int stride, 3
-;int Dc) 4
-global sym(vp8_dequant_dc_idct_add_mmx)
-sym(vp8_dequant_dc_idct_add_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- ; end prolog
-
- mov rax, arg(0) ;input
- mov rdx, arg(1) ;dq
-
- movq mm0, [rax ]
- pmullw mm0, [rdx]
-
- movq mm1, [rax +8]
- pmullw mm1, [rdx +8]
-
- movq mm2, [rax+16]
- pmullw mm2, [rdx+16]
-
- movq mm3, [rax+24]
- pmullw mm3, [rdx+24]
-
- mov rdx, arg(2) ;pred
- pxor mm7, mm7
-
-
- movq [rax], mm7
- movq [rax+8], mm7
-
- movq [rax+16],mm7
- movq [rax+24],mm7
-
- ; move lower word of Dc to lower word of mm0
- psrlq mm0, 16
- movzx rcx, word ptr arg(4) ;Dc
- psllq mm0, 16
- movq mm7, rcx
- por mm0, mm7
-
- movsxd rax, dword ptr arg(3) ;stride
-
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
-
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
-
- pmulhw mm5, [GLOBAL(x_s1sqr2)];
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
-
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
-
- movq mm5, mm1
- movq mm4, mm3
-
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
-
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
-
- paddw mm3, mm5 ; d1
- movq mm6, mm2 ; a1
-
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
-
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
-
- psubw mm6, mm3 ;3
-
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
-
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
-
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
-
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
-
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
-
- movq mm3, mm5 ; 33 23 13 03
-
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
-
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
-
- pmulhw mm5, [GLOBAL(x_s1sqr2)];
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
-
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
-
- movq mm5, mm1
- movq mm4, mm3
-
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
-
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
-
- paddw mm3, mm5 ; d1
- paddw mm0, [GLOBAL(fours)]
-
- paddw mm2, [GLOBAL(fours)]
- movq mm6, mm2 ; a1
-
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
-
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
-
- psubw mm6, mm3 ;3
- psraw mm2, 3
-
- psraw mm0, 3
- psraw mm4, 3
-
- psraw mm6, 3
-
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
-
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
-
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
-
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
-
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
-
- pxor mm7, mm7
-
- movd mm4, [rdx]
- punpcklbw mm4, mm7
- paddsw mm0, mm4
- packuswb mm0, mm7
- movd [rdx], mm0
-
- movd mm4, [rdx+rax]
- punpcklbw mm4, mm7
- paddsw mm1, mm4
- packuswb mm1, mm7
- movd [rdx+rax], mm1
-
- movd mm4, [rdx+2*rax]
- punpcklbw mm4, mm7
- paddsw mm2, mm4
- packuswb mm2, mm7
- movd [rdx+rax*2], mm2
-
- add rdx, rax
-
- movd mm4, [rdx+2*rax]
- punpcklbw mm4, mm7
- paddsw mm5, mm4
- packuswb mm5, mm7
- movd [rdx+rax*2], mm5
-
- ; begin epilog
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
SECTION_RODATA
align 16
x_s1sqr2:
diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/common/x86/dequantize_x86.h
index dc68daab3..49bcb7f19 100644
--- a/vp8/decoder/x86/dequantize_x86.h
+++ b/vp8/common/x86/dequantize_x86.h
@@ -22,8 +22,6 @@
#if HAVE_MMX
extern prototype_dequant_block(vp8_dequantize_b_mmx);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
@@ -34,12 +32,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
-#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
-
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx
-
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx
@@ -50,14 +42,10 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
#endif
#if HAVE_SSE2
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2
-
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c
new file mode 100644
index 000000000..ebab814f4
--- /dev/null
+++ b/vp8/common/x86/filter_x86.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
+{
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) =
+{
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
diff --git a/vp8/common/x86/filter_x86.h b/vp8/common/x86/filter_x86.h
new file mode 100644
index 000000000..efcc4dc2a
--- /dev/null
+++ b/vp8/common/x86/filter_x86.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_X86_H
+#define FILTER_X86_H
+
+/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
+ * duplicated values */
+extern const short vp8_bilinear_filters_x86_4[8][8]; /* duplicated 4x */
+extern const short vp8_bilinear_filters_x86_8[8][16]; /* duplicated 8x */
+
+#endif /* FILTER_X86_H */
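
Row k of these tables holds the two bilinear taps (128 - 16k, 16k) broadcast four or eight times, so a single pmullw pass can filter 4 or 8 pixels at once. A scalar sketch of the filter the tables encode, assuming the usual VP8_FILTER_WEIGHT of 128 (shift of 7); the helper name is illustrative:

    /* Scalar reference for one output pixel at sub-pel offset k in [0,7].
     * Row k above is exactly {128 - 16*k, 16*k}, duplicated for SIMD. */
    static unsigned char bilinear_sketch(unsigned char a, unsigned char b,
                                         int k)
    {
        const int tap0 = 128 - (k << 4);
        const int tap1 = k << 4;
        return (unsigned char)((a * tap0 + b * tap1 + 64) >> 7); /* round, /128 */
    }
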
diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/common/x86/idct_blk_mmx.c
index 37de5b9fd..49cebd6f5 100644
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ b/vp8/common/x86/idct_blk_mmx.c
@@ -10,41 +10,16 @@
#include "vpx_config.h"
#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"
-void vp8_dequant_dc_idct_add_y_block_mmx
- (short *q, short *dq,
- unsigned char *dst, int stride, char *eobs, short *dc)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]);
- else if (eobs[0] == 1)
- vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride);
-
- if (eobs[1] > 1)
- vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]);
- else if (eobs[1] == 1)
- vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride);
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
- if (eobs[2] > 1)
- vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]);
- else if (eobs[2] == 1)
- vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride);
-
- if (eobs[3] > 1)
- vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]);
- else if (eobs[3] == 1)
- vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride);
-
- q += 64;
- dc += 4;
- dst += 4*stride;
- eobs += 4;
- }
+void vp8_dequantize_b_mmx(BLOCKD *d)
+{
+ short *sq = (short *) d->qcoeff;
+ short *dq = (short *) d->dqcoeff;
+ short *q = (short *) d->dequant;
+ vp8_dequantize_b_impl_mmx(sq, dq, q);
}
void vp8_dequant_idct_add_y_block_mmx
diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/common/x86/idct_blk_sse2.c
index 0495b0610..44e440c0c 100644
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/common/x86/idct_blk_sse2.c
@@ -10,14 +10,7 @@
#include "vpx_config.h"
#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp8_idct_dequant_dc_0_2x_sse2
- (short *q, short *dq,
- unsigned char *dst, int dst_stride, short *dc);
-void vp8_idct_dequant_dc_full_2x_sse2
- (short *q, short *dq,
- unsigned char *dst, int dst_stride, short *dc);
+#include "vp8/common/dequantize.h"
void vp8_idct_dequant_0_2x_sse2
(short *q, short *dq ,
@@ -26,36 +19,6 @@ void vp8_idct_dequant_full_2x_sse2
(short *q, short *dq ,
unsigned char *dst, int dst_stride);
-void vp8_dequant_dc_idct_add_y_block_sse2
- (short *q, short *dq,
- unsigned char *dst, int stride, char *eobs, short *dc)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (((short *)(eobs))[0])
- {
- if (((short *)(eobs))[0] & 0xfefe)
- vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc);
- else
- vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc);
- }
-
- if (((short *)(eobs))[1])
- {
- if (((short *)(eobs))[1] & 0xfefe)
- vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
- else
- vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
- }
- q += 64;
- dc += 4;
- dst += stride*4;
- eobs += 4;
- }
-}
-
void vp8_dequant_idct_add_y_block_sse2
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h
index f9e3a794d..06e3ea4b5 100644
--- a/vp8/common/x86/idct_x86.h
+++ b/vp8/common/x86/idct_x86.h
@@ -24,7 +24,6 @@ extern prototype_idct(vp8_short_idct4x4llm_mmx);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
-extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct16
@@ -36,9 +35,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx
-
#endif
#endif
diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm
index 10b5274dc..6582687e1 100644
--- a/vp8/common/x86/iwalsh_mmx.asm
+++ b/vp8/common/x86/iwalsh_mmx.asm
@@ -11,162 +11,129 @@
%include "vpx_ports/x86_abi_support.asm"
-;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp8_short_inv_walsh4x4_1_mmx)
-sym(vp8_short_inv_walsh4x4_1_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0)
- mov rax, 3
-
- mov rdi, arg(1)
- add rax, [rsi] ;input[0] + 3
-
- movd mm0, eax
-
- punpcklwd mm0, mm0 ;x x val val
-
- punpckldq mm0, mm0 ;val val val val
-
- psraw mm0, 3 ;(input[0] + 3) >> 3
-
- movq [rdi + 0], mm0
- movq [rdi + 8], mm0
- movq [rdi + 16], mm0
- movq [rdi + 24], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
global sym(vp8_short_inv_walsh4x4_mmx)
sym(vp8_short_inv_walsh4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
; end prolog
- mov rax, 3
- mov rsi, arg(0)
- mov rdi, arg(1)
- shl rax, 16
-
- movq mm0, [rsi + 0] ;ip[0]
- movq mm1, [rsi + 8] ;ip[4]
- or rax, 3 ;00030003h
-
- movq mm2, [rsi + 16] ;ip[8]
- movq mm3, [rsi + 24] ;ip[12]
-
- movq mm7, rax
- movq mm4, mm0
+ mov rdx, arg(0)
+ mov rax, 30003h
- punpcklwd mm7, mm7 ;0003000300030003h
- movq mm5, mm1
+ movq mm0, [rdx + 0] ;ip[0]
+ movq mm1, [rdx + 8] ;ip[4]
+ movd mm7, rax
- paddw mm4, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+ movq mm2, [rdx + 16] ;ip[8]
+ movq mm3, [rdx + 24] ;ip[12]
+ punpcklwd mm7, mm7 ;0003000300030003h
+ mov rdx, arg(1)
- movq mm6, mm4 ;temp al
+ movq mm4, mm0
+ movq mm5, mm1
- paddw mm4, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
+ paddw mm4, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm1, mm2 ;ip[4] - ip[8] aka c1
+ movq mm6, mm4 ;temp al
+ paddw mm4, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
- movq mm5, mm0 ;temp dl
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm1, mm2 ;ip[4] - ip[8] aka c1
- paddw mm0, mm1 ;dl + cl
- psubw mm5, mm1 ;dl - cl
+ movq mm5, mm0 ;temp dl
+ paddw mm0, mm1 ;dl + cl
+ psubw mm5, mm1 ;dl - cl
; 03 02 01 00
; 13 12 11 10
; 23 22 21 20
; 33 32 31 30
- movq mm3, mm4 ; 03 02 01 00
- punpcklwd mm4, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
+ movq mm3, mm4 ; 03 02 01 00
+ punpcklwd mm4, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
- movq mm1, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm1, mm5 ; 33 23 32 22
+ movq mm1, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm1, mm5 ; 33 23 32 22
- movq mm0, mm4 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
+ movq mm0, mm4 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
- punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
+ punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
- movq mm1, mm0
- movq mm5, mm4
-
- paddw mm1, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm1 ;temp al
-
- paddw mm1, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm4, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
-
- paddw mm0, mm4 ;dl + cl
- psubw mm5, mm4 ;dl - cl
+ movq mm1, mm0
+ movq mm5, mm4
+ paddw mm1, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm1 ;temp al
+ paddw mm1, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+ paddw mm1, mm7
+ paddw mm6, mm7
+ psraw mm1, 3
+ psraw mm6, 3
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm4, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+ paddw mm0, mm4 ;dl + cl
+ psubw mm5, mm4 ;dl - cl
+ paddw mm0, mm7
+ paddw mm5, mm7
+ psraw mm0, 3
+ psraw mm5, 3
;~~~~~~~~~~~~~~~~~~~~~
- movq mm3, mm1 ; 03 02 01 00
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
-
- movq mm4, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm4, mm5 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
-
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4]
-
- punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12]
-
- paddw mm0, mm7
- paddw mm1, mm7
- paddw mm2, mm7
- paddw mm3, mm7
-
- psraw mm0, 3
- psraw mm1, 3
- psraw mm2, 3
- psraw mm3, 3
- movq [rdi + 0], mm0
- movq [rdi + 8], mm1
- movq [rdi + 16], mm2
- movq [rdi + 24], mm3
+ movd eax, mm1
+ movd ecx, mm0
+ psrlq mm0, 32
+ psrlq mm1, 32
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*1], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*5], cx
+ movd eax, mm1
+ movd ecx, mm0
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*9], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*13], cx
+
+ movd eax, mm6
+ movd ecx, mm5
+ psrlq mm5, 32
+ psrlq mm6, 32
+ mov word ptr[rdx+32*2], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*6], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, mm6
+ movd ecx, mm5
+ mov word ptr[rdx+32*10], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*14], ax
+ mov word ptr[rdx+32*15], cx
; begin epilog
- pop rdi
- pop rsi
UNSHADOW_ARGS
pop rbp
ret
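
The rewritten routine folds the +3 rounding and >>3 into the second pass and then scatters each 16-bit result to [rdx + 32*n], i.e. 16 shorts apart, matching the new C behaviour of writing directly to the DC slot of each dequantized 4x4 block. The store pattern amounts to the following C (sketch only):

    /* Equivalent of the word stores above: result n lands on the DC
     * coefficient of block n, 32 bytes (16 shorts) apart. */
    static void scatter_iwalsh_output(const short *result, short *mb_dqcoeff)
    {
        int n;
        for (n = 0; n < 16; n++)
            mb_dqcoeff[n * 16] = result[n];
    }
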
diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm
index 1da4fd8da..51cb5e21c 100644
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ b/vp8/common/x86/iwalsh_sse2.asm
@@ -17,103 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
- SAVE_XMM 6
- push rsi
- push rdi
; end prolog
- mov rsi, arg(0)
- mov rdi, arg(1)
- mov rax, 3
+ mov rcx, arg(0)
+ mov rdx, arg(1)
+ mov rax, 30003h
- movdqa xmm0, [rsi + 0] ;ip[4] ip[0]
- movdqa xmm1, [rsi + 16] ;ip[12] ip[8]
+ movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
+ movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
- shl rax, 16
- or rax, 3 ;00030003h
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm0 ;ip[4] ip[0]
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm0 ;ip[4] ip[0]
- paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+ paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
- movdqa xmm4, xmm0
+ movdqa xmm4, xmm0
punpcklqdq xmm0, xmm3 ;d1 a1
punpckhqdq xmm4, xmm3 ;c1 b1
- movd xmm6, eax
- movdqa xmm1, xmm4 ;c1 b1
- paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+ movdqa xmm1, xmm4 ;c1 b1
+ paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-;;;temp output
-;; movdqu [rdi + 0], xmm4
-;; movdqu [rdi + 16], xmm3
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
- movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm4 ;ip[4] ip[0]
+ movd xmm0, eax
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
- pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03
+ pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
- paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
- movdqa xmm5, xmm4
+ movdqa xmm5, xmm4
punpcklqdq xmm4, xmm3 ;d1 a1
punpckhqdq xmm5, xmm3 ;c1 b1
- movdqa xmm1, xmm5 ;c1 b1
- paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- paddw xmm5, xmm6
- paddw xmm1, xmm6
-
- psraw xmm5, 3
- psraw xmm1, 3
-
- movdqa [rdi + 0], xmm5
- movdqa [rdi + 16], xmm1
+ movdqa xmm1, xmm5 ;c1 b1
+ paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ paddw xmm5, xmm0
+ paddw xmm4, xmm0
+ psraw xmm5, 3
+ psraw xmm4, 3
+
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*2], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*6], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*10], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*14], cx
+
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*1], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*5], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ mov word ptr[rdx+32*9], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*13], ax
+ mov word ptr[rdx+32*15], cx
; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
-
-SECTION_RODATA
-align 16
-x_s1sqr2:
- times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
- times 4 dw 0x4E7B
-align 16
-fours:
- times 4 dw 0x0004
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 86927d9f1..2ad010adb 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -1385,52 +1385,54 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
SHADOW_ARGS_TO_STACK 3
SAVE_XMM 7
GET_GOT rbx
- push rsi
- push rdi
; end prolog
- mov rsi, arg(0) ;src_ptr
+ mov rcx, arg(0) ;src_ptr
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
- mov rdx, arg(2) ;blimit
- movdqa xmm3, XMMWORD PTR [rdx]
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
+ lea rdx, [rcx + rax]
neg rax
; calculate mask
- movdqa xmm1, [rsi+2*rax] ; p1
- movdqa xmm0, [rdi] ; q1
+ movdqa xmm0, [rdx] ; q1
+ mov rdx, arg(2) ;blimit
+ movdqa xmm1, [rcx+2*rax] ; p1
+
movdqa xmm2, xmm1
movdqa xmm7, xmm0
- movdqa xmm4, xmm0
+
psubusb xmm0, xmm1 ; q1-=p1
- psubusb xmm1, xmm4 ; p1-=q1
+ psubusb xmm1, xmm7 ; p1-=q1
por xmm1, xmm0 ; abs(p1-q1)
pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm1, 1 ; abs(p1-q1)/2
- movdqa xmm5, [rsi+rax] ; p0
- movdqa xmm4, [rsi] ; q0
+ movdqa xmm3, XMMWORD PTR [rdx]
+
+ movdqa xmm5, [rcx+rax] ; p0
+ movdqa xmm4, [rcx] ; q0
movdqa xmm0, xmm4 ; q0
movdqa xmm6, xmm5 ; p0
psubusb xmm5, xmm4 ; p0-=q0
psubusb xmm4, xmm6 ; q0-=p0
por xmm5, xmm4 ; abs(p0 - q0)
+
+ movdqa xmm4, [GLOBAL(t80)]
+
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor xmm3, xmm3
pcmpeqb xmm5, xmm3
+
; start work on filters
- pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ pxor xmm2, xmm4 ; p1 offset to convert to signed values
+ pxor xmm7, xmm4 ; q1 offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor xmm6, xmm4 ; offset to convert to signed values
+ pxor xmm0, xmm4 ; offset to convert to signed values
movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
@@ -1438,42 +1440,36 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
pand xmm5, xmm2 ; mask filter values we don't care about
- ; do + 4 side
- paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
- psraw xmm0, 3 ; arithmetic shift right 11
- psrlw xmm0, 8
- movdqa xmm1, xmm5 ; get a copy of filters
- psraw xmm1, 11 ; arithmetic shift right 11
- psllw xmm1, 8 ; shift left 8 to put it back
-
- por xmm0, xmm1 ; put the two together to get result
+ paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+ movdqa xmm0, xmm5
+ psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
- psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [GLOBAL(t80)] ; unoffset
- movdqa [rsi], xmm3 ; write back
+ movdqa xmm1, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
- ; now do +3 side
- psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
- psraw xmm0, 3 ; arithmetic shift right 11
- psrlw xmm0, 8
- psraw xmm5, 11 ; arithmetic shift right 11
- psllw xmm5, 8 ; shift left 8 to put it back
- por xmm0, xmm5 ; put the two together to get result
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm6, xmm5 ; p0+= p0 add
+ pxor xmm3, xmm4 ; unoffset
+ movdqa [rcx], xmm3 ; write back
- paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [GLOBAL(t80)] ; unoffset
- movdqa [rsi+rax], xmm6 ; write back
+ pxor xmm6, xmm4 ; unoffset
+ movdqa [rcx+rax], xmm6 ; write back
; begin epilog
- pop rdi
- pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
@@ -1536,9 +1532,6 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- movdqa t0, xmm0 ; save to t0
- movdqa t1, xmm2 ; save to t1
-
lea rsi, [rsi + rax*8]
lea rdi, [rsi + rax]
lea rdx, [rsi + rax*4]
@@ -1551,26 +1544,24 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
- movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
+ movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0
movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
- movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
+ movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0
movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
- punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
- punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
+ punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
+ punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
- punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+ punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
- movdqa xmm1, xmm4
- punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
- punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movdqa xmm7, xmm4
+ punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
movdqa xmm6, xmm4
- punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
- punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+ punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
- movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
movdqa xmm1, xmm0
movdqa xmm3, xmm2
@@ -1579,6 +1570,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ mov rdx, arg(2) ;blimit
+
; calculate mask
movdqa xmm6, xmm0 ; p1
movdqa xmm7, xmm3 ; q1
@@ -1588,6 +1581,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm6, 1 ; abs(p1-q1)/2
+ movdqa xmm7, [rdx]
+
movdqa xmm5, xmm1 ; p0
movdqa xmm4, xmm2 ; q0
psubusb xmm5, xmm2 ; p0-=q0
@@ -1596,8 +1591,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
- mov rdx, arg(2) ;blimit
- movdqa xmm7, XMMWORD PTR [rdx]
+ movdqa xmm4, [GLOBAL(t80)]
psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor xmm7, xmm7
@@ -1607,59 +1601,48 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
movdqa t0, xmm0
movdqa t1, xmm3
- pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
+ pxor xmm0, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
psubsb xmm0, xmm3 ; p1 - q1
- movdqa xmm6, xmm1 ; p0
-
- movdqa xmm7, xmm2 ; q0
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values
- movdqa xmm3, xmm7 ; offseted ; q0
-
- psubsb xmm7, xmm6 ; q0 - p0
- paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0)
+ movdqa xmm6, xmm1 ; p0
+; movdqa xmm7, xmm2 ; q0
- paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0)
- paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0)
+ pxor xmm6, xmm4 ; offset to convert to signed values
+ pxor xmm2, xmm4 ; offset to convert to signed values
+ movdqa xmm3, xmm2 ; offseted ; q0
+ psubsb xmm2, xmm6 ; q0 - p0
+ paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
pand xmm5, xmm0 ; mask filter values we don't care about
-
paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
-
- psraw xmm0, 3 ; arithmetic shift right 11
- psrlw xmm0, 8
-
- movdqa xmm7, xmm5 ; get a copy of filters
- psraw xmm7, 11 ; arithmetic shift right 11
-
- psllw xmm7, 8 ; shift left 8 to put it back
- por xmm0, xmm7 ; put the two together to get result
-
- psubsb xmm3, xmm0 ; q0-= q0sz add
- pxor xmm3, [GLOBAL(t80)] ; unoffset q0
-
- ; now do +3 side
+ movdqa xmm0, xmm5
psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
- psraw xmm0, 3 ; arithmetic shift right 11
+ movdqa xmm1, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
- psrlw xmm0, 8
- psraw xmm5, 11 ; arithmetic shift right 11
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
- psllw xmm5, 8 ; shift left 8 to put it back
- por xmm0, xmm5 ; put the two together to get result
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm6, xmm5 ; p0+= p0 add
- paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [GLOBAL(t80)] ; unoffset p0
+ pxor xmm3, xmm4 ; unoffset q0
+ pxor xmm6, xmm4 ; unoffset p0
movdqa xmm0, t0 ; p1
movdqa xmm4, t1 ; q1
@@ -1763,3 +1746,9 @@ s9:
align 16
s63:
times 8 dw 0x003f
+align 16
+te0:
+ times 16 db 0xe0
+align 16
+t1f:
+ times 16 db 0x1f
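
The new te0/t1f constants replace the old psllw/psraw/psrlw dance for the signed >>3 on packed bytes: SSE2 has no byte-wise arithmetic shift, so the filter value is shifted logically, the three bits that leak in from the neighbouring byte are masked off (t1f), and the sign bits are rebuilt from a pcmpgtb mask (te0). Per byte, the sequence computes:

    /* Per-byte model of the pcmpgtb/t1f/te0 sequence (sketch, not the
     * actual kernel): arithmetic >>3 on a signed byte built from
     * logical-shift pieces. */
    static signed char sar3_byte_sketch(signed char f)
    {
        unsigned char u = (unsigned char)f >> 3; /* psrlw, viewed per byte */
        u &= 0x1f;                   /* t1f: drop bits from the next byte  */
        if (f < 0)
            u |= 0xe0;               /* te0: restore the sign bits         */
        return (signed char)u;
    }
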
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index e68d950ad..5528fd0e6 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -10,6 +10,7 @@
%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
%define BLOCK_HEIGHT_WIDTH 4
@@ -222,14 +223,14 @@ sym(vp8_bilinear_predict8x8_mmx):
push rdi
; end prolog
- ;const short *HFilter = bilinear_filters_mmx[xoffset];
- ;const short *VFilter = bilinear_filters_mmx[yoffset];
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
shl rax, 5 ; offset * 32
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
add rax, rcx ; HFilter
mov rsi, arg(0) ;src_ptr ;
@@ -379,13 +380,13 @@ sym(vp8_bilinear_predict8x4_mmx):
push rdi
; end prolog
- ;const short *HFilter = bilinear_filters_mmx[xoffset];
- ;const short *VFilter = bilinear_filters_mmx[yoffset];
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
shl rax, 5
mov rsi, arg(0) ;src_ptr ;
@@ -534,13 +535,13 @@ sym(vp8_bilinear_predict4x4_mmx):
push rdi
; end prolog
- ;const short *HFilter = bilinear_filters_mmx[xoffset];
- ;const short *VFilter = bilinear_filters_mmx[yoffset];
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
shl rax, 5
add rax, rcx ; HFilter
@@ -699,29 +700,3 @@ sym(vp8_six_tap_mmx):
times 8 dw 0
-align 16
-global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
-sym(vp8_bilinear_filters_mmx):
- times 8 dw 128
- times 8 dw 0
-
- times 8 dw 112
- times 8 dw 16
-
- times 8 dw 96
- times 8 dw 32
-
- times 8 dw 80
- times 8 dw 48
-
- times 8 dw 64
- times 8 dw 64
-
- times 8 dw 48
- times 8 dw 80
-
- times 8 dw 32
- times 8 dw 96
-
- times 8 dw 16
- times 8 dw 112
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index b62b5c68d..cb550af59 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -10,6 +10,7 @@
%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
@@ -961,7 +962,7 @@ sym(vp8_unpack_block1d16_h6_sse2):
; unsigned char *dst_ptr,
; int dst_pitch
;)
-extern sym(vp8_bilinear_filters_mmx)
+extern sym(vp8_bilinear_filters_x86_8)
global sym(vp8_bilinear_predict16x16_sse2)
sym(vp8_bilinear_predict16x16_sse2):
push rbp
@@ -973,10 +974,10 @@ sym(vp8_bilinear_predict16x16_sse2):
push rdi
; end prolog
- ;const short *HFilter = bilinear_filters_mmx[xoffset]
- ;const short *VFilter = bilinear_filters_mmx[yoffset]
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
movsxd rax, dword ptr arg(2) ;xoffset
cmp rax, 0 ;skip first_pass filter if xoffset=0
@@ -1230,7 +1231,6 @@ sym(vp8_bilinear_predict16x16_sse2):
; unsigned char *dst_ptr,
; int dst_pitch
;)
-extern sym(vp8_bilinear_filters_mmx)
global sym(vp8_bilinear_predict8x8_sse2)
sym(vp8_bilinear_predict8x8_sse2):
push rbp
@@ -1245,9 +1245,9 @@ sym(vp8_bilinear_predict8x8_sse2):
ALIGN_STACK 16, rax
sub rsp, 144 ; reserve 144 bytes
- ;const short *HFilter = bilinear_filters_mmx[xoffset]
- ;const short *VFilter = bilinear_filters_mmx[yoffset]
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
mov rsi, arg(0) ;src_ptr
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
diff --git a/vp8/common/x86/subpixel_x86.h b/vp8/common/x86/subpixel_x86.h
index 75991cc4f..01ec9e210 100644
--- a/vp8/common/x86/subpixel_x86.h
+++ b/vp8/common/x86/subpixel_x86.h
@@ -12,6 +12,8 @@
#ifndef SUBPIXEL_X86_H
#define SUBPIXEL_X86_H
+#include "filter_x86.h"
+
/* Note:
*
* This platform is commonly built for runtime CPU detection. If you modify
diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c
index bce7bc38e..a623c69b4 100644
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -12,9 +12,9 @@
#include "vpx_config.h"
#include "vpx_ports/mem.h"
#include "vp8/common/subpixel.h"
+#include "filter_x86.h"
extern const short vp8_six_tap_mmx[8][6*8];
-extern const short vp8_bilinear_filters_mmx[8][2*8];
extern void vp8_filter_block1d_h6_mmx
(
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index eb36d899d..e1e1b7987 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -11,7 +11,6 @@
#include "vpx_config.h"
#include "vpx_ports/x86.h"
-#include "vp8/common/g_common.h"
#include "vp8/common/subpixel.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/recon.h"
@@ -37,12 +36,14 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
if (flags & HAS_MMX)
{
+ rtcd->dequant.block = vp8_dequantize_b_mmx;
+ rtcd->dequant.idct_add = vp8_dequant_idct_add_mmx;
+ rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
+ rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
+
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx;
-
-
rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx;
@@ -89,6 +90,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s_sse2;
+ rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
+ rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
+
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_sse2;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_sse2;
diff --git a/vp8/decoder/arm/arm_dsystemdependent.c b/vp8/decoder/arm/arm_dsystemdependent.c
index 1b0091cdb..bf0a3481a 100644
--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -11,9 +11,6 @@
#include "vpx_config.h"
#include "vpx_ports/arm.h"
-#include "vp8/common/blockd.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/decoder/dequantize.h"
#include "vp8/decoder/onyxd_int.h"
void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
@@ -30,25 +27,12 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
#if HAVE_ARMV6
if (flags & HAS_MEDIA)
{
- pbi->dequant.block = vp8_dequantize_b_v6;
- pbi->dequant.idct_add = vp8_dequant_idct_add_v6;
- pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_v6;
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
- pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
- pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
}
#endif
#if HAVE_ARMV7
if (flags & HAS_NEON)
{
- pbi->dequant.block = vp8_dequantize_b_neon;
- pbi->dequant.idct_add = vp8_dequant_idct_add_neon;
- /*This is not used: NEON always dequants two blocks at once.
- pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_neon;*/
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
- pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
- pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
}
#endif
#endif
diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
deleted file mode 100644
index 19f94e089..000000000
--- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
+++ /dev/null
@@ -1,213 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequant_dc_idct_add_v6|
-
- AREA |.text|, CODE, READONLY
-
-;void vp8_dequant_dc_idct_v6(short *input, short *dq,
-; unsigned char *dest, int stride, int Dc)
-; r0 = input
-; r1 = dq
-; r2 = dst
-; r3 = stride
-; sp + 36 = Dc
-
-
-|vp8_dequant_dc_idct_add_v6| PROC
- stmdb sp!, {r4-r11, lr}
-
- ldr r6, [sp, #36]
-
- ldr r4, [r0] ;input
- ldr r5, [r1], #4 ;dq
-
- sub sp, sp, #4
- str r3, [sp]
-
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- mov r12, #3
-
-vp8_dequant_dc_add_loop
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- subs r12, r12, #1
-
- ldrne r4, [r0, #4]
- ldrne r5, [r1], #4
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- bne vp8_dequant_dc_add_loop
-
- sub r0, r0, #32
- mov r1, r0
-
-; short_idct4x4llm_v6_dual
- ldr r3, cospi8sqrt2minus1
- ldr r4, sinpi8sqrt2
- ldr r6, [r0, #8]
- mov r5, #2
-vp8_dequant_dc_idct_loop1_v6
- ldr r12, [r0, #24]
- ldr r14, [r0, #16]
- smulwt r9, r3, r6
- smulwb r7, r3, r6
- smulwt r10, r4, r6
- smulwb r8, r4, r6
- pkhbt r7, r7, r9, lsl #16
- smulwt r11, r3, r12
- pkhbt r8, r8, r10, lsl #16
- uadd16 r6, r6, r7
- smulwt r7, r4, r12
- smulwb r9, r3, r12
- smulwb r10, r4, r12
- subs r5, r5, #1
- pkhbt r9, r9, r11, lsl #16
- ldr r11, [r0], #4
- pkhbt r10, r10, r7, lsl #16
- uadd16 r7, r12, r9
- usub16 r7, r8, r7
- uadd16 r6, r6, r10
- uadd16 r10, r11, r14
- usub16 r8, r11, r14
- uadd16 r9, r10, r6
- usub16 r10, r10, r6
- uadd16 r6, r8, r7
- usub16 r7, r8, r7
- str r6, [r1, #8]
- ldrne r6, [r0, #8]
- str r7, [r1, #16]
- str r10, [r1, #24]
- str r9, [r1], #4
- bne vp8_dequant_dc_idct_loop1_v6
-
- mov r5, #2
- sub r0, r1, #8
-vp8_dequant_dc_idct_loop2_v6
- ldr r6, [r0], #4
- ldr r7, [r0], #4
- ldr r8, [r0], #4
- ldr r9, [r0], #4
- smulwt r1, r3, r6
- smulwt r12, r4, r6
- smulwt lr, r3, r8
- smulwt r10, r4, r8
- pkhbt r11, r8, r6, lsl #16
- pkhbt r1, lr, r1, lsl #16
- pkhbt r12, r10, r12, lsl #16
- pkhtb r6, r6, r8, asr #16
- uadd16 r6, r1, r6
- pkhbt lr, r9, r7, lsl #16
- uadd16 r10, r11, lr
- usub16 lr, r11, lr
- pkhtb r8, r7, r9, asr #16
- subs r5, r5, #1
- smulwt r1, r3, r8
- smulwb r7, r3, r8
- smulwt r11, r4, r8
- smulwb r9, r4, r8
- pkhbt r1, r7, r1, lsl #16
- uadd16 r8, r1, r8
- pkhbt r11, r9, r11, lsl #16
- usub16 r1, r12, r8
- uadd16 r8, r11, r6
- ldr r9, c0x00040004
- ldr r12, [sp] ; get stride from stack
- uadd16 r6, r10, r8
- usub16 r7, r10, r8
- uadd16 r7, r7, r9
- uadd16 r6, r6, r9
- uadd16 r10, r14, r1
- usub16 r1, r14, r1
- uadd16 r10, r10, r9
- uadd16 r1, r1, r9
- ldr r11, [r2] ; load input from dst
- mov r8, r7, asr #3
- pkhtb r9, r8, r10, asr #19
- mov r8, r1, asr #3
- pkhtb r8, r8, r6, asr #19
- uxtb16 lr, r11, ror #8
- qadd16 r9, r9, lr
- uxtb16 lr, r11
- qadd16 r8, r8, lr
- usat16 r9, #8, r9
- usat16 r8, #8, r8
- orr r9, r8, r9, lsl #8
- ldr r11, [r2, r12] ; load input from dst
- mov r7, r7, lsl #16
- mov r1, r1, lsl #16
- mov r10, r10, lsl #16
- mov r6, r6, lsl #16
- mov r7, r7, asr #3
- pkhtb r7, r7, r10, asr #19
- mov r1, r1, asr #3
- pkhtb r1, r1, r6, asr #19
- uxtb16 r8, r11, ror #8
- qadd16 r7, r7, r8
- uxtb16 r8, r11
- qadd16 r1, r1, r8
- usat16 r7, #8, r7
- usat16 r1, #8, r1
- orr r1, r1, r7, lsl #8
- str r9, [r2], r12 ; store output to dst
- str r1, [r2], r12 ; store output to dst
- bne vp8_dequant_dc_idct_loop2_v6
-
-; vpx_memset
- sub r0, r0, #32
- add sp, sp, #4
-
- mov r12, #0
- str r12, [r0]
- str r12, [r0, #4]
- str r12, [r0, #8]
- str r12, [r0, #12]
- str r12, [r0, #16]
- str r12, [r0, #20]
- str r12, [r0, #24]
- str r12, [r0, #28]
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_dequant_dc_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2 DCD 0x00008A8C
-c0x00040004 DCD 0x00040004
-
- END
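
The 213-line routine deleted above fuses dequantization, the 4x4 inverse DCT, and the saturating add into the predictor. Its dequantization half, stripped of the unrolled loads and stores, is just a coefficient-wise multiply with the DC slot overwritten by the value from the second-order transform; a plain-C sketch (the function name is mine, and the IDCT and add steps are omitted):

    /* q: 16 quantized coefficients; dq: per-position dequant factors;
     * dc: DC value handed in from the inverse Walsh-Hadamard pass */
    static void dequant_block_sketch(short *q, const short *dq, int dc)
    {
        int i;
        q[0] = (short)dc;                  /* DC is supplied, not dequantized here */
        for (i = 1; i < 16; i++)
            q[i] = (short)(q[i] * dq[i]);  /* dequantize the 15 AC coefficients */
    }
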
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
deleted file mode 100644
index bf8d7ddcd..000000000
--- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
+++ /dev/null
@@ -1,75 +0,0 @@
-;
-; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_dc_0_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void idct_dequant_dc_0_2x_neon(short *q, short *dq,
-; unsigned char *dst, int stride);
-; r0 *q,
-; r1 *dq,
-; r2 *dst
-; r3 stride
-; sp *dc
-|idct_dequant_dc_0_2x_neon| PROC
-
- ; no q- or dq-coeffs, so r0 and r1 are free to use
- ldr r1, [sp] ; *dc
- add r12, r2, #4
- ldr r0, [r1]
-
- vld1.32 {d2[0]}, [r2], r3 ; lo
- vld1.32 {d8[0]}, [r12], r3 ; hi
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d8[1]}, [r12], r3
- vld1.32 {d4[0]}, [r2], r3
- vld1.32 {d10[0]}, [r12], r3
- vld1.32 {d4[1]}, [r2], r3
- vld1.32 {d10[1]}, [r12]
-
- sxth r1, r0 ; lo *dc
- add r1, r1, #4
- asr r1, r1, #3
- vdup.16 q0, r1
- sxth r0, r0, ror #16 ; hi *dc
- add r0, r0, #4
- asr r0, r0, #3
- vdup.16 q3, r0
-
- vaddw.u8 q1, q0, d2 ; lo
- vaddw.u8 q2, q0, d4
- vaddw.u8 q4, q3, d8 ; hi
- vaddw.u8 q5, q3, d10
-
- vqmovun.s16 d2, q1 ; lo
- vqmovun.s16 d4, q2
- vqmovun.s16 d8, q4 ; hi
- vqmovun.s16 d10, q5
-
- sub r2, r2, r3, lsl #2 ; dst - 4*stride
- add r0, r2, #4
-
- vst1.32 {d2[0]}, [r2], r3 ; lo
- vst1.32 {d8[0]}, [r0], r3 ; hi
- vst1.32 {d2[1]}, [r2], r3
- vst1.32 {d8[1]}, [r0], r3
- vst1.32 {d4[0]}, [r2], r3
- vst1.32 {d10[0]}, [r0], r3
- vst1.32 {d4[1]}, [r2]
- vst1.32 {d10[1]}, [r0]
-
- bx lr
-
- ENDP ;|idct_dequant_dc_0_2x_neon|
- END
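
This deleted NEON routine served the DC-only case: when every AC coefficient is zero, all 16 outputs of the 4x4 inverse transform equal the same rounded DC value, so the transform collapses to one add per pixel (the asm handles two adjacent blocks per call). A single-block scalar sketch of the same computation, with illustrative names:

    static unsigned char clamp_255(int v)
    {
        return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void idct_dc_only_add_sketch(int dc, unsigned char *dst, int stride)
    {
        int a = (dc + 4) >> 3;   /* the rounding the full IDCT would apply */
        int r, c;
        for (r = 0; r < 4; r++, dst += stride)
            for (c = 0; c < 4; c++)
                dst[c] = clamp_255(dst[c] + a);   /* add into the predictor */
    }
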
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
deleted file mode 100644
index eea41f68c..000000000
--- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
+++ /dev/null
@@ -1,208 +0,0 @@
-;
-; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_dc_full_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_full_2x_neon(short *q, short *dq,
-; unsigned char *dst, int stride, short *dc);
-; r0 *q,
-; r1 *dq,
-; r2 *dst
-; r3 stride
-; sp *dc
-|idct_dequant_dc_full_2x_neon| PROC
- push {r4}
-
- vld1.16 {q0, q1}, [r1] ; dq (same l/r)
- vld1.16 {q2, q3}, [r0] ; l q
- add r0, r0, #32
- vld1.16 {q4, q5}, [r0] ; r q
- add r12, r2, #4
-
- ; interleave the predictors
- vld1.32 {d28[0]}, [r2], r3 ; l pre
- vld1.32 {d28[1]}, [r12], r3 ; r pre
- vld1.32 {d29[0]}, [r2], r3
- vld1.32 {d29[1]}, [r12], r3
- vld1.32 {d30[0]}, [r2], r3
- vld1.32 {d30[1]}, [r12], r3
- vld1.32 {d31[0]}, [r2], r3
- ldr r1, [sp, #4] ; *dc
- vld1.32 {d31[1]}, [r12]
-
- adr r4, cospi8sqrt2minus1 ; pointer to the first constant
-
- ldrh r12, [r1], #2 ; lo *dc
- ldrh r1, [r1] ; hi *dc
-
- ; dequant: q[i] = q[i] * dq[i]
- vmul.i16 q2, q2, q0
- vmul.i16 q3, q3, q1
- vmul.i16 q4, q4, q0
- vmul.i16 q5, q5, q1
-
- ; move dc up to neon and overwrite first element
- vmov.16 d4[0], r12
- vmov.16 d8[0], r1
-
- vld1.16 {d0}, [r4]
-
- ; q2: l0r0 q3: l8r8
- ; q4: l4r4 q5: l12r12
- vswp d5, d8
- vswp d7, d10
-
- ; _CONSTANTS_ * 4,12 >> 16
- ; q6: 4 * sinpi : c1/temp1
- ; q7: 12 * sinpi : d1/temp2
- ; q8: 4 * cospi
- ; q9: 12 * cospi
- vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q7, q5, d0[2]
- vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q9, q5, d0[0]
-
- vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
- vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
-
- ; vqdmulh only accepts signed values. this was a problem because
- ; our constant had the high bit set, and was treated as a negative value.
- ; vqdmulh also doubles the value before it shifts by 16. we need to
- ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
- ; so we can shift the constant without losing precision. this avoids
- ; shift again afterward, but also avoids the sign issue. win win!
- ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
- ; pre-shift it
- vshr.s16 q8, q8, #1
- vshr.s16 q9, q9, #1
-
- ; q4: 4 + 4 * cospi : d1/temp1
- ; q5: 12 + 12 * cospi : c1/temp2
- vqadd.s16 q4, q4, q8
- vqadd.s16 q5, q5, q9
-
- ; c1 = temp1 - temp2
- ; d1 = temp1 + temp2
- vqsub.s16 q2, q6, q5
- vqadd.s16 q3, q4, q7
-
- ; [0]: a1+d1
- ; [1]: b1+c1
- ; [2]: b1-c1
- ; [3]: a1-d1
- vqadd.s16 q4, q10, q3
- vqadd.s16 q5, q11, q2
- vqsub.s16 q6, q11, q2
- vqsub.s16 q7, q10, q3
-
- ; rotate
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
- ; idct loop 2
- ; q4: l 0, 4, 8,12 r 0, 4, 8,12
- ; q5: l 1, 5, 9,13 r 1, 5, 9,13
- ; q6: l 2, 6,10,14 r 2, 6,10,14
- ; q7: l 3, 7,11,15 r 3, 7,11,15
-
- ; q8: 1 * sinpi : c1/temp1
- ; q9: 3 * sinpi : d1/temp2
- ; q10: 1 * cospi
- ; q11: 3 * cospi
- vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q9, q7, d0[2]
- vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q11, q7, d0[0]
-
- vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
- vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
-
- ; see note on shifting above
- vshr.s16 q10, q10, #1
- vshr.s16 q11, q11, #1
-
- ; q10: 1 + 1 * cospi : d1/temp1
- ; q11: 3 + 3 * cospi : c1/temp2
- vqadd.s16 q10, q5, q10
- vqadd.s16 q11, q7, q11
-
- ; q8: c1 = temp1 - temp2
- ; q9: d1 = temp1 + temp2
- vqsub.s16 q8, q8, q11
- vqadd.s16 q9, q10, q9
-
- ; a1+d1
- ; b1+c1
- ; b1-c1
- ; a1-d1
- vqadd.s16 q4, q2, q9
- vqadd.s16 q5, q3, q8
- vqsub.s16 q6, q3, q8
- vqsub.s16 q7, q2, q9
-
- ; +4 >> 3 (rounding)
- vrshr.s16 q4, q4, #3 ; lo
- vrshr.s16 q5, q5, #3
- vrshr.s16 q6, q6, #3 ; hi
- vrshr.s16 q7, q7, #3
-
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- ; adding pre
- ; input is still packed. pre was read interleaved
- vaddw.u8 q4, q4, d28
- vaddw.u8 q5, q5, d29
- vaddw.u8 q6, q6, d30
- vaddw.u8 q7, q7, d31
-
- vmov.i16 q14, #0
- vmov q15, q14
- vst1.16 {q14, q15}, [r0] ; write over high input
- sub r0, r0, #32
- vst1.16 {q14, q15}, [r0] ; write over low input
-
- sub r2, r2, r3, lsl #2 ; dst - 4*stride
- add r1, r2, #4 ; hi
-
- ;saturate and narrow
- vqmovun.s16 d0, q4 ; lo
- vqmovun.s16 d1, q5
- vqmovun.s16 d2, q6 ; hi
- vqmovun.s16 d3, q7
-
- vst1.32 {d0[0]}, [r2], r3 ; lo
- vst1.32 {d0[1]}, [r1], r3 ; hi
- vst1.32 {d1[0]}, [r2], r3
- vst1.32 {d1[1]}, [r1], r3
- vst1.32 {d2[0]}, [r2], r3
- vst1.32 {d2[1]}, [r1], r3
- vst1.32 {d3[0]}, [r2]
- vst1.32 {d3[1]}, [r1]
-
- pop {r4}
- bx lr
-
- ENDP ; |idct_dequant_dc_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2 DCD 0x4546
-
- END
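
The long comment in the deleted routine above deserves a concrete check. vqdmulh computes (a*b*2)>>16 per signed 16-bit lane; since sinpi8sqrt2 (0x8a8c) has its low bit clear, halving it to 0x4546 absorbs the doubling exactly, while the odd cospi8sqrt2minus1 (0x4e7b) is kept intact and the doubled product is halved afterwards, which by floor-division composition is also exact. A scalar demonstration (assumes arithmetic right shift on negatives, as NEON provides; lane saturation is ignored):

    #include <assert.h>

    int main(void)
    {
        int x;
        for (x = -32768; x <= 32767; x++) {
            /* sinpi8sqrt2: the pre-halved constant absorbs vqdmulh's doubling */
            assert(((x * 0x8a8c) >> 16) == ((x * 0x4546 * 2) >> 16));
            /* cospi8sqrt2minus1: halving the doubled product afterwards is exact */
            assert(((x * 0x4e7b) >> 16) == (((x * 0x4e7b * 2) >> 16) >> 1));
        }
        return 0;
    }
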
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index e501b9ec7..11d0e38f5 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -15,7 +15,7 @@
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/recon.h"
#include "vp8/common/reconinter.h"
-#include "dequantize.h"
+#include "vp8/common/dequantize.h"
#include "detokenize.h"
#include "vp8/common/invtrans.h"
#include "vp8/common/alloccommon.h"
@@ -32,7 +32,7 @@
#endif
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/idct.h"
-#include "dequantize.h"
+
#include "vp8/common/threading.h"
#include "decoderthreading.h"
#include "dboolhuff.h"
@@ -109,32 +109,12 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
#define RTCD_VTABLE(x) NULL
#endif
-/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
- * to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
- */
-static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
-{
- if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
- {
- RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
- RECON_INVOKE(&pbi->common.rtcd.recon,
- build_intra_predictors_mby_s)(xd);
- }
- else
- {
- vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.y_stride, xd->dst.uv_stride);
- }
-}
-
static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
unsigned int mb_idx)
{
- int eobtotal = 0;
- int throw_residual = 0;
MB_PREDICTION_MODE mode;
int i;
+ int corruption_detected = 0;
if (xd->mode_info_context->mbmi.mb_skip_coeff)
{
@@ -142,27 +122,51 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
}
else if (!vp8dx_bool_error(xd->current_bc))
{
+ int eobtotal;
eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+ /* Special case: Force the loopfilter to skip when eobtotal is zero */
+ xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
}
+ mode = xd->mode_info_context->mbmi.mode;
+
+ if (xd->segmentation_enabled)
+ mb_init_dequantizer(pbi, xd);
- mode = xd->mode_info_context->mbmi.mode;
+#if CONFIG_ERROR_CONCEALMENT
- if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV &&
- !vp8dx_bool_error(xd->current_bc))
+ if(pbi->ec_active)
{
- /* Special case: Force the loopfilter to skip when eobtotal and
- * mb_skip_coeff are zero.
- * */
- xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+ int throw_residual;
+ /* When we have independent partitions we can apply residual even
+ * though other partitions within the frame are corrupt.
+ */
+ throw_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual);
+ throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
- skip_recon_mb(pbi, xd);
- return;
+ if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+ {
+ /* MB with corrupt residuals or corrupt mode/motion vectors.
+ * Better to use the predictor as reconstruction.
+ */
+ pbi->frame_corrupt_residual = 1;
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ vp8_conceal_corrupt_mb(xd);
+
+
+ corruption_detected = 1;
+
+ /* force idct to be skipped for B_PRED and use the
+ * prediction only for reconstruction
+             */
+ vpx_memset(xd->eobs, 0, 25);
+ }
}
+#endif
- if (xd->segmentation_enabled)
- mb_init_dequantizer(pbi, xd);
/* do prediction */
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
@@ -173,113 +177,117 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
{
RECON_INVOKE(&pbi->common.rtcd.recon,
build_intra_predictors_mby_s)(xd);
- } else {
+ }
+ else
+ {
+ /* clear out residual eob info */
+ if(xd->mode_info_context->mbmi.mb_skip_coeff)
+ vpx_memset(xd->eobs, 0, 25);
+
vp8_intra_prediction_down_copy(xd);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ int b_mode = xd->mode_info_context->bmi[i].as_mode;
+
+ RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
+ ( *(b->base_dst) + b->dst, b->dst_stride, b_mode,
+ *(b->base_dst) + b->dst, b->dst_stride );
+
+ if (xd->eobs[i])
+ {
+ if (xd->eobs[i] > 1)
+ {
+ DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
+ (b->qcoeff, b->dequant,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+ (b->qcoeff[0] * b->dequant[0],
+ *(b->base_dst) + b->dst, b->dst_stride,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
}
}
else
{
vp8_build_inter_predictors_mb(xd);
}
- /* When we have independent partitions we can apply residual even
- * though other partitions within the frame are corrupt.
- */
- throw_residual = (!pbi->independent_partitions &&
- pbi->frame_corrupt_residual);
- throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
#if CONFIG_ERROR_CONCEALMENT
- if (pbi->ec_active &&
- (mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+ if (corruption_detected)
{
- /* MB with corrupt residuals or corrupt mode/motion vectors.
- * Better to use the predictor as reconstruction.
- */
- pbi->frame_corrupt_residual = 1;
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
- vp8_conceal_corrupt_mb(xd);
return;
}
#endif
- /* dequantization and idct */
- if (mode == B_PRED)
+ if(!xd->mode_info_context->mbmi.mb_skip_coeff)
{
- for (i = 0; i < 16; i++)
+ /* dequantization and idct */
+ if (mode != B_PRED)
{
- BLOCKD *b = &xd->block[i];
- int b_mode = xd->mode_info_context->bmi[i].as_mode;
+ short *DQC = xd->block[0].dequant;
- RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
- ( *(b->base_dst) + b->dst, b->dst_stride, b_mode,
- *(b->base_dst) + b->dst, b->dst_stride );
+ /* save the dc dequant constant in case it is overridden */
+ short dc_dequant_temp = DQC[0];
- if (xd->eobs[i] )
+ if (mode != SPLITMV)
{
- if (xd->eobs[i] > 1)
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
{
- DEQUANT_INVOKE(&pbi->dequant, idct_add)
- (b->qcoeff, b->dequant,
- *(b->base_dst) + b->dst, b->dst_stride);
+ DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b);
+
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
}
else
{
- IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
- (b->qcoeff[0] * b->dequant[0],
- *(b->base_dst) + b->dst, b->dst_stride,
- *(b->base_dst) + b->dst, b->dst_stride);
+ b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
+ xd->qcoeff);
((int *)b->qcoeff)[0] = 0;
}
+
+ /* override the dc dequant constant in order to preserve the
+ * dc components
+ */
+ DQC[0] = 1;
}
- }
- }
- else if (mode == SPLITMV)
- {
- DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
- (xd->qcoeff, xd->block[0].dequant,
- xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs);
- }
- else
- {
- BLOCKD *b = &xd->block[24];
+ DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
+ (xd->qcoeff, xd->block[0].dequant,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
- /* do 2nd order transform on the dc block */
- if (xd->eobs[24] > 1)
- {
- DEQUANT_INVOKE(&pbi->dequant, block)(b);
-
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
- }
- else
- {
- b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
+ /* restore the dc dequant constant */
+ DQC[0] = dc_dequant_temp;
}
- DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
- (xd->qcoeff, xd->block[0].dequant,
- xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs, xd->block[24].diff);
+ DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
+ (xd->qcoeff+16*16, xd->block[16].dequant,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
}
-
- DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
- (xd->qcoeff+16*16, xd->block[16].dequant,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.uv_stride, xd->eobs+16);
}
-
static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
{
int ret_val = 0;
@@ -476,7 +484,8 @@ static void setup_token_decoder(VP8D_COMP *pbi,
const unsigned char* token_part_sizes)
{
vp8_reader *bool_decoder = &pbi->bc2;
- int fragment_idx, partition_idx;
+ unsigned int partition_idx;
+ int fragment_idx;
int num_token_partitions;
const unsigned char *first_fragment_end = pbi->fragments[0] +
pbi->fragment_sizes[0];
@@ -934,16 +943,38 @@ int vp8_decode_frame(VP8D_COMP *pbi)
if (!pc->refresh_golden_frame)
pc->copy_buffer_to_gf = vp8_read_literal(bc, 2);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't copy to the golden if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->copy_buffer_to_gf = 0;
+#endif
+
pc->copy_buffer_to_arf = 0;
if (!pc->refresh_alt_ref_frame)
pc->copy_buffer_to_arf = vp8_read_literal(bc, 2);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't copy to the alt-ref if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->copy_buffer_to_arf = 0;
+#endif
+
+
pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc);
pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc);
}
pc->refresh_entropy_probs = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh the probabilities if the bit is
+ * missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_entropy_probs = 0;
+#endif
if (pc->refresh_entropy_probs == 0)
{
vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
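
The subtlest edit in this file's decode_macroblock hunks is the DC-dequant override: the inverse Walsh-Hadamard transform now writes already-dequantized DC values directly into xd->qcoeff, so the Y-block pass must not multiply those slots by the DC factor a second time; DQC[0] is forced to 1 for that one call and then restored. The shape of the sequence, as a sketch with hypothetical helper names:

    /* hypothetical stand-ins for the entry points used in the hunk */
    void inverse_wht(short *dc_block, short *qcoeff);
    void idct_add_y_block(short *qcoeff, short *dequant,
                          unsigned char *dst, int stride, char *eobs);

    void reconstruct_y_sketch(short *qcoeff, short *DQC, short *dc_block,
                              unsigned char *dst, int stride, char *eobs)
    {
        short saved_dc = DQC[0];        /* keep the real DC dequant factor */

        inverse_wht(dc_block, qcoeff);  /* scatters dequantized DCs into qcoeff */
        DQC[0] = 1;                     /* so the block pass won't scale them again */
        idct_add_y_block(qcoeff, DQC, dst, stride, eobs);

        DQC[0] = saved_dc;              /* restore for the next macroblock */
    }
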
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index 1d4568593..ba94c58bb 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -15,7 +15,7 @@
#include "vpx_ports/mem.h"
#include "detokenize.h"
-#define BOOL_DATA UINT8
+#define BOOL_DATA unsigned char
#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) =
@@ -157,10 +157,10 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
DECODE_AND_APPLYSIGN(val) \
Prob = coef_probs + (ENTROPY_NODES*2); \
if(c < 15){\
- qcoeff_ptr [ scan[c] ] = (INT16) v; \
+ qcoeff_ptr [ scan[c] ] = (int16_t) v; \
++c; \
goto DO_WHILE; }\
- qcoeff_ptr [ 15 ] = (INT16) v; \
+ qcoeff_ptr [ 15 ] = (int16_t) v; \
goto BLOCK_FINISHED;
@@ -172,7 +172,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
{\
range = range-split;\
value = value-bigsplit;\
- val += ((UINT16)1<<bits_count);\
+ val += ((uint16_t)1<<bits_count);\
}\
else\
{\
@@ -340,12 +340,12 @@ ONE_CONTEXT_NODE_0_:
if (c < 15)
{
- qcoeff_ptr [ scan[c] ] = (INT16) v;
+ qcoeff_ptr [ scan[c] ] = (int16_t) v;
++c;
goto DO_WHILE;
}
- qcoeff_ptr [ 15 ] = (INT16) v;
+ qcoeff_ptr [ 15 ] = (int16_t) v;
BLOCK_FINISHED:
eobs[i] = c;
eobtotal += c;
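
As background for the macro soup in this file: it hand-unrolls VP8's boolean (range) decoder, whose per-bit step splits the current range in proportion to the probability and compares the split against the value window, exactly the range/value/bigsplit updates visible in the hunks. A scalar sketch of one such step (renormalization and byte refill elided; this is a reconstruction, not the file's code):

    static int decode_bool_sketch(unsigned int *range, unsigned int *value,
                                  unsigned int prob /* 0..255 */)
    {
        unsigned int split = 1 + (((*range - 1) * prob) >> 8);
        unsigned int bigsplit = split << 24;  /* align with the value window */
        int bit = 0;

        if (*value >= bigsplit) {             /* coded symbol was a 1 */
            *range -= split;
            *value -= bigsplit;
            bit = 1;
        } else {
            *range = split;                   /* coded symbol was a 0 */
        }
        /* renormalize here: shift range/value up, refill bytes (elided) */
        return bit;
    }
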
diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c
index 86fa191d3..b77d743f7 100644
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c
@@ -491,33 +491,6 @@ static void find_neighboring_blocks(MODE_INFO *mi,
assert(i == 20);
}
-/* Calculates which reference frame type is dominating among the neighbors */
-static MV_REFERENCE_FRAME dominant_ref_frame(EC_BLOCK *neighbors)
-{
- /* Default to referring to "skip" */
- MV_REFERENCE_FRAME dom_ref_frame = LAST_FRAME;
- int max_ref_frame_cnt = 0;
- int ref_frame_cnt[MAX_REF_FRAMES] = {0};
- int i;
- /* Count neighboring reference frames */
- for (i = 0; i < NUM_NEIGHBORS; ++i)
- {
- if (neighbors[i].ref_frame < MAX_REF_FRAMES &&
- neighbors[i].ref_frame != INTRA_FRAME)
- ++ref_frame_cnt[neighbors[i].ref_frame];
- }
- /* Find maximum */
- for (i = 0; i < MAX_REF_FRAMES; ++i)
- {
- if (ref_frame_cnt[i] > max_ref_frame_cnt)
- {
- dom_ref_frame = i;
- max_ref_frame_cnt = ref_frame_cnt[i];
- }
- }
- return dom_ref_frame;
-}
-
/* Interpolates all motion vectors for a macroblock from the neighboring blocks'
* motion vectors.
*/
@@ -591,7 +564,6 @@ void vp8_interpolate_motion(MACROBLOCKD *mb,
{
/* Find relevant neighboring blocks */
EC_BLOCK neighbors[NUM_NEIGHBORS];
- MV_REFERENCE_FRAME dom_ref_frame;
int i;
/* Initialize the array. MAX_REF_FRAMES is interpreted as "doesn't exist" */
for (i = 0; i < NUM_NEIGHBORS; ++i)
@@ -604,13 +576,11 @@ void vp8_interpolate_motion(MACROBLOCKD *mb,
mb_row, mb_col,
mb_rows, mb_cols,
mb->mode_info_stride);
- /* Determine the dominant block type */
- dom_ref_frame = dominant_ref_frame(neighbors);
- /* Interpolate MVs for the missing blocks
- * from the dominating MVs */
- interpolate_mvs(mb, neighbors, dom_ref_frame);
+ /* Interpolate MVs for the missing blocks from the surrounding
+ * blocks which refer to the last frame. */
+ interpolate_mvs(mb, neighbors, LAST_FRAME);
- mb->mode_info_context->mbmi.ref_frame = dom_ref_frame;
+ mb->mode_info_context->mbmi.ref_frame = LAST_FRAME;
mb->mode_info_context->mbmi.mode = SPLITMV;
mb->mode_info_context->mbmi.uv_mode = DC_PRED;
mb->mode_info_context->mbmi.partitioning = 3;
diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c
index 9c42bc62d..8a84e566a 100644
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -10,7 +10,7 @@
#include "vpx_config.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"
#include "vp8/decoder/onyxd_int.h"
extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
@@ -20,13 +20,7 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
{
/* Pure C: */
#if CONFIG_RUNTIME_CPU_DETECT
- pbi->mb.rtcd = &pbi->common.rtcd;
- pbi->dequant.block = vp8_dequantize_b_c;
- pbi->dequant.idct_add = vp8_dequant_idct_add_c;
- pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
- pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c;
- pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
+ pbi->mb.rtcd = &pbi->common.rtcd;
#endif
#if ARCH_X86 || ARCH_X86_64
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 077954948..80648d39f 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -20,7 +20,6 @@
#include "vpx_scale/yv12extend.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/swapyv12buffer.h"
-#include "vp8/common/g_common.h"
#include "vp8/common/threading.h"
#include "decoderthreading.h"
#include <stdio.h>
@@ -57,7 +56,7 @@ void vp8dx_initialize()
}
-VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
+struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
{
VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
@@ -117,14 +116,12 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
*/
pbi->independent_partitions = 0;
- return (VP8D_PTR) pbi;
+ return pbi;
}
-void vp8dx_remove_decompressor(VP8D_PTR ptr)
+void vp8dx_remove_decompressor(VP8D_COMP *pbi)
{
- VP8D_COMP *pbi = (VP8D_COMP *) ptr;
-
if (!pbi)
return;
@@ -142,9 +139,8 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
}
-vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
{
- VP8D_COMP *pbi = (VP8D_COMP *) ptr;
VP8_COMMON *cm = &pbi->common;
int ref_fb_idx;
@@ -174,9 +170,8 @@ vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, Y
}
-vpx_codec_err_t vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
{
- VP8D_COMP *pbi = (VP8D_COMP *) ptr;
VP8_COMMON *cm = &pbi->common;
int *ref_fb_ptr = NULL;
int free_fb;
@@ -301,19 +296,18 @@ static int swap_frame_buffers (VP8_COMMON *cm)
return err;
}
-int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, int64_t time_stamp)
+int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsigned char *source, int64_t time_stamp)
{
#if HAVE_ARMV7
int64_t dx_store_reg[8];
#endif
- VP8D_COMP *pbi = (VP8D_COMP *) ptr;
VP8_COMMON *cm = &pbi->common;
int retcode = 0;
/*if(pbi->ready_for_new_data == 0)
return -1;*/
- if (ptr == 0)
+ if (pbi == 0)
{
return -1;
}
@@ -359,28 +353,38 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
pbi->fragment_sizes[0] = 0;
}
- if (pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0)
+ if (!pbi->ec_active &&
+ pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0)
{
- /* This is used to signal that we are missing frames.
- * We do not know if the missing frame(s) was supposed to update
- * any of the reference buffers, but we act conservative and
- * mark only the last buffer as corrupted.
- */
- cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
-
/* If error concealment is disabled we won't signal missing frames
* to the decoder.
*/
- if (!pbi->ec_active)
+ if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1)
{
- /* Signal that we have no frame to show. */
- cm->show_frame = 0;
+ /* The last reference shares buffer with another reference
+ * buffer. Move it to its own buffer before setting it as
+ * corrupt, otherwise we will make multiple buffers corrupt.
+ */
+ const int prev_idx = cm->lst_fb_idx;
+ cm->fb_idx_ref_cnt[prev_idx]--;
+ cm->lst_fb_idx = get_free_fb(cm);
+ vp8_yv12_copy_frame_ptr(&cm->yv12_fb[prev_idx],
+ &cm->yv12_fb[cm->lst_fb_idx]);
+ }
+ /* This is used to signal that we are missing frames.
+ * We do not know if the missing frame(s) were supposed to update
+ * any of the reference buffers, but we act conservatively and
+ * mark only the last buffer as corrupted.
+ */
+ cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
- pbi->num_fragments = 0;
+ /* Signal that we have no frame to show. */
+ cm->show_frame = 0;
- /* Nothing more to do. */
- return 0;
- }
+ pbi->num_fragments = 0;
+
+ /* Nothing more to do. */
+ return 0;
}
#if HAVE_ARMV7
@@ -565,10 +569,9 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
pbi->common.error.setjmp = 0;
return retcode;
}
-int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
+int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
{
int ret = -1;
- VP8D_COMP *pbi = (VP8D_COMP *) ptr;
if (pbi->ready_for_new_data == 1)
return ret;
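
The reworked missing-frame path above must respect the frame-buffer reference counts: if the last-frame reference currently shares its buffer with another reference, marking it corrupt would poison them all, so it is first detached onto a private copy via get_free_fb(). The logic reduced to plain arrays, with illustrative names and the pixel copy elided:

    #define NUM_FB 4

    static int fb_ref_cnt[NUM_FB];   /* references sharing each buffer */
    static int fb_corrupted[NUM_FB];
    static int lst_fb_idx;           /* buffer the "last frame" ref points at */

    static int get_free_fb_sketch(void)
    {
        int i;
        for (i = 0; i < NUM_FB; i++)
            if (fb_ref_cnt[i] == 0) { fb_ref_cnt[i] = 1; return i; }
        return -1;  /* the real pool is sized so this cannot happen */
    }

    static void mark_last_frame_corrupt(void)
    {
        if (fb_ref_cnt[lst_fb_idx] > 1) {
            /* shared buffer: detach "last" onto its own copy first */
            int prev = lst_fb_idx;
            fb_ref_cnt[prev]--;
            lst_fb_idx = get_free_fb_sketch();
            /* ...frame pixels copied from prev to lst_fb_idx here... */
        }
        fb_corrupted[lst_fb_idx] = 1;  /* only one logical reference affected */
    }
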
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index 519a7f2b9..cb2593b2c 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -16,7 +16,8 @@
#include "treereader.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/threading.h"
-#include "dequantize.h"
+
+
#if CONFIG_ERROR_CONCEALMENT
#include "ec_types.h"
#endif
@@ -43,7 +44,7 @@ typedef struct
} DATARATE;
-typedef struct VP8Decompressor
+typedef struct VP8D_COMP
{
DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -93,11 +94,6 @@ typedef struct VP8Decompressor
DATARATE dr[16];
-#if CONFIG_RUNTIME_CPU_DETECT
- vp8_dequant_rtcd_vtable_t dequant;
-#endif
-
-
vp8_prob prob_intra;
vp8_prob prob_last;
vp8_prob prob_gf;
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index eba5830d5..947b3a1c6 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -175,36 +175,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
#endif
/* dequantization and idct */
- if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
- {
- BLOCKD *b = &xd->block[24];
- DEQUANT_INVOKE(&pbi->dequant, block)(b);
-
- /* do 2nd order transform on the dc block */
- if (xd->eobs[24] > 1)
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
- }
- else
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- }
-
- DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
- (xd->qcoeff, xd->block[0].dequant,
- xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs, xd->block[24].diff);
- }
- else if (xd->mode_info_context->mbmi.mode == B_PRED)
+ if (xd->mode_info_context->mbmi.mode == B_PRED)
{
for (i = 0; i < 16; i++)
{
@@ -214,37 +185,81 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,
b->dst_stride, mb_row, mb_col, i);
- if (xd->eobs[i] > 1)
+ if (xd->eobs[i] )
{
- DEQUANT_INVOKE(&pbi->dequant, idct_add)
- (b->qcoeff, b->dequant,
- *(b->base_dst) + b->dst, b->dst_stride);
+ if (xd->eobs[i] > 1)
+ {
+ DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
+ (b->qcoeff, b->dequant,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+ (b->qcoeff[0] * b->dequant[0],
+ *(b->base_dst) + b->dst, b->dst_stride,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
+ }
+ else
+ {
+ short *DQC = xd->block[0].dequant;
+
+ DECLARE_ALIGNED(16, short, local_dequant[16]);
+
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b);
+
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
}
else
{
- IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
- (b->qcoeff[0] * b->dequant[0],
- *(b->base_dst) + b->dst, b->dst_stride,
- *(b->base_dst) + b->dst, b->dst_stride);
+ b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff);
((int *)b->qcoeff)[0] = 0;
}
+
+ /* make a local copy of the dequant constants */
+ vpx_memcpy(local_dequant, xd->block[0].dequant,
+ sizeof(local_dequant));
+
+ /* override the dc dequant constant */
+ local_dequant[0] = 1;
+
+ /* use the new dequant constants */
+ DQC = local_dequant;
}
- }
- else
- {
- DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
- (xd->qcoeff, xd->block[0].dequant,
+
+ DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
+ (xd->qcoeff, DQC,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
- DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
+ DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->block[16].dequant,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}
-
static THREAD_FUNCTION thread_decoding_proc(void *p_data)
{
int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
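
Note how this threaded decode_macroblock differs from the single-threaded one in decodframe.c above: instead of patching DQC[0] in place and restoring it, each thread copies the 16-entry dequant table to its stack first, since the table the block pointers reference is shared between the row-decoding threads and an in-place write would race. A sketch of that variant (helper name hypothetical):

    #include <string.h>

    /* hypothetical stand-in for the dequant+idct+add pass */
    void idct_add_y_block(short *qcoeff, short *dequant,
                          unsigned char *dst, int stride, char *eobs);

    void reconstruct_y_mt_sketch(short *qcoeff, const short *shared_dequant,
                                 unsigned char *dst, int stride, char *eobs)
    {
        short local_dequant[16];  /* thread-private; DECLARE_ALIGNED(16, ...)
                                   * gives it 16-byte alignment in the real code */

        memcpy(local_dequant, shared_dequant, sizeof(local_dequant));
        local_dequant[0] = 1;     /* safe: no other thread sees this copy */

        idct_add_y_block(qcoeff, local_dequant, dst, stride, eobs);
    }
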
diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c
index 443150483..27bf5ddbd 100644
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -13,50 +13,7 @@
#include "vpx_ports/x86.h"
#include "vp8/decoder/onyxd_int.h"
-
-#if HAVE_MMX
-void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
-
-void vp8_dequantize_b_mmx(BLOCKD *d)
-{
- short *sq = (short *) d->qcoeff;
- short *dq = (short *) d->dqcoeff;
- short *q = (short *) d->dequant;
- vp8_dequantize_b_impl_mmx(sq, dq, q);
-}
-#endif
-
void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
{
-#if CONFIG_RUNTIME_CPU_DETECT
- int flags = x86_simd_caps();
-
- /* Note:
- *
- * This platform can be built without runtime CPU detection as well. If
- * you modify any of the function mappings present in this file, be sure
- * to also update them in static mapings (<arch>/filename_<arch>.h)
- */
- /* Override default functions with fastest ones for this CPU. */
-#if HAVE_MMX
- if (flags & HAS_MMX)
- {
- pbi->dequant.block = vp8_dequantize_b_mmx;
- pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
- pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx;
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;
- pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
- pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
- }
-#endif
-#if HAVE_SSE2
- if (flags & HAS_SSE2)
- {
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2;
- pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
- pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
- }
-#endif
-#endif
}
diff --git a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
index 30513f912..5b7e8f66f 100644
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -13,6 +13,7 @@
EXPORT |vp8_encode_bool|
EXPORT |vp8_stop_encode|
EXPORT |vp8_encode_value|
+ IMPORT |vp8_validate_buffer_arm|
INCLUDE asm_enc_offsets.asm
@@ -22,6 +23,20 @@
AREA |.text|, CODE, READONLY
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
; r0 BOOL_CODER *br
; r1 unsigned char *source
; r2 unsigned char *source_end
@@ -43,7 +58,7 @@
; r1 int bit
; r2 int probability
|vp8_encode_bool| PROC
- push {r4-r9, lr}
+ push {r4-r10, lr}
mov r4, r2
@@ -106,6 +121,9 @@ token_high_bit_not_set
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r1, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r9, r1 ; validate_buffer at pos
+
strb r7, [r9, r4] ; w->buffer[w->pos++]
token_count_lt_zero
@@ -114,7 +132,7 @@ token_count_lt_zero
str r2, [r0, #vp8_writer_lowvalue]
str r5, [r0, #vp8_writer_range]
str r3, [r0, #vp8_writer_count]
- pop {r4-r9, pc}
+ pop {r4-r10, pc}
ENDP
; r0 BOOL_CODER *br
@@ -179,6 +197,9 @@ token_high_bit_not_set_se
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r1, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r9, r1 ; validate_buffer at pos
+
strb r7, [r9, r4] ; w->buffer[w->pos++]
token_count_lt_zero_se
@@ -198,7 +219,7 @@ token_count_lt_zero_se
; r1 int data
; r2 int bits
|vp8_encode_value| PROC
- push {r4-r11, lr}
+ push {r4-r12, lr}
mov r10, r2
@@ -270,6 +291,9 @@ token_high_bit_not_set_ev
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r9, r11 ; validate_buffer at pos
+
strb r7, [r9, r4] ; w->buffer[w->pos++]
token_count_lt_zero_ev
@@ -281,7 +305,7 @@ token_count_lt_zero_ev
str r2, [r0, #vp8_writer_lowvalue]
str r5, [r0, #vp8_writer_range]
str r3, [r0, #vp8_writer_count]
- pop {r4-r11, pc}
+ pop {r4-r12, pc}
ENDP
END
diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
index 933717c63..a1cd46704 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -10,6 +10,7 @@
EXPORT |vp8cx_pack_tokens_armv5|
+ IMPORT |vp8_validate_buffer_arm|
INCLUDE asm_enc_offsets.asm
@@ -19,6 +20,22 @@
AREA |.text|, CODE, READONLY
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
+
; r0 vp8_writer *w
; r1 const TOKENEXTRA *p
; r2 int xcount
@@ -26,11 +43,11 @@
; s0 vp8_extra_bits
; s1 vp8_coef_tree
|vp8cx_pack_tokens_armv5| PROC
- push {r4-r11, lr}
+ push {r4-r12, lr}
+ sub sp, sp, #16
; Add size of xcount * sizeof (TOKENEXTRA) to get stop
; sizeof (TOKENEXTRA) is 8
- sub sp, sp, #12
add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
str r2, [sp, #0]
str r3, [sp, #8] ; save vp8_coef_encodings
@@ -57,7 +74,7 @@ while_p_lt_stop
subne r8, r8, #1 ; --n
rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #52] ; vp8_coef_tree
+ ldr r10, [sp, #60] ; vp8_coef_tree
; v is kept in r12 during the token pack loop
lsl r12, r6, r4 ; r12 = v << 32 - n
@@ -128,12 +145,15 @@ token_high_bit_not_set
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
strb r7, [r10, r4] ; w->buffer[w->pos++]
; r10 is used earlier in the loop, but r10 is used as
; temp variable here. So after r10 is used, reload
; vp8_coef_tree_dcd into r10
- ldr r10, [sp, #52] ; vp8_coef_tree
+ ldr r10, [sp, #60] ; vp8_coef_tree
token_count_lt_zero
lsl r2, r2, r6 ; lowvalue <<= shift
@@ -142,7 +162,7 @@ token_count_lt_zero
bne token_loop
ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #48] ; vp8_extra_bits
+ ldr r7, [sp, #56] ; vp8_extra_bits
; Add t * sizeof (vp8_extra_bit_struct) to get the desired
; element. Here vp8_extra_bit_struct == 16
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
@@ -223,6 +243,9 @@ extra_high_bit_not_set
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
ldr r10, [sp, #4] ; b->tree
extra_count_lt_zero
@@ -271,7 +294,10 @@ end_high_bit_not_set
lsr r6, r2, #24 ; lowvalue >> 24
add r12, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #0x10]
+ str r12, [r0, #vp8_writer_pos]
+
+ VALIDATE_POS r7, r12 ; validate_buffer at pos
+
strb r6, [r7, r4]
end_count_zero
skip_extra_bits
@@ -284,8 +310,8 @@ check_p_lt_stop
str r2, [r0, #vp8_writer_lowvalue]
str r5, [r0, #vp8_writer_range]
str r3, [r0, #vp8_writer_count]
- add sp, sp, #12
- pop {r4-r11, pc}
+ add sp, sp, #16
+ pop {r4-r12, pc}
ENDP
END
diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
index 82bf71f35..1fa5e6c22 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -10,6 +10,7 @@
EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+ IMPORT |vp8_validate_buffer_arm|
INCLUDE asm_enc_offsets.asm
@@ -19,6 +20,21 @@
AREA |.text|, CODE, READONLY
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
; r0 VP8_COMP *cpi
; r1 vp8_writer *w
; r2 vp8_coef_encodings
@@ -26,7 +42,7 @@
; s0 vp8_coef_tree
|vp8cx_pack_mb_row_tokens_armv5| PROC
- push {r4-r11, lr}
+ push {r4-r12, lr}
sub sp, sp, #24
; Compute address of cpi->common.mb_rows
@@ -79,7 +95,7 @@ while_p_lt_stop
subne r8, r8, #1 ; --n
rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #60] ; vp8_coef_tree
+ ldr r10, [sp, #64] ; vp8_coef_tree
; v is kept in r12 during the token pack loop
lsl r12, r6, r4 ; r12 = v << 32 - n
@@ -150,12 +166,15 @@ token_high_bit_not_set
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
strb r7, [r10, r4] ; w->buffer[w->pos++]
; r10 is used earlier in the loop, but r10 is used as
; temp variable here. So after r10 is used, reload
; vp8_coef_tree_dcd into r10
- ldr r10, [sp, #60] ; vp8_coef_tree
+ ldr r10, [sp, #64] ; vp8_coef_tree
token_count_lt_zero
lsl r2, r2, r6 ; lowvalue <<= shift
@@ -245,6 +264,9 @@ extra_high_bit_not_set
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
ldr r10, [sp, #4] ; b->tree
extra_count_lt_zero
@@ -293,7 +315,10 @@ end_high_bit_not_set
lsr r6, r2, #24 ; lowvalue >> 24
add r12, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #0x10]
+ str r12, [r0, #vp8_writer_pos]
+
+ VALIDATE_POS r7, r12 ; validate_buffer at pos
+
strb r6, [r7, r4]
end_count_zero
skip_extra_bits
@@ -314,7 +339,7 @@ check_p_lt_stop
str r5, [r0, #vp8_writer_range]
str r3, [r0, #vp8_writer_count]
add sp, sp, #24
- pop {r4-r11, pc}
+ pop {r4-r12, pc}
ENDP
_VP8_COMP_common_
diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
index c061b2fab..3a183aa2f 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -10,6 +10,7 @@
EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+ IMPORT |vp8_validate_buffer_arm|
INCLUDE asm_enc_offsets.asm
@@ -19,17 +20,31 @@
AREA |.text|, CODE, READONLY
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
; r0 VP8_COMP *cpi
; r1 unsigned char *cx_data
-; r2 int num_part
-; r3 *size
+; r2 const unsigned char *cx_data_end
+; r3 int num_part
; s0 vp8_coef_encodings
; s1 vp8_extra_bits,
-; s2 const vp8_tree_index *,
+; s2 const vp8_tree_index *
|vp8cx_pack_tokens_into_partitions_armv5| PROC
- push {r4-r11, lr}
- sub sp, sp, #44
+ push {r4-r12, lr}
+ sub sp, sp, #40
; Compute address of cpi->common.mb_rows
ldr r4, _VP8_COMP_common_
@@ -39,31 +54,26 @@
ldr r5, [r4, r6] ; load up mb_rows
str r5, [sp, #36] ; save mb_rows
- str r1, [sp, #24] ; save cx_data
- str r2, [sp, #20] ; save num_part
- str r3, [sp, #8] ; save *size
-
- ; *size = 3*(num_part -1 );
- sub r2, r2, #1 ; num_part - 1
- add r2, r2, r2, lsl #1 ; 3*(num_part - 1)
- str r2, [r3]
-
- add r2, r2, r1 ; cx_data + *size
- str r2, [sp, #40] ; ptr
+ str r1, [sp, #24] ; save ptr = cx_data
+ str r3, [sp, #20] ; save num_part
+ str r2, [sp, #8] ; save cx_data_end
ldr r4, _VP8_COMP_tplist_
add r4, r0, r4
ldr r7, [r4, #0] ; dereference cpi->tp_list
str r7, [sp, #32] ; store start of cpi->tp_list
- ldr r11, _VP8_COMP_bc2_ ; load up vp8_writer out of cpi
+ ldr r11, _VP8_COMP_bc_ ; load up vp8_writer out of cpi
add r0, r0, r11
mov r11, #0
str r11, [sp, #28] ; i
numparts_loop
- ldr r10, [sp, #40] ; ptr
+ ldr r2, _vp8_writer_sz_ ; load up sizeof(vp8_writer)
+ add r0, r2 ; bc[i + 1]
+
+ ldr r10, [sp, #24] ; ptr
ldr r5, [sp, #36] ; move mb_rows to the counting section
subs r5, r5, r11 ; move start point with each partition
; mb_rows starts at i
@@ -72,6 +82,10 @@ numparts_loop
; Reset all of the VP8 Writer data for each partition that
; is processed.
; start_encode
+
+ ldr r3, [sp, #8]
+ str r3, [r0, #vp8_writer_buffer_end]
+
mov r2, #0 ; vp8_writer_lowvalue
mov r5, #255 ; vp8_writer_range
mvn r3, #23 ; vp8_writer_count
@@ -182,6 +196,9 @@ token_high_bit_not_set
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
strb r7, [r10, r4] ; w->buffer[w->pos++]
; r10 is used earlier in the loop, but r10 is used as
@@ -277,6 +294,9 @@ extra_high_bit_not_set
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
ldr r10, [sp, #4] ; b->tree
extra_count_lt_zero
@@ -320,12 +340,15 @@ end_high_bit_not_set
bne end_count_zero
ldr r4, [r0, #vp8_writer_pos]
- mvn r3, #7
+ mvn r3, #7 ; count = -8
ldr r7, [r0, #vp8_writer_buffer]
lsr r6, r2, #24 ; lowvalue >> 24
add r12, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #0x10]
+ str r12, [r0, #vp8_writer_pos]
+
+ VALIDATE_POS r7, r12 ; validate_buffer at pos
+
strb r6, [r7, r4]
end_count_zero
skip_extra_bits
@@ -401,6 +424,9 @@ token_high_bit_not_set_se
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp8_writer_pos]
sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
strb r7, [r10, r4] ; w->buffer[w->pos++]
token_count_lt_zero_se
@@ -409,33 +435,10 @@ token_count_lt_zero_se
subs r12, r12, #1
bne stop_encode_loop
- ldr r10, [sp, #8] ; *size
- ldr r11, [r10]
ldr r4, [r0, #vp8_writer_pos] ; w->pos
- add r11, r11, r4 ; *size += w->pos
- str r11, [r10]
-
- ldr r9, [sp, #20] ; num_parts
- sub r9, r9, #1
- ldr r10, [sp, #28] ; i
- cmp r10, r9 ; if(i<(num_part - 1))
- bge skip_write_partition
-
- ldr r12, [sp, #40] ; ptr
+ ldr r12, [sp, #24] ; ptr
add r12, r12, r4 ; ptr += w->pos
- str r12, [sp, #40]
-
- ldr r9, [sp, #24] ; cx_data
- mov r8, r4, asr #8
- strb r4, [r9, #0]
- strb r8, [r9, #1]
- mov r4, r4, asr #16
- strb r4, [r9, #2]
-
- add r9, r9, #3 ; cx_data += 3
- str r9, [sp, #24]
-
-skip_write_partition
+ str r12, [sp, #24]
ldr r11, [sp, #28] ; i
ldr r10, [sp, #20] ; num_parts
@@ -451,9 +454,8 @@ skip_write_partition
cmp r10, r11
bgt numparts_loop
-
- add sp, sp, #44
- pop {r4-r11, pc}
+ add sp, sp, #40
+ pop {r4-r12, pc}
ENDP
_VP8_COMP_common_
@@ -462,7 +464,9 @@ _VP8_COMMON_MBrows_
DCD vp8_common_mb_rows
_VP8_COMP_tplist_
DCD vp8_comp_tplist
-_VP8_COMP_bc2_
- DCD vp8_comp_bc2
+_VP8_COMP_bc_
+ DCD vp8_comp_bc
+_vp8_writer_sz_
+ DCD vp8_writer_sz
END
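
The block deleted near the tail of this file wrote each partition's length into the frame header as the loop went; with the switch to the bc[] writer array that bookkeeping moves elsewhere. For reference, the field the removed strb sequence emitted byte by byte is VP8's 24-bit little-endian partition size, one per partition except the last:

    static void write_partition_size_sketch(unsigned char *cx_data, unsigned int size)
    {
        cx_data[0] = (unsigned char)(size & 0xff);          /* low byte first */
        cx_data[1] = (unsigned char)((size >> 8) & 0xff);
        cx_data[2] = (unsigned char)((size >> 16) & 0xff);  /* 24 bits total */
    }
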
diff --git a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
index 0ca74387b..f329f8f73 100644
--- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
@@ -72,22 +72,23 @@ loop_block
; r0 short *diff
; r1 unsigned char *usrc
; r2 unsigned char *vsrc
-; r3 unsigned char *pred
-; stack int stride
+; r3 int src_stride
+; sp unsigned char *upred
+; sp unsigned char *vpred
+; sp int pred_stride
|vp8_subtract_mbuv_armv6| PROC
- stmfd sp!, {r4-r12, lr}
+ stmfd sp!, {r4-r11}
add r0, r0, #512 ; set *diff point to Cb
- add r3, r3, #256 ; set *pred point to Cb
-
mov r4, #8 ; loop count
- ldr r5, [sp, #40] ; stride
+ ldr r5, [sp, #32] ; upred
+ ldr r12, [sp, #40] ; pred_stride
; Subtract U block
loop_u
- ldr r6, [r1] ; src (A)
- ldr r7, [r3], #4 ; pred (A)
+ ldr r6, [r1] ; usrc (A)
+ ldr r7, [r5] ; upred (A)
uxtb16 r8, r6 ; [s2 | s0] (A)
uxtb16 r9, r7 ; [p2 | p0] (A)
@@ -97,8 +98,8 @@ loop_u
usub16 r6, r8, r9 ; [d2 | d0] (A)
usub16 r7, r10, r11 ; [d3 | d1] (A)
- ldr r10, [r1, #4] ; src (B)
- ldr r11, [r3], #4 ; pred (B)
+ ldr r10, [r1, #4] ; usrc (B)
+ ldr r11, [r5, #4] ; upred (B)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
@@ -114,7 +115,8 @@ loop_u
usub16 r6, r8, r9 ; [d2 | d0] (B)
usub16 r7, r10, r11 ; [d3 | d1] (B)
- add r1, r1, r5 ; update usrc pointer
+ add r1, r1, r3 ; update usrc pointer
+ add r5, r5, r12 ; update upred pointer
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
@@ -125,12 +127,13 @@ loop_u
bne loop_u
+ ldr r5, [sp, #36] ; vpred
mov r4, #8 ; loop count
; Subtract V block
loop_v
- ldr r6, [r2] ; src (A)
- ldr r7, [r3], #4 ; pred (A)
+ ldr r6, [r2] ; vsrc (A)
+ ldr r7, [r5] ; vpred (A)
uxtb16 r8, r6 ; [s2 | s0] (A)
uxtb16 r9, r7 ; [p2 | p0] (A)
@@ -140,8 +143,8 @@ loop_v
usub16 r6, r8, r9 ; [d2 | d0] (A)
usub16 r7, r10, r11 ; [d3 | d1] (A)
- ldr r10, [r2, #4] ; src (B)
- ldr r11, [r3], #4 ; pred (B)
+ ldr r10, [r2, #4] ; vsrc (B)
+ ldr r11, [r5, #4] ; vpred (B)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
@@ -157,7 +160,8 @@ loop_v
usub16 r6, r8, r9 ; [d2 | d0] (B)
usub16 r7, r10, r11 ; [d3 | d1] (B)
- add r2, r2, r5 ; update vsrc pointer
+ add r2, r2, r3 ; update vsrc pointer
+ add r5, r5, r12 ; update vpred pointer
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
@@ -168,23 +172,25 @@ loop_v
bne loop_v
- ldmfd sp!, {r4-r12, pc}
+ ldmfd sp!, {r4-r11}
+ bx lr
ENDP
; r0 short *diff
; r1 unsigned char *src
-; r2 unsigned char *pred
-; r3 int stride
+; r2 int src_stride
+; r3 unsigned char *pred
+; sp int pred_stride
|vp8_subtract_mby_armv6| PROC
stmfd sp!, {r4-r11}
-
+ ldr r12, [sp, #32] ; pred_stride
mov r4, #16
loop
ldr r6, [r1] ; src (A)
- ldr r7, [r2], #4 ; pred (A)
+ ldr r7, [r3] ; pred (A)
uxtb16 r8, r6 ; [s2 | s0] (A)
uxtb16 r9, r7 ; [p2 | p0] (A)
@@ -195,7 +201,7 @@ loop
usub16 r7, r10, r11 ; [d3 | d1] (A)
ldr r10, [r1, #4] ; src (B)
- ldr r11, [r2], #4 ; pred (B)
+ ldr r11, [r3, #4] ; pred (B)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
@@ -212,7 +218,7 @@ loop
usub16 r7, r10, r11 ; [d3 | d1] (B)
ldr r10, [r1, #8] ; src (C)
- ldr r11, [r2], #4 ; pred (C)
+ ldr r11, [r3, #8] ; pred (C)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
@@ -229,10 +235,10 @@ loop
usub16 r7, r10, r11 ; [d3 | d1] (C)
ldr r10, [r1, #12] ; src (D)
- ldr r11, [r2], #4 ; pred (D)
+ ldr r11, [r3, #12] ; pred (D)
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
str r8, [r0], #4 ; diff (C)
uxtb16 r8, r10 ; [s2 | s0] (D)
@@ -245,7 +251,8 @@ loop
usub16 r6, r8, r9 ; [d2 | d0] (D)
usub16 r7, r10, r11 ; [d3 | d1] (D)
- add r1, r1, r3 ; update src pointer
+ add r1, r1, r2 ; update src pointer
+ add r3, r3, r12 ; update pred pointer
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
@@ -257,7 +264,7 @@ loop
bne loop
ldmfd sp!, {r4-r11}
- mov pc, lr
+ bx lr
ENDP
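
Both subtract routines change their interface here: the predictor no longer sits at a fixed offset in a packed buffer, so source and predictor now carry independent pointers and strides. The computation itself is untouched; as reference C under the new argument convention (function name mine):

    static void subtract_block_sketch(short *diff, int diff_stride,
                                      const unsigned char *src, int src_stride,
                                      const unsigned char *pred, int pred_stride,
                                      int rows, int cols)
    {
        int r, c;
        for (r = 0; r < rows; r++) {
            for (c = 0; c < cols; c++)
                diff[c] = (short)(src[c] - pred[c]);  /* residual, range -255..255 */
            diff += diff_stride;
            src  += src_stride;
            pred += pred_stride;
        }
    }
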
diff --git a/vp8/encoder/arm/boolhuff_arm.c b/vp8/encoder/arm/boolhuff_arm.c
index 9089663ca..17a941bfc 100644
--- a/vp8/encoder/arm/boolhuff_arm.c
+++ b/vp8/encoder/arm/boolhuff_arm.c
@@ -10,7 +10,7 @@
#include "vp8/encoder/boolhuff.h"
-#include "vp8/common/blockd.h"
+#include "vpx/internal/vpx_codec_internal.h"
const unsigned int vp8_prob_cost[256] =
{
@@ -32,3 +32,10 @@ const unsigned int vp8_prob_cost[256] =
22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
};
+int vp8_validate_buffer_arm(const unsigned char *start,
+ size_t len,
+ const unsigned char *end,
+ struct vpx_internal_error_info *error)
+{
+ return validate_buffer(start, len, end, error);
+}
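
This wrapper exists so the assembly VALIDATE_POS macros have an out-of-line symbol to branch to, since validate_buffer itself is a static/inline C helper. Its contract, sketched below, is a plausible reconstruction rather than the verbatim source: succeed iff the pending [start, start+len) write ends before buffer_end, and record an error on the writer otherwise.

    #include <stddef.h>

    struct error_info { int error_code; };  /* stand-in for vpx_internal_error_info */

    static int validate_buffer_sketch(const unsigned char *start, size_t len,
                                      const unsigned char *end,
                                      struct error_info *error)
    {
        if (start + len <= end)
            return 1;                       /* the pending write fits */

        error->error_code = 1;              /* flag the frame as corrupt */
        return 0;
    }
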
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
index 68c295062..91a328c29 100644
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -61,19 +61,24 @@
;==========================================
-;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride
+; unsigned char *pred, int pred_stride)
|vp8_subtract_mby_neon| PROC
+ push {r4-r7}
mov r12, #4
+ ldr r4, [sp, #16] ; pred_stride
+ mov r6, #32 ; "diff" stride x2
+ add r5, r0, #16 ; second diff pointer
subtract_mby_loop
- vld1.8 {q0}, [r1], r3 ;load src
- vld1.8 {q1}, [r2]! ;load pred
- vld1.8 {q2}, [r1], r3
- vld1.8 {q3}, [r2]!
- vld1.8 {q4}, [r1], r3
- vld1.8 {q5}, [r2]!
- vld1.8 {q6}, [r1], r3
- vld1.8 {q7}, [r2]!
+ vld1.8 {q0}, [r1], r2 ;load src
+ vld1.8 {q1}, [r3], r4 ;load pred
+ vld1.8 {q2}, [r1], r2
+ vld1.8 {q3}, [r3], r4
+ vld1.8 {q4}, [r1], r2
+ vld1.8 {q5}, [r3], r4
+ vld1.8 {q6}, [r1], r2
+ vld1.8 {q7}, [r3], r4
vsubl.u8 q8, d0, d2
vsubl.u8 q9, d1, d3
@@ -84,46 +89,53 @@ subtract_mby_loop
vsubl.u8 q14, d12, d14
vsubl.u8 q15, d13, d15
- vst1.16 {q8}, [r0]! ;store diff
- vst1.16 {q9}, [r0]!
- vst1.16 {q10}, [r0]!
- vst1.16 {q11}, [r0]!
- vst1.16 {q12}, [r0]!
- vst1.16 {q13}, [r0]!
- vst1.16 {q14}, [r0]!
- vst1.16 {q15}, [r0]!
+ vst1.16 {q8}, [r0], r6 ;store diff
+ vst1.16 {q9}, [r5], r6
+ vst1.16 {q10}, [r0], r6
+ vst1.16 {q11}, [r5], r6
+ vst1.16 {q12}, [r0], r6
+ vst1.16 {q13}, [r5], r6
+ vst1.16 {q14}, [r0], r6
+ vst1.16 {q15}, [r5], r6
subs r12, r12, #1
bne subtract_mby_loop
+ pop {r4-r7}
bx lr
ENDP
;=================================
-;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc,
+; int src_stride, unsigned char *upred,
+; unsigned char *vpred, int pred_stride)
+
|vp8_subtract_mbuv_neon| PROC
- ldr r12, [sp]
+ push {r4-r7}
+ ldr r4, [sp, #16] ; upred
+ ldr r5, [sp, #20] ; vpred
+ ldr r6, [sp, #24] ; pred_stride
+ add r0, r0, #512 ; short *udiff = diff + 256;
+ mov r12, #32 ; "diff" stride x2
+ add r7, r0, #16 ; second diff pointer
;u
- add r0, r0, #512 ; short *udiff = diff + 256;
- add r3, r3, #256 ; unsigned char *upred = pred + 256;
-
- vld1.8 {d0}, [r1], r12 ;load src
- vld1.8 {d1}, [r3]! ;load pred
- vld1.8 {d2}, [r1], r12
- vld1.8 {d3}, [r3]!
- vld1.8 {d4}, [r1], r12
- vld1.8 {d5}, [r3]!
- vld1.8 {d6}, [r1], r12
- vld1.8 {d7}, [r3]!
- vld1.8 {d8}, [r1], r12
- vld1.8 {d9}, [r3]!
- vld1.8 {d10}, [r1], r12
- vld1.8 {d11}, [r3]!
- vld1.8 {d12}, [r1], r12
- vld1.8 {d13}, [r3]!
- vld1.8 {d14}, [r1], r12
- vld1.8 {d15}, [r3]!
+ vld1.8 {d0}, [r1], r3 ;load usrc
+ vld1.8 {d1}, [r4], r6 ;load upred
+ vld1.8 {d2}, [r1], r3
+ vld1.8 {d3}, [r4], r6
+ vld1.8 {d4}, [r1], r3
+ vld1.8 {d5}, [r4], r6
+ vld1.8 {d6}, [r1], r3
+ vld1.8 {d7}, [r4], r6
+ vld1.8 {d8}, [r1], r3
+ vld1.8 {d9}, [r4], r6
+ vld1.8 {d10}, [r1], r3
+ vld1.8 {d11}, [r4], r6
+ vld1.8 {d12}, [r1], r3
+ vld1.8 {d13}, [r4], r6
+ vld1.8 {d14}, [r1], r3
+ vld1.8 {d15}, [r4], r6
vsubl.u8 q8, d0, d1
vsubl.u8 q9, d2, d3
@@ -134,32 +146,32 @@ subtract_mby_loop
vsubl.u8 q14, d12, d13
vsubl.u8 q15, d14, d15
- vst1.16 {q8}, [r0]! ;store diff
- vst1.16 {q9}, [r0]!
- vst1.16 {q10}, [r0]!
- vst1.16 {q11}, [r0]!
- vst1.16 {q12}, [r0]!
- vst1.16 {q13}, [r0]!
- vst1.16 {q14}, [r0]!
- vst1.16 {q15}, [r0]!
+ vst1.16 {q8}, [r0], r12 ;store diff
+ vst1.16 {q9}, [r7], r12
+ vst1.16 {q10}, [r0], r12
+ vst1.16 {q11}, [r7], r12
+ vst1.16 {q12}, [r0], r12
+ vst1.16 {q13}, [r7], r12
+ vst1.16 {q14}, [r0], r12
+ vst1.16 {q15}, [r7], r12
;v
- vld1.8 {d0}, [r2], r12 ;load src
- vld1.8 {d1}, [r3]! ;load pred
- vld1.8 {d2}, [r2], r12
- vld1.8 {d3}, [r3]!
- vld1.8 {d4}, [r2], r12
- vld1.8 {d5}, [r3]!
- vld1.8 {d6}, [r2], r12
- vld1.8 {d7}, [r3]!
- vld1.8 {d8}, [r2], r12
- vld1.8 {d9}, [r3]!
- vld1.8 {d10}, [r2], r12
- vld1.8 {d11}, [r3]!
- vld1.8 {d12}, [r2], r12
- vld1.8 {d13}, [r3]!
- vld1.8 {d14}, [r2], r12
- vld1.8 {d15}, [r3]!
+ vld1.8 {d0}, [r2], r3 ;load vsrc
+ vld1.8 {d1}, [r5], r6 ;load vpred
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r5], r6
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d5}, [r5], r6
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d7}, [r5], r6
+ vld1.8 {d8}, [r2], r3
+ vld1.8 {d9}, [r5], r6
+ vld1.8 {d10}, [r2], r3
+ vld1.8 {d11}, [r5], r6
+ vld1.8 {d12}, [r2], r3
+ vld1.8 {d13}, [r5], r6
+ vld1.8 {d14}, [r2], r3
+ vld1.8 {d15}, [r5], r6
vsubl.u8 q8, d0, d1
vsubl.u8 q9, d2, d3
@@ -170,16 +182,18 @@ subtract_mby_loop
vsubl.u8 q14, d12, d13
vsubl.u8 q15, d14, d15
- vst1.16 {q8}, [r0]! ;store diff
- vst1.16 {q9}, [r0]!
- vst1.16 {q10}, [r0]!
- vst1.16 {q11}, [r0]!
- vst1.16 {q12}, [r0]!
- vst1.16 {q13}, [r0]!
- vst1.16 {q14}, [r0]!
- vst1.16 {q15}, [r0]!
+ vst1.16 {q8}, [r0], r12 ;store diff
+ vst1.16 {q9}, [r7], r12
+ vst1.16 {q10}, [r0], r12
+ vst1.16 {q11}, [r7], r12
+ vst1.16 {q12}, [r0], r12
+ vst1.16 {q13}, [r7], r12
+ vst1.16 {q14}, [r0], r12
+ vst1.16 {q15}, [r7], r12
+ pop {r4-r7}
bx lr
+
ENDP
END
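
The mby/mbuv subtract routines now take separate source and prediction strides because the prediction is read in place from the reconstructed frame buffer (see the encodemb.c and encodeintra.c hunks below) rather than from a packed 16-wide predictor block. The diff stores likewise switch from post-increment to strided writes through two pointers: r0 covers the left eight shorts of each 16-short row, r5 = r0 + 16 bytes covers the right eight, and both advance by r6 = 32 bytes, one full row. A minimal C model of that store pattern (store_half_row and the names are illustrative, not from the tree):

    short *left  = diff;        /* r0: columns 0-7 of each diff row */
    short *right = diff + 8;    /* r5: columns 8-15                 */
    int r;
    for (r = 0; r < 16; r++, left += 16, right += 16)
    {
        store_half_row(left,  r);   /* q8, q10, ... in the asm */
        store_half_row(right, r);   /* q9, q11, ...            */
    }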
diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c
index e77be9f73..7fc7473ac 100644
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -11,9 +11,9 @@
#include "vpx_config.h"
#include "vp8/encoder/variance.h"
#include "vp8/common/filter.h"
-#include "vp8/common/arm/bilinearfilter_arm.h"
#if HAVE_ARMV6
+#include "vp8/common/arm/bilinearfilter_arm.h"
unsigned int vp8_sub_pixel_variance8x8_armv6
(
diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c
index d05dab47c..2e9ca7232 100644
--- a/vp8/encoder/asm_enc_offsets.c
+++ b/vp8/encoder/asm_enc_offsets.c
@@ -50,6 +50,7 @@ DEFINE(vp8_writer_count, offsetof(vp8_writer, count));
DEFINE(vp8_writer_pos, offsetof(vp8_writer, pos));
DEFINE(vp8_writer_buffer, offsetof(vp8_writer, buffer));
DEFINE(vp8_writer_buffer_end, offsetof(vp8_writer, buffer_end));
+DEFINE(vp8_writer_error, offsetof(vp8_writer, error));
DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token));
DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra));
@@ -69,7 +70,8 @@ DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, b
DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist));
DEFINE(vp8_comp_common, offsetof(VP8_COMP, common));
-DEFINE(vp8_comp_bc2, offsetof(VP8_COMP, bc2));
+DEFINE(vp8_comp_bc , offsetof(VP8_COMP, bc));
+DEFINE(vp8_writer_sz , sizeof(vp8_writer));
DEFINE(tokenlist_start, offsetof(TOKENLIST, start));
DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop));
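
asm_enc_offsets.c exports layout constants to the assembler, so swapping the two discrete writers (bc/bc2) for the cpi->bc array means the ARM packers need sizeof(vp8_writer) to step between partitions, plus vp8_writer_error so they can report overflow the same way validate_buffer does on the C side. A C model of the address arithmetic these constants enable (a sketch; i is a partition index):

    #include <stddef.h>
    /* &cpi->bc[i + 1], as the asm would compute it from the constants */
    vp8_writer *w = (vp8_writer *)((char *)cpi
                    + offsetof(VP8_COMP, bc)         /* vp8_comp_bc   */
                    + (i + 1) * sizeof(vp8_writer)); /* vp8_writer_sz */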
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 748b60778..669bfad9a 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -109,7 +109,7 @@ static void update_mbintra_mode_probs(VP8_COMP *cpi)
{
VP8_COMMON *const x = & cpi->common;
- vp8_writer *const w = & cpi->bc;
+ vp8_writer *const w = cpi->bc;
{
vp8_prob Pnew [VP8_YMODES-1];
@@ -221,6 +221,11 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
w->buffer[x] += 1;
}
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
w->buffer[w->pos++] = (lowvalue >> (24 - offset));
lowvalue <<= offset;
shift = count;
@@ -281,6 +286,11 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
w->buffer[x] += 1;
}
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
w->buffer[w->pos++] = (lowvalue >> (24 - offset));
lowvalue <<= offset;
shift = count;
@@ -329,6 +339,12 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
if (!++count)
{
count = -8;
+
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
w->buffer[w->pos++] = (lowvalue >> 24);
lowvalue &= 0xffffff;
}
@@ -358,20 +374,21 @@ static void write_partition_size(unsigned char *cx_data, int size)
}
-static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, unsigned char * cx_data_end, int num_part, int *size)
+static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
+ unsigned char * cx_data_end,
+ int num_part)
{
int i;
unsigned char *ptr = cx_data;
unsigned char *ptr_end = cx_data_end;
unsigned int shift;
- vp8_writer *w = &cpi->bc2;
- *size = 3 * (num_part - 1);
- cpi->partition_sz[0] += *size;
- ptr = cx_data + (*size);
+ vp8_writer *w;
+ ptr = cx_data;
for (i = 0; i < num_part; i++)
{
+ w = cpi->bc + i + 1;
vp8_start_encode(w, ptr, ptr_end);
{
unsigned int split;
@@ -581,17 +598,7 @@ static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
}
vp8_stop_encode(w);
- *size += w->pos;
-
- /* The first partition size is set earlier */
- cpi->partition_sz[i + 1] = w->pos;
-
- if (i < (num_part - 1))
- {
- write_partition_size(cx_data, w->pos);
- cx_data += 3;
- ptr += w->pos;
- }
+ ptr += w->pos;
}
}
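
The partition packer no longer tracks sizes or writes the size table itself: each token partition now encodes through its own persistent writer, and the caller reads back each writer's final position afterwards (see the vp8_pack_bitstream hunk below). A one-line sketch of the writer selection, mirroring the loop above:

    /* partition i -> dedicated boolean coder; bc[0] holds the header partition */
    vp8_writer *w = cpi->bc + i + 1;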
@@ -664,6 +671,11 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
w->buffer[x] += 1;
}
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
w->buffer[w->pos++] = (lowvalue >> (24 - offset));
lowvalue <<= offset;
shift = count;
@@ -724,6 +736,11 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
w->buffer[x] += 1;
}
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
w->buffer[w->pos++] = (lowvalue >> (24 - offset));
lowvalue <<= offset;
shift = count;
@@ -770,6 +787,12 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
if (!++count)
{
count = -8;
+
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
w->buffer[w->pos++] = (lowvalue >> 24);
lowvalue &= 0xffffff;
}
@@ -855,44 +878,46 @@ static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACRO
}
}
}
+void vp8_convert_rfct_to_prob(VP8_COMP *const cpi)
+{
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+ // Calculate the probabilities used to code the ref frame based on usage
+ if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
+ cpi->prob_intra_coded = 1;
+
+ cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+ if (!cpi->prob_last_coded)
+ cpi->prob_last_coded = 1;
+
+ cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+ if (!cpi->prob_gf_coded)
+ cpi->prob_gf_coded = 1;
+
+}
static void pack_inter_mode_mvs(VP8_COMP *const cpi)
{
VP8_COMMON *const pc = & cpi->common;
- vp8_writer *const w = & cpi->bc;
+ vp8_writer *const w = cpi->bc;
const MV_CONTEXT *mvc = pc->fc.mvc;
- const int *const rfct = cpi->count_mb_ref_frame_usage;
- const int rf_intra = rfct[INTRA_FRAME];
- const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
MODE_INFO *m = pc->mi, *ms;
const int mis = pc->mode_info_stride;
int mb_row = -1;
- int prob_last_coded;
- int prob_gf_coded;
int prob_skip_false = 0;
ms = pc->mi - 1;
cpi->mb.partition_info = cpi->mb.pi;
- // Calculate the probabilities to be used to code the reference frame based on actual useage this frame
- if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
- cpi->prob_intra_coded = 1;
-
- prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
-
- if (!prob_last_coded)
- prob_last_coded = 1;
-
- prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
- ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
-
- if (!prob_gf_coded)
- prob_gf_coded = 1;
-
+ vp8_convert_rfct_to_prob(cpi);
#ifdef ENTROPY_STATS
active_section = 1;
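
vp8_convert_rfct_to_prob turns the per-frame reference usage counts into the 8-bit probabilities written in the header, with each result clamped to at least 1 so no branch becomes impossible for the decoder. A worked example with illustrative counts (not taken from the source):

    /* illustrative counts, summing to 100 macroblocks */
    rfct[INTRA_FRAME]  = 20;  rfct[LAST_FRAME]   = 60;
    rfct[GOLDEN_FRAME] = 15;  rfct[ALTREF_FRAME] =  5;
    /* prob_intra_coded = 20 * 255 / 100       = 51
       prob_last_coded  = 60 * 255 / 80        = 191
       prob_gf_coded    = 15 * 255 / (15 + 5)  = 191  (all clamped to >= 1) */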
@@ -913,8 +938,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
}
vp8_write_literal(w, cpi->prob_intra_coded, 8);
- vp8_write_literal(w, prob_last_coded, 8);
- vp8_write_literal(w, prob_gf_coded, 8);
+ vp8_write_literal(w, cpi->prob_last_coded, 8);
+ vp8_write_literal(w, cpi->prob_gf_coded, 8);
update_mbintra_mode_probs(cpi);
@@ -976,11 +1001,11 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
vp8_write(w, 1, cpi->prob_intra_coded);
if (rf == LAST_FRAME)
- vp8_write(w, 0, prob_last_coded);
+ vp8_write(w, 0, cpi->prob_last_coded);
else
{
- vp8_write(w, 1, prob_last_coded);
- vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, prob_gf_coded);
+ vp8_write(w, 1, cpi->prob_last_coded);
+ vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, cpi->prob_gf_coded);
}
{
@@ -1075,7 +1100,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
static void write_kfmodes(VP8_COMP *cpi)
{
- vp8_writer *const bc = & cpi->bc;
+ vp8_writer *const bc = cpi->bc;
const VP8_COMMON *const c = & cpi->common;
/* const */
MODE_INFO *m = c->mi;
@@ -1181,7 +1206,7 @@ static void sum_probs_over_prev_coef_context(
{
for (j=0; j < PREV_COEF_CONTEXTS; ++j)
{
- const int tmp = out[i];
+ const unsigned int tmp = out[i];
out[i] += probs[j][i];
/* check for wrap */
if (out[i] < tmp)
@@ -1332,6 +1357,24 @@ static int default_coef_context_savings(VP8_COMP *cpi)
return savings;
}
+void vp8_calc_ref_frame_costs(int *ref_frame_cost,
+ int prob_intra,
+ int prob_last,
+ int prob_garf
+ )
+{
+ ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(prob_intra);
+ ref_frame_cost[LAST_FRAME] = vp8_cost_one(prob_intra)
+ + vp8_cost_zero(prob_last);
+ ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(prob_intra)
+ + vp8_cost_one(prob_last)
+ + vp8_cost_zero(prob_garf);
+ ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(prob_intra)
+ + vp8_cost_one(prob_last)
+ + vp8_cost_one(prob_garf);
+
+}
+
int vp8_estimate_entropy_savings(VP8_COMP *cpi)
{
int savings = 0;
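
The new helper captures VP8's reference-frame signalling tree in one place: intra/inter is one binary decision at prob_intra, inter splits into LAST versus the rest at prob_last, and the remainder splits into GOLDEN/ALTREF at prob_garf, so each entry is a sum of vp8_cost_zero/vp8_cost_one terms (which grow roughly as -log2 of the branch probability). A usage sketch, reusing the illustrative probabilities from the example above:

    int cost[MAX_REF_FRAMES];
    vp8_calc_ref_frame_costs(cost, 51, 191, 191);
    /* cost[ALTREF_FRAME] = one(51) + one(191) + one(191),
       the deepest leaf of the tree */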
@@ -1339,7 +1382,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
const int *const rfct = cpi->count_mb_ref_frame_usage;
const int rf_intra = rfct[INTRA_FRAME];
const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
- int new_intra, new_last, gf_last, oldtotal, newtotal;
+ int new_intra, new_last, new_garf, oldtotal, newtotal;
int ref_frame_cost[MAX_REF_FRAMES];
vp8_clear_system_state(); //__asm emms;
@@ -1351,19 +1394,11 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
- gf_last = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ new_garf = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
- // new costs
- ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(new_intra);
- ref_frame_cost[LAST_FRAME] = vp8_cost_one(new_intra)
- + vp8_cost_zero(new_last);
- ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(new_intra)
- + vp8_cost_one(new_last)
- + vp8_cost_zero(gf_last);
- ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(new_intra)
- + vp8_cost_one(new_last)
- + vp8_cost_one(gf_last);
+
+ vp8_calc_ref_frame_costs(ref_frame_cost,new_intra,new_last,new_garf);
newtotal =
rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
@@ -1373,15 +1408,8 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
// old costs
- ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded);
- ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_zero(cpi->prob_last_coded);
- ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(cpi->prob_last_coded)
- + vp8_cost_zero(cpi->prob_gf_coded);
- ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(cpi->prob_last_coded)
- + vp8_cost_one(cpi->prob_gf_coded);
+ vp8_calc_ref_frame_costs(ref_frame_cost,cpi->prob_intra_coded,
+ cpi->prob_last_coded,cpi->prob_gf_coded);
oldtotal =
rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
@@ -1405,7 +1433,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
static void update_coef_probs(VP8_COMP *cpi)
{
int i = 0;
- vp8_writer *const w = & cpi->bc;
+ vp8_writer *const w = cpi->bc;
int savings = 0;
vp8_clear_system_state(); //__asm emms;
@@ -1551,7 +1579,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
int i, j;
VP8_HEADER oh;
VP8_COMMON *const pc = & cpi->common;
- vp8_writer *const bc = & cpi->bc;
+ vp8_writer *const bc = cpi->bc;
MACROBLOCKD *const xd = & cpi->mb.e_mbd;
int extra_bytes_packed = 0;
@@ -1566,6 +1594,8 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
mb_feature_data_bits = vp8_mb_feature_data_bits;
+ bc[0].error = &pc->error;
+
validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error);
cx_data += 3;
@@ -1614,20 +1644,20 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
// Signal whether or not Segmentation is enabled
- vp8_write_bit(bc, (xd->segmentation_enabled) ? 1 : 0);
+ vp8_write_bit(bc, xd->segmentation_enabled);
// Indicate which features are enabled
if (xd->segmentation_enabled)
{
// Signal whether or not the segmentation map is being updated.
- vp8_write_bit(bc, (xd->update_mb_segmentation_map) ? 1 : 0);
- vp8_write_bit(bc, (xd->update_mb_segmentation_data) ? 1 : 0);
+ vp8_write_bit(bc, xd->update_mb_segmentation_map);
+ vp8_write_bit(bc, xd->update_mb_segmentation_data);
if (xd->update_mb_segmentation_data)
{
signed char Data;
- vp8_write_bit(bc, (xd->mb_segement_abs_delta) ? 1 : 0);
+ vp8_write_bit(bc, xd->mb_segement_abs_delta);
// For each segmentation feature (Quant and loop filter level)
for (i = 0; i < MB_LVL_MAX; i++)
@@ -1684,7 +1714,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
vp8_write_literal(bc, pc->sharpness_level, 3);
// Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
- vp8_write_bit(bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
+ vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled);
if (xd->mode_ref_lf_delta_enabled)
{
@@ -1844,7 +1874,9 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
vp8_stop_encode(bc);
- oh.first_partition_length_in_bytes = cpi->bc.pos;
+ cx_data += bc->pos;
+
+ oh.first_partition_length_in_bytes = cpi->bc->pos;
/* update frame tag */
{
@@ -1858,34 +1890,58 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
dest[2] = v >> 16;
}
- *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc.pos;
+ *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos;
+
cpi->partition_sz[0] = *size;
if (pc->multi_token_partition != ONE_PARTITION)
{
- int num_part;
- int asize;
- num_part = 1 << pc->multi_token_partition;
+ int num_part = 1 << pc->multi_token_partition;
+
+ /* partition size table at the end of first partition */
+ cpi->partition_sz[0] += 3 * (num_part - 1);
+ *size += 3 * (num_part - 1);
+
+ validate_buffer(cx_data, 3 * (num_part - 1), cx_data_end,
+ &pc->error);
+
+ for(i = 1; i < num_part + 1; i++)
+ {
+ cpi->bc[i].error = &pc->error;
+ }
- pack_tokens_into_partitions(cpi, cx_data + bc->pos, cx_data_end, num_part, &asize);
+ pack_tokens_into_partitions(cpi, cx_data + 3 * (num_part - 1),
+ cx_data_end, num_part);
- *size += asize;
+ for(i = 1; i < num_part; i++)
+ {
+ cpi->partition_sz[i] = cpi->bc[i].pos;
+ write_partition_size(cx_data, cpi->partition_sz[i]);
+ cx_data += 3;
+ *size += cpi->partition_sz[i]; /* add to total */
+ }
+
+ /* add last partition to total size */
+ cpi->partition_sz[i] = cpi->bc[i].pos;
+ *size += cpi->partition_sz[i];
}
else
{
- vp8_start_encode(&cpi->bc2, cx_data + bc->pos, cx_data_end);
+ bc[1].error = &pc->error;
+
+ vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end);
#if CONFIG_MULTITHREAD
if (cpi->b_multi_threaded)
- pack_mb_row_tokens(cpi, &cpi->bc2);
+ pack_mb_row_tokens(cpi, &cpi->bc[1]);
else
#endif
- pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
+ pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
- vp8_stop_encode(&cpi->bc2);
+ vp8_stop_encode(&cpi->bc[1]);
- *size += cpi->bc2.pos;
- cpi->partition_sz[1] = cpi->bc2.pos;
+ *size += cpi->bc[1].pos;
+ cpi->partition_sz[1] = cpi->bc[1].pos;
}
}
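
With the size table now owned by vp8_pack_bitstream, the layout is: first partition, then 3 * (num_part - 1) bytes of sizes, then the token partitions; the last partition needs no entry because its size is implied by the total. VP8 stores each entry as a 3-byte little-endian value, which write_partition_size presumably emits along these lines (a sketch, not the source):

    /* 3-byte little-endian partition size entry */
    cx_data[0] = (unsigned char)(size & 0xff);
    cx_data[1] = (unsigned char)((size >> 8) & 0xff);
    cx_data[2] = (unsigned char)((size >> 16) & 0xff);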
diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h
index 8a875a5bd..9007cede0 100644
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h
@@ -17,23 +17,27 @@ void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
vp8_token *,
vp8_extra_bit_struct *,
const vp8_tree_index *);
-void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *, unsigned char *, int , int *,
- vp8_token *,
- vp8_extra_bit_struct *,
- const vp8_tree_index *);
+void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
+ unsigned char * cx_data,
+ const unsigned char *cx_data_end,
+ int num_parts,
+ vp8_token *,
+ vp8_extra_bit_struct *,
+ const vp8_tree_index *);
void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
vp8_token *,
vp8_extra_bit_struct *,
const vp8_tree_index *);
# define pack_tokens(a,b,c) \
vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-# define pack_tokens_into_partitions(a,b,unused,c,d) \
+# define pack_tokens_into_partitions(a,b,c,d) \
vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
# define pack_mb_row_tokens(a,b) \
vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
#else
-# define pack_tokens(a,b,c) pack_tokens_c(a,b,c)
-# define pack_tokens_into_partitions(a,b,c,d,e) pack_tokens_into_partitions_c(a,b,c,d,e)
+# define pack_tokens(a,b,c) pack_tokens_c(a,b,c)
+# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d)
# define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b)
#endif
+
#endif
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 5e5a60db7..0a74ca46d 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -45,10 +45,6 @@ typedef struct
unsigned char **base_src;
int src;
int src_stride;
-
-// MV enc_mv;
- int force_empty;
-
} BLOCK;
typedef struct
@@ -107,7 +103,6 @@ typedef struct
int mv_row_min;
int mv_row_max;
- int vector_range; // Used to monitor limiting range of recent vectors to guide search.
int skip;
int encode_breakout;
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 8ec9e27c9..6a9ba291d 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -39,7 +39,12 @@
#define IF_RTCD(x) NULL
#endif
extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
-
+extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
+ int prob_intra,
+ int prob_last,
+ int prob_garf
+ );
+extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi);
extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
extern void vp8_auto_select_speed(VP8_COMP *cpi);
extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
@@ -49,8 +54,8 @@ extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
int count);
void vp8_build_block_offsets(MACROBLOCK *x);
void vp8_setup_block_ptrs(MACROBLOCK *x);
-int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
-int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset, int mb_row, int mb_col);
+int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col);
static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );
#ifdef MODE_STATS
@@ -475,14 +480,14 @@ void encode_mb_row(VP8_COMP *cpi,
if (cm->frame_type == KEY_FRAME)
{
- *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
+ *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp, mb_row, mb_col);
#ifdef MODE_STATS
y_modes[xd->mbmi.mode] ++;
#endif
}
else
{
- *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
+ *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
#ifdef MODE_STATS
inter_y_modes[xd->mbmi.mode] ++;
@@ -590,8 +595,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)
// Activity map pointer
x->mb_activity_ptr = cpi->mb_activity_map;
- x->vector_range = 32;
-
x->act_zbin_adj = 0;
x->partition_info = x->pi;
@@ -636,55 +639,23 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)
vpx_memset(cm->above_context, 0,
sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
- xd->ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded);
-
// Special case treatment when GF and ARF are not sensible options for reference
if (cpi->ref_frame_flags == VP8_LAST_FLAG)
- {
- xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_zero(255);
- xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(255)
- + vp8_cost_zero(128);
- xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(255)
- + vp8_cost_one(128);
- }
+ vp8_calc_ref_frame_costs(xd->ref_frame_cost,
+ cpi->prob_intra_coded,255,128);
else if ((cpi->oxcf.number_of_layers > 1) &&
(cpi->ref_frame_flags == VP8_GOLD_FLAG))
- {
- xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_zero(1);
- xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(1)
- + vp8_cost_zero(255);
- xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(1)
- + vp8_cost_one(255);
- }
+ vp8_calc_ref_frame_costs(xd->ref_frame_cost,
+ cpi->prob_intra_coded,1,255);
else if ((cpi->oxcf.number_of_layers > 1) &&
(cpi->ref_frame_flags == VP8_ALT_FLAG))
- {
- xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_zero(1);
- xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(1)
- + vp8_cost_zero(1);
- xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(1)
- + vp8_cost_one(1);
- }
+ vp8_calc_ref_frame_costs(xd->ref_frame_cost,
+ cpi->prob_intra_coded,1,1);
else
- {
- xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_zero(cpi->prob_last_coded);
- xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(cpi->prob_last_coded)
- + vp8_cost_zero(cpi->prob_gf_coded);
- xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
- + vp8_cost_one(cpi->prob_last_coded)
- + vp8_cost_one(cpi->prob_gf_coded);
- }
+ vp8_calc_ref_frame_costs(xd->ref_frame_cost,
+ cpi->prob_intra_coded,
+ cpi->prob_last_coded,
+ cpi->prob_gf_coded);
xd->fullpixel_mask = 0xffffffff;
if(cm->full_pixel)
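
When only one reference is usable, the probabilities are pinned rather than derived from counts: 255 makes the expected branch almost free, 128 prices both branches at exactly one bit, and 1 makes a branch nearly certain to go the other way. Rough bit costs under VP8's p/256 convention (plain arithmetic, for orientation only):

    /* p = 255: zero branch ~ -log2(255/256) ~ 0.006 bits, one branch ~ 8 bits
       p = 128: either branch = exactly 1 bit
       p = 1:   zero branch ~ 8 bits, one branch ~ 0.006 bits */

So with only VP8_LAST_FLAG set, (255, 128) makes LAST nearly free while GOLDEN and ALTREF carry an ~8-bit penalty before any further signalling.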
@@ -966,31 +937,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
(!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)))
{
- const int *const rfct = cpi->count_mb_ref_frame_usage;
- const int rf_intra = rfct[INTRA_FRAME];
- const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
-
- if ((rf_intra + rf_inter) > 0)
- {
- cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
-
- if (cpi->prob_intra_coded < 1)
- cpi->prob_intra_coded = 1;
-
- if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active)
- {
- cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
-
- if (cpi->prob_last_coded < 1)
- cpi->prob_last_coded = 1;
-
- cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
- ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
-
- if (cpi->prob_gf_coded < 1)
- cpi->prob_gf_coded = 1;
- }
- }
+ vp8_convert_rfct_to_prob(cpi);
}
#if 0
@@ -1142,8 +1089,10 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
#endif
}
-int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+ int mb_row, int mb_col)
{
+ MACROBLOCKD *xd = &x->e_mbd;
int rate;
if (cpi->sf.RD && cpi->compressor_speed != 2)
@@ -1163,14 +1112,17 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+
sum_intra_stats(cpi, x);
vp8_tokenize_mb(cpi, &x->e_mbd, t);
- if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)
- vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
-
- vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
+ if (xd->mode_info_context->mbmi.mode != B_PRED)
+ vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
+ DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
+ (xd->qcoeff+16*16, xd->block[16].dequant,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
return rate;
}
#ifdef SPEEDSTATS
@@ -1182,7 +1134,8 @@ extern void vp8_fix_contexts(MACROBLOCKD *x);
int vp8cx_encode_inter_macroblock
(
VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
- int recon_yoffset, int recon_uvoffset
+ int recon_yoffset, int recon_uvoffset,
+ int mb_row, int mb_col
)
{
MACROBLOCKD *const xd = &x->e_mbd;
@@ -1230,8 +1183,10 @@ int vp8cx_encode_inter_macroblock
}
else
+ {
vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
- &distortion, &intra_error);
+ &distortion, &intra_error, mb_row, mb_col);
+ }
cpi->prediction_error += distortion;
cpi->intra_error += intra_error;
@@ -1345,12 +1300,14 @@ int vp8cx_encode_inter_macroblock
if (!x->skip)
{
vp8_tokenize_mb(cpi, xd, t);
- if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)
- {
- vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct),
- &x->e_mbd);
- }
- vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
+
+ if (xd->mode_info_context->mbmi.mode != B_PRED)
+ vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
+
+ DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
+ (xd->qcoeff+16*16, xd->block[16].dequant,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
}
else
{
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index d89d74e5e..16393a1ff 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -18,7 +18,6 @@
#include "vp8/common/invtrans.h"
#include "vp8/common/recon.h"
#include "dct.h"
-#include "vp8/common/g_common.h"
#include "encodeintra.h"
@@ -45,7 +44,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
vp8_encode_intra16x16mby(rtcd, x);
- vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+ vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(&cpi->common.rtcd));
}
else
{
@@ -77,8 +76,17 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
x->quantize_b(be, b);
- vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16);
-
+ if (*b->eob > 1)
+ {
+ IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct16)(b->dqcoeff,
+ b->predictor, 16, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct1_scalar_add)
+ (b->dqcoeff[0], b->predictor, 16, *(b->base_dst) + b->dst,
+ b->dst_stride);
+ }
}
void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
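
vp8_encode_intra4x4block now branches on the end-of-block marker: when *b->eob <= 1, at most the DC coefficient is nonzero and the whole 4x4 inverse DCT degenerates to adding one constant to the 16 predictor pixels, which is what the scalar-add path exploits. A minimal C model of that fast path (cf. the dc-only idct elsewhere in this tree; names are illustrative):

    static void dc_only_add_sketch(short dc, const unsigned char *pred,
                                   int pred_stride, unsigned char *dst,
                                   int dst_stride)
    {
        int a1 = (dc + 4) >> 3;   /* the full IDCT collapses to this constant */
        int r, c;
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int v = pred[c] + a1;
                dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred += pred_stride;
            dst  += dst_stride;
        }
    }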
@@ -96,11 +104,12 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
BLOCK *b = &x->block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
- RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
+ RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby_s)(&x->e_mbd);
- ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
- x->e_mbd.predictor, b->src_stride);
+ ENCODEMB_INVOKE(&rtcd->encodemb, submby) (x->src_diff, *(b->base_src),
+ b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);
vp8_transform_intra_mby(x);
@@ -108,14 +117,17 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
if (x->optimize)
vp8_optimize_mby(x, rtcd);
-
}
void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
- RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);
+ MACROBLOCKD *xd = &x->e_mbd;
- ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+ RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv_s)(&x->e_mbd);
+
+ ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
+ x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,
+ xd->dst.v_buffer, xd->dst.uv_stride);
vp8_transform_mbuv(x);
@@ -123,5 +135,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
if (x->optimize)
vp8_optimize_mbuv(x, rtcd);
-
}
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 80c32df1b..c9f755333 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -48,12 +48,12 @@ void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
}
}
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
+ int src_stride, unsigned char *upred,
+ unsigned char *vpred, int pred_stride)
{
short *udiff = diff + 256;
short *vdiff = diff + 320;
- unsigned char *upred = pred + 256;
- unsigned char *vpred = pred + 320;
int r, c;
@@ -65,8 +65,8 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
}
udiff += 8;
- upred += 8;
- usrc += stride;
+ upred += pred_stride;
+ usrc += src_stride;
}
for (r = 0; r < 8; r++)
@@ -77,12 +77,13 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
}
vdiff += 8;
- vpred += 8;
- vsrc += stride;
+ vpred += pred_stride;
+ vsrc += src_stride;
}
}
-void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
+void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
+ unsigned char *pred, int pred_stride)
{
int r, c;
@@ -94,8 +95,8 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, in
}
diff += 16;
- pred += 16;
- src += stride;
+ pred += pred_stride;
+ src += src_stride;
}
}
@@ -103,8 +104,11 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
BLOCK *b = &x->block[0];
- ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
- ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+ ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
+ b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
+ ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
+ x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
+ x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
}
static void build_dcblock(MACROBLOCK *x)
@@ -621,7 +625,7 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
- vp8_build_inter_predictors_mb_e(&x->e_mbd);
+ vp8_build_inter_predictors_mb(&x->e_mbd);
vp8_subtract_mb(rtcd, x);
@@ -631,7 +635,6 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
if (x->optimize)
optimize_mb(x, rtcd);
-
}
/* this function is used by first pass only */
@@ -639,14 +642,15 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
BLOCK *b = &x->block[0];
- vp8_build_inter16x16_predictors_mby(&x->e_mbd);
+ vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
+ x->e_mbd.dst.y_stride);
- ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
+ ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
+ b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
transform_mby(x);
vp8_quantize_mby(x);
- vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
+ vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(rtcd->common));
}
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 8fa457aa8..0fa87cf68 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -12,6 +12,7 @@
#ifndef __INC_ENCODEMB_H
#define __INC_ENCODEMB_H
+
#include "vpx_config.h"
#include "block.h"
@@ -28,11 +29,13 @@
void (sym)(BLOCK *be,BLOCKD *bd, int pitch)
#define prototype_submby(sym) \
- void (sym)(short *diff, unsigned char *src, unsigned char *pred, int stride)
+ void (sym)(short *diff, unsigned char *src, int src_stride, \
+ unsigned char *pred, int pred_stride)
#define prototype_submbuv(sym) \
void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\
- unsigned char *pred, int stride)
+ int src_stride, unsigned char *upred, unsigned char *vpred,\
+ int pred_stride)
#if ARCH_X86 || ARCH_X86_64
#include "x86/encodemb_x86.h"
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index a4849c654..c122d038d 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -395,7 +395,7 @@ static void write_component_probs(
void vp8_write_mvprobs(VP8_COMP *cpi)
{
- vp8_writer *const w = & cpi->bc;
+ vp8_writer *const w = cpi->bc;
MV_CONTEXT *mvc = cpi->common.fc.mvc;
int flags[2] = {0, 0};
#ifdef ENTROPY_STATS
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 557080dba..69655989d 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -38,7 +38,7 @@ static THREAD_FUNCTION loopfilter_thread(void *p_data)
if (sem_wait(&cpi->h_event_start_lpf) == 0)
{
- if (cpi->b_multi_threaded == FALSE) // we're shutting down
+ if (cpi->b_multi_threaded == 0) // we're shutting down
break;
loopfilter_frame(cpi, cm);
@@ -78,7 +78,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
int *segment_counts = mbri->segment_counts;
int *totalrate = &mbri->totalrate;
- if (cpi->b_multi_threaded == FALSE) // we're shutting down
+ if (cpi->b_multi_threaded == 0) // we're shutting down
break;
for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
@@ -302,7 +302,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->mv_col_max = x->mv_col_max;
z->mv_row_min = x->mv_row_min;
z->mv_row_max = x->mv_row_max;
- z->vector_range = x->vector_range ;
*/
z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4;
@@ -343,12 +342,13 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->block[i].zbin = x->block[i].zbin;
z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost;
z->block[i].round = x->block[i].round;
+ z->q_index = x->q_index;
+ z->act_zbin_adj = x->act_zbin_adj;
+ z->last_act_zbin_adj = x->last_act_zbin_adj;
/*
z->block[i].src = x->block[i].src;
*/
z->block[i].src_stride = x->block[i].src_stride;
- z->block[i].force_empty = x->block[i].force_empty;
-
}
{
@@ -418,8 +418,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
#endif
mb->gf_active_ptr = x->gf_active_ptr;
- mb->vector_range = 32;
-
vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
mbr_ei[i].totalrate = 0;
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 23e30508a..346c06f32 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -267,8 +267,8 @@ static void avg_stats(FIRSTPASS_STATS *section)
// Calculate a modified Error used in distributing bits between easier and harder frames
static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
- double av_err = ( cpi->twopass.total_stats->ssim_weighted_pred_err /
- cpi->twopass.total_stats->count );
+ double av_err = ( cpi->twopass.total_stats.ssim_weighted_pred_err /
+ cpi->twopass.total_stats.count );
double this_err = this_frame->ssim_weighted_pred_err;
double modified_err;
@@ -373,7 +373,7 @@ static int frame_max_bits(VP8_COMP *cpi)
else
{
// For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
- max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+ max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
}
// Trap case where we are out of bits
@@ -385,12 +385,12 @@ static int frame_max_bits(VP8_COMP *cpi)
void vp8_init_first_pass(VP8_COMP *cpi)
{
- zero_stats(cpi->twopass.total_stats);
+ zero_stats(&cpi->twopass.total_stats);
}
void vp8_end_first_pass(VP8_COMP *cpi)
{
- output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+ output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
}
static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
@@ -804,17 +804,17 @@ void vp8_first_pass(VP8_COMP *cpi)
- cpi->source->ts_start;
// don't want to do output stats with a stack variable!
- memcpy(cpi->twopass.this_frame_stats,
+ memcpy(&cpi->twopass.this_frame_stats,
&fps,
sizeof(FIRSTPASS_STATS));
- output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
- accumulate_stats(cpi->twopass.total_stats, &fps);
+ output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+ accumulate_stats(&cpi->twopass.total_stats, &fps);
}
// Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
if ((cm->current_video_frame > 0) &&
- (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
- ((cpi->twopass.this_frame_stats->intra_error / cpi->twopass.this_frame_stats->coded_error) > 2.0))
+ (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+ ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0))
{
vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
}
@@ -861,7 +861,7 @@ double bitcost( double prob )
{
return -(log( prob ) / log( 2.0 ));
}
-static long long estimate_modemvcost(VP8_COMP *cpi,
+static int64_t estimate_modemvcost(VP8_COMP *cpi,
FIRSTPASS_STATS * fpstats)
{
int mv_cost;
@@ -1019,7 +1019,7 @@ static int estimate_max_q(VP8_COMP *cpi,
// average q observed in clip for non kf/gf/arf frames
// Give average a chance to settle though.
if ( (cpi->ni_frames >
- ((unsigned int)cpi->twopass.total_stats->count >> 8)) &&
+ ((unsigned int)cpi->twopass.total_stats.count >> 8)) &&
(cpi->ni_frames > 150) )
{
cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
@@ -1075,8 +1075,8 @@ static int estimate_cq( VP8_COMP *cpi,
}
// II ratio correction factor for clip as a whole
- clip_iiratio = cpi->twopass.total_stats->intra_error /
- DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+ clip_iiratio = cpi->twopass.total_stats.intra_error /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
if (clip_iifactor < 0.80)
clip_iifactor = 0.80;
@@ -1260,25 +1260,25 @@ void vp8_init_second_pass(VP8_COMP *cpi)
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
- zero_stats(cpi->twopass.total_stats);
- zero_stats(cpi->twopass.total_left_stats);
+ zero_stats(&cpi->twopass.total_stats);
+ zero_stats(&cpi->twopass.total_left_stats);
if (!cpi->twopass.stats_in_end)
return;
- *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
- *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+ cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+ cpi->twopass.total_left_stats = cpi->twopass.total_stats;
// each frame can have a different duration, as the frame rate in the source
// isn't guaranteed to be constant. The frame rate prior to the first frame
// encoded in the second pass is a guess. However the sum duration is not.
// It's calculated based on the actual durations of all frames from the first
// pass.
- vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats->count / cpi->twopass.total_stats->duration);
+ vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
- cpi->output_frame_rate = cpi->oxcf.frame_rate;
- cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
- cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration * two_pass_min_rate / 10000000.0);
+ cpi->output_frame_rate = cpi->frame_rate;
+ cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
+ cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);
// Calculate a minimum intra value to be used in determining the IIratio
// scores used in the second pass. We have this minimum to make sure
@@ -1301,7 +1301,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
sum_iiratio += IIRatio;
}
- cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+ cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
// Reset file position
reset_fpf_position(cpi, start_pos);
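
total_stats, total_left_stats and this_frame_stats move from heap-backed pointers to structs embedded in the twopass state, so the -> accessors become . throughout and the initial copy is a plain struct assignment, saving a level of indirection (and whatever allocation previously backed the pointers):

    /* before: FIRSTPASS_STATS *total_stats;   heap, pointer access
       after:  FIRSTPASS_STATS  total_stats;   embedded, value access */
    cpi->twopass.total_stats      = *cpi->twopass.stats_in_end;  /* copy in   */
    cpi->twopass.total_left_stats =  cpi->twopass.total_stats;   /* copy over */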
@@ -1376,7 +1376,7 @@ static int detect_transition_to_still(
double loop_decay_rate,
double decay_accumulator )
{
- BOOL trans_to_still = FALSE;
+ int trans_to_still = 0;
// Break clause to detect very still sections after motion
// For example a static image after a fade or other transition
@@ -1406,7 +1406,7 @@ static int detect_transition_to_still(
// Only if it does do we signal a transition to still
if ( j == still_interval )
- trans_to_still = TRUE;
+ trans_to_still = 1;
}
return trans_to_still;
@@ -1415,14 +1415,14 @@ static int detect_transition_to_still(
// This function detects a flash through the high relative pcnt_second_ref
// score in the frame following a flash frame. The offset passed in should
// reflect this
-static BOOL detect_flash( VP8_COMP *cpi, int offset )
+static int detect_flash( VP8_COMP *cpi, int offset )
{
FIRSTPASS_STATS next_frame;
- BOOL flash_detected = FALSE;
+ int flash_detected = 0;
// Read the frame data.
- // The return is FALSE (no flash detected) if not a valid frame
+ // The return is 0 (no flash detected) if not a valid frame
if ( read_frame_stats(cpi, &next_frame, offset) != EOF )
{
// What we are looking for here is a situation where there is a
@@ -1433,7 +1433,7 @@ static BOOL detect_flash( VP8_COMP *cpi, int offset )
if ( (next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
(next_frame.pcnt_second_ref >= 0.5 ) )
{
- flash_detected = TRUE;
+ flash_detected = 1;
/*if (1)
{
@@ -1548,7 +1548,7 @@ static int calc_arf_boost(
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
double r;
- BOOL flash_detected = FALSE;
+ int flash_detected = 0;
// Search forward from the proposed arf/next gf position
for ( i = 0; i < f_frames; i++ )
@@ -1677,7 +1677,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
int alt_boost = 0;
int f_boost = 0;
int b_boost = 0;
- BOOL flash_detected;
+ int flash_detected;
cpi->twopass.gf_group_bits = 0;
cpi->twopass.gf_decay_rate = 0;
@@ -1751,7 +1751,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
loop_decay_rate,
decay_accumulator ) )
{
- allow_alt_ref = FALSE;
+ allow_alt_ref = 0;
boost_score = old_boost_score;
break;
}
@@ -1923,7 +1923,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
- cpi->source_alt_ref_pending = TRUE;
+ cpi->source_alt_ref_pending = 1;
// For alt ref frames the error score for the end frame of the
// group (the alt ref frame) should not contribute to the group
@@ -1949,7 +1949,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// Note: this_frame->frame has been updated in the loop
// so it now points at the ARF frame.
half_gf_int = cpi->baseline_gf_interval >> 1;
- frames_after_arf = cpi->twopass.total_stats->count -
+ frames_after_arf = cpi->twopass.total_stats.count -
this_frame->frame - 1;
switch (cpi->oxcf.arnr_type)
@@ -1989,13 +1989,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
}
else
{
- cpi->source_alt_ref_pending = FALSE;
+ cpi->source_alt_ref_pending = 0;
cpi->baseline_gf_interval = i;
}
}
else
{
- cpi->source_alt_ref_pending = FALSE;
+ cpi->source_alt_ref_pending = 0;
cpi->baseline_gf_interval = i;
}
@@ -2005,7 +2005,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
// This is also important for short clips where there may only be one
// key frame.
- if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+ if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
cpi->common.current_video_frame))
{
cpi->twopass.kf_group_bits =
@@ -2296,7 +2296,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
void vp8_second_pass(VP8_COMP *cpi)
{
int tmp_q;
- int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
+ int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
FIRSTPASS_STATS this_frame = {0};
FIRSTPASS_STATS this_frame_copy;
@@ -2341,7 +2341,7 @@ void vp8_second_pass(VP8_COMP *cpi)
cpi->twopass.gf_group_error_left = cpi->twopass.kf_group_error_left;
cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
- cpi->source_alt_ref_pending = FALSE;
+ cpi->source_alt_ref_pending = 0;
}
}
@@ -2411,7 +2411,7 @@ void vp8_second_pass(VP8_COMP *cpi)
// Account for mv, mode and other overheads.
overhead_bits = estimate_modemvcost(
- cpi, cpi->twopass.total_left_stats );
+ cpi, &cpi->twopass.total_left_stats );
// Special case code for first frame.
if (cpi->common.current_video_frame == 0)
@@ -2425,7 +2425,7 @@ void vp8_second_pass(VP8_COMP *cpi)
est_cq =
estimate_cq( cpi,
- cpi->twopass.total_left_stats,
+ &cpi->twopass.total_left_stats,
(int)(cpi->twopass.bits_left / frames_left),
overhead_bits );
@@ -2440,7 +2440,7 @@ void vp8_second_pass(VP8_COMP *cpi)
tmp_q = estimate_max_q(
cpi,
- cpi->twopass.total_left_stats,
+ &cpi->twopass.total_left_stats,
(int)(cpi->twopass.bits_left / frames_left),
overhead_bits );
@@ -2463,16 +2463,16 @@ void vp8_second_pass(VP8_COMP *cpi)
// radical adjustments to the allowed quantizer range just to use up a
// few surplus bits or get beneath the target rate.
else if ( (cpi->common.current_video_frame <
- (((unsigned int)cpi->twopass.total_stats->count * 255)>>8)) &&
+ (((unsigned int)cpi->twopass.total_stats.count * 255)>>8)) &&
((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
- (unsigned int)cpi->twopass.total_stats->count) )
+ (unsigned int)cpi->twopass.total_stats.count) )
{
if (frames_left < 1)
frames_left = 1;
tmp_q = estimate_max_q(
cpi,
- cpi->twopass.total_left_stats,
+ &cpi->twopass.total_left_stats,
(int)(cpi->twopass.bits_left / frames_left),
overhead_bits );
@@ -2489,13 +2489,13 @@ void vp8_second_pass(VP8_COMP *cpi)
cpi->twopass.frames_to_key --;
// Update the total stats remaining structure
- subtract_stats(cpi->twopass.total_left_stats, &this_frame );
+ subtract_stats(&cpi->twopass.total_left_stats, &this_frame );
}
-static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame)
+static int test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame)
{
- BOOL is_viable_kf = FALSE;
+ int is_viable_kf = 0;
// Does the frame satisfy the primary criteria of a key frame
// If so, then examine how well it predicts subsequent frames
@@ -2569,13 +2569,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
// If there is tolerable prediction for at least the next 3 frames then break out else discard this potential key frame and move on
if (boost_score > 5.0 && (i > 3))
- is_viable_kf = TRUE;
+ is_viable_kf = 1;
else
{
// Reset the file position
reset_fpf_position(cpi, start_pos);
- is_viable_kf = FALSE;
+ is_viable_kf = 0;
}
}
@@ -2611,7 +2611,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
cpi->this_key_frame_forced = cpi->next_key_frame_forced;
// Clear the alt ref active flag as this can never be active on a key frame
- cpi->source_alt_ref_active = FALSE;
+ cpi->source_alt_ref_active = 0;
// Kf is always a gf so clear frames till next gf counter
cpi->frames_till_gf_update_due = 0;
@@ -2727,10 +2727,10 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// Reset to the start of the group
reset_fpf_position(cpi, current_pos);
- cpi->next_key_frame_forced = TRUE;
+ cpi->next_key_frame_forced = 1;
}
else
- cpi->next_key_frame_forced = FALSE;
+ cpi->next_key_frame_forced = 0;
// Special case for the last frame of the file
if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
@@ -3034,8 +3034,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (cpi->oxcf.allow_spatial_resampling)
{
- int resample_trigger = FALSE;
- int last_kf_resampled = FALSE;
+ int resample_trigger = 0;
+ int last_kf_resampled = 0;
int kf_q;
int scale_val = 0;
int hr, hs, vr, vs;
@@ -3053,15 +3053,15 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double effective_size_ratio;
if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height))
- last_kf_resampled = TRUE;
+ last_kf_resampled = 1;
// Set back to unscaled by defaults
cpi->common.horiz_scale = NORMAL;
cpi->common.vert_scale = NORMAL;
// Calculate Average bits per frame.
- //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
- av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate);
+ //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
+ av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
//if ( av_bits_per_frame < 0.0 )
// av_bits_per_frame = 0.0
@@ -3117,21 +3117,21 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
(last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))))
//( ((cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))) &&
// ((projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))) ))
- resample_trigger = TRUE;
+ resample_trigger = 1;
else
- resample_trigger = FALSE;
+ resample_trigger = 0;
}
else
{
- int64_t clip_bits = (int64_t)(cpi->twopass.total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+ int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
if ((last_kf_resampled && (kf_q > cpi->worst_quality)) || // If triggered last time the threshold for triggering again is reduced
((kf_q > cpi->worst_quality) && // Projected Q higher than allowed and ...
(over_spend > clip_bits / 20))) // ... Overspend > 5% of total bits
- resample_trigger = TRUE;
+ resample_trigger = 1;
else
- resample_trigger = FALSE;
+ resample_trigger = 0;
}
diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c
index b92e82bdf..3e582e369 100644
--- a/vp8/encoder/lookahead.c
+++ b/vp8/encoder/lookahead.c
@@ -48,7 +48,7 @@ vp8_lookahead_destroy(struct lookahead_ctx *ctx)
{
if(ctx->buf)
{
- int i;
+ unsigned int i;
for(i = 0; i < ctx->max_sz; i++)
vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
@@ -65,7 +65,7 @@ vp8_lookahead_init(unsigned int width,
unsigned int depth)
{
struct lookahead_ctx *ctx = NULL;
- int i;
+ unsigned int i;
/* Clamp the lookahead queue depth */
if(depth < 1)
@@ -188,7 +188,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
struct lookahead_entry*
vp8_lookahead_peek(struct lookahead_ctx *ctx,
- int index)
+ unsigned int index)
{
struct lookahead_entry* buf = NULL;
diff --git a/vp8/encoder/lookahead.h b/vp8/encoder/lookahead.h
index afb3fd4a9..32bafcd63 100644
--- a/vp8/encoder/lookahead.h
+++ b/vp8/encoder/lookahead.h
@@ -92,7 +92,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
*/
struct lookahead_entry*
vp8_lookahead_peek(struct lookahead_ctx *ctx,
- int index);
+ unsigned int index);
/**\brief Get the number of frames currently in the lookahead queue
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index c1a0ea7bf..735af95ca 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -9,6 +9,7 @@
*/
+#include "onyx_int.h"
#include "mcomp.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_config.h"
@@ -182,8 +183,6 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
@@ -331,8 +330,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
#undef IFMVCV
#undef ERR
#undef CHECK_BETTER
-#undef MIN
-#undef MAX
+
int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
@@ -854,6 +852,8 @@ int vp8_hex_search
int k = -1;
int all_in;
int best_site = -1;
+ int hex_range = 127;
+ int dia_range = 8;
int_mv fcenter_mv;
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
@@ -873,6 +873,18 @@ int vp8_hex_search
in_what_stride, 0x7fffffff)
+ mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+#if CONFIG_MULTI_RES_ENCODING
+ /* Lower search range based on prediction info */
+ if (search_param >= 6) goto cal_neighbors;
+ else if (search_param >= 5) hex_range = 4;
+ else if (search_param >= 4) hex_range = 6;
+ else if (search_param >= 3) hex_range = 15;
+ else if (search_param >= 2) hex_range = 31;
+ else if (search_param >= 1) hex_range = 63;
+
+ dia_range = 8;
+#endif
+
// hex search
//j=0
CHECK_BOUNDS(2)
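
With CONFIG_MULTI_RES_ENCODING, the motion search can treat the lower-resolution encode's result as a prior, so higher search_param values progressively shrink the hex stage or skip it outright; the follow-up diamond refinement also drops from 32 to 8 iterations in all builds:

    /* search_param -> hex_range, per the branch ladder above:
       0 -> 127 (full), 1 -> 63, 2 -> 31, 3 -> 15, 4 -> 6, 5 -> 4,
       >= 6 -> goto cal_neighbors (hex stage skipped entirely) */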
@@ -909,7 +921,7 @@ int vp8_hex_search
k = best_site;
}
- for (j = 1; j < 127; j++)
+ for (j = 1; j < hex_range; j++)
{
best_site = -1;
CHECK_BOUNDS(2)
@@ -951,7 +963,7 @@ int vp8_hex_search
// check 4 1-away neighbors
cal_neighbors:
- for (j = 0; j < 32; j++)
+ for (j = 0; j < dia_range; j++)
{
best_site = -1;
CHECK_BOUNDS(1)
@@ -1144,7 +1156,7 @@ int vp8_diamond_search_sadx4
int tot_steps;
int_mv this_mv;
- int bestsad = INT_MAX;
+ unsigned int bestsad = UINT_MAX;
int best_site = 0;
int last_site = 0;
@@ -1385,7 +1397,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
unsigned char *bestaddress;
int_mv *best_mv = &d->bmi.mv;
int_mv this_mv;
- int bestsad = INT_MAX;
+ unsigned int bestsad = UINT_MAX;
int r, c;
unsigned char *check_here;
@@ -1515,7 +1527,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
unsigned char *bestaddress;
int_mv *best_mv = &d->bmi.mv;
int_mv this_mv;
- int bestsad = INT_MAX;
+ unsigned int bestsad = UINT_MAX;
int r, c;
unsigned char *check_here;
diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c
new file mode 100644
index 000000000..7a62a06ec
--- /dev/null
+++ b/vp8/encoder/mr_dissim.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_config.h"
+#include "onyx_int.h"
+#include "mr_dissim.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+
+void vp8_cal_low_res_mb_cols(VP8_COMP *cpi)
+{
+ int low_res_w;
+
+ /* Support arbitrary down-sampling factor */
+ unsigned int iw = cpi->oxcf.Width*cpi->oxcf.mr_down_sampling_factor.den
+ + cpi->oxcf.mr_down_sampling_factor.num - 1;
+
+ low_res_w = iw/cpi->oxcf.mr_down_sampling_factor.num;
+ cpi->mr_low_res_mb_cols = ((low_res_w + 15) >> 4);
+}
+
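A worked instance of the ceiling division in vp8_cal_low_res_mb_cols, assuming a hypothetical 640-pixel-wide source and a 3/2 down-sampling factor (num = 3, den = 2):

#include <stdio.h>

int main(void)
{
    unsigned int Width = 640, num = 3, den = 2;
    unsigned int iw = Width * den + num - 1;  /* bias numerator for ceiling */
    int low_res_w = iw / num;                 /* ceil(640 * 2 / 3) = 427 */
    int mb_cols = (low_res_w + 15) >> 4;      /* round up to whole MBs: 27 */
    printf("low_res_w=%d mb_cols=%d\n", low_res_w, mb_cols);
    return 0;
}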
+#define GET_MV(x) \
+if(x->mbmi.ref_frame !=INTRA_FRAME) \
+{ \
+ mvx[cnt] = x->mbmi.mv.as_mv.row; \
+ mvy[cnt] = x->mbmi.mv.as_mv.col; \
+ cnt++; \
+}
+
+#define GET_MV_SIGN(x) \
+if(x->mbmi.ref_frame !=INTRA_FRAME) \
+{ \
+ mvx[cnt] = x->mbmi.mv.as_mv.row; \
+ mvy[cnt] = x->mbmi.mv.as_mv.col; \
+ if (cm->ref_frame_sign_bias[x->mbmi.ref_frame] \
+ != cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]) \
+ { \
+ mvx[cnt] *= -1; \
+ mvy[cnt] *= -1; \
+ } \
+ cnt++; \
+}
+
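The only difference between GET_MV_SIGN and GET_MV is the sign-bias check; the reading assumed here is that ref_frame_sign_bias records which temporal side of the current frame each reference lies on, so a neighbor predicting from the opposite side has its vector negated before it is compared:

static void normalize_nbr_mv(int sign_bias_cur, int sign_bias_nbr,
                             int *mv_row, int *mv_col)
{
    if (sign_bias_cur != sign_bias_nbr)  /* opposite temporal direction */
    {
        *mv_row = -*mv_row;
        *mv_col = -*mv_col;
    }
}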
+void vp8_cal_dissimilarity(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ /* Note: The first row & first column in mip are outside the frame; they
+ * were initialized to all 0 (ref_frame, mode, mv...).
+ * Their ref_frame = 0 means they won't be counted in the following
+ * calculation.
+ */
+ if (cpi->oxcf.mr_total_resolutions >1
+ && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1))
+ {
+ /* Store info for show/no-show frames for supporting alt_ref.
+ * If the parent frame is an alt_ref, the child has one too.
+ */
+ if(cm->frame_type != KEY_FRAME)
+ {
+ int mb_row;
+ int mb_col;
+ /* Point to beginning of allocated MODE_INFO arrays. */
+ MODE_INFO *tmp = cm->mip + cm->mode_info_stride;
+ LOWER_RES_INFO* store_mode_info
+ = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+ {
+ tmp++;
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
+ {
+ int dissim = INT_MAX;
+
+ if(tmp->mbmi.ref_frame !=INTRA_FRAME)
+ {
+ int mvx[8];
+ int mvy[8];
+ int mmvx;
+ int mmvy;
+ int cnt=0;
+ const MODE_INFO *here = tmp;
+ const MODE_INFO *above = here - cm->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ const MODE_INFO *aboveright = NULL;
+ const MODE_INFO *right = NULL;
+ const MODE_INFO *belowleft = NULL;
+ const MODE_INFO *below = NULL;
+ const MODE_INFO *belowright = NULL;
+
+ /* If an alternate reference frame is used, we have to
+ * check the sign of the MV. */
+ if(cpi->oxcf.play_alternate)
+ {
+ /* Gather mv of neighboring MBs */
+ GET_MV_SIGN(above)
+ GET_MV_SIGN(left)
+ GET_MV_SIGN(aboveleft)
+
+ if(mb_col < (cm->mb_cols-1))
+ {
+ right = here + 1;
+ aboveright = above + 1;
+ GET_MV_SIGN(right)
+ GET_MV_SIGN(aboveright)
+ }
+
+ if(mb_row < (cm->mb_rows-1))
+ {
+ below = here + cm->mode_info_stride;
+ belowleft = below - 1;
+ GET_MV_SIGN(below)
+ GET_MV_SIGN(belowleft)
+ }
+
+ if(mb_col < (cm->mb_cols-1)
+ && mb_row < (cm->mb_rows-1))
+ {
+ belowright = below + 1;
+ GET_MV_SIGN(belowright)
+ }
+ }else
+ {
+ /* No alt_ref; gather mv of neighboring MBs */
+ GET_MV(above)
+ GET_MV(left)
+ GET_MV(aboveleft)
+
+ if(mb_col < (cm->mb_cols-1))
+ {
+ right = here + 1;
+ aboveright = above + 1;
+ GET_MV(right)
+ GET_MV(aboveright)
+ }
+
+ if(mb_row < (cm->mb_rows-1))
+ {
+ below = here + cm->mode_info_stride;
+ belowleft = below - 1;
+ GET_MV(below)
+ GET_MV(belowleft)
+ }
+
+ if(mb_col < (cm->mb_cols-1)
+ && mb_row < (cm->mb_rows-1))
+ {
+ belowright = below + 1;
+ GET_MV(belowright)
+ }
+ }
+
+ if (cnt > 0)
+ {
+ int max_mvx = mvx[0];
+ int min_mvx = mvx[0];
+ int max_mvy = mvy[0];
+ int min_mvy = mvy[0];
+ int i;
+
+ if (cnt > 1)
+ {
+ for (i=1; i< cnt; i++)
+ {
+ if (mvx[i] > max_mvx) max_mvx = mvx[i];
+ else if (mvx[i] < min_mvx) min_mvx = mvx[i];
+ if (mvy[i] > max_mvy) max_mvy = mvy[i];
+ else if (mvy[i] < min_mvy) min_mvy = mvy[i];
+ }
+ }
+
+ mmvx = MAX(abs(min_mvx - here->mbmi.mv.as_mv.row),
+ abs(max_mvx - here->mbmi.mv.as_mv.row));
+ mmvy = MAX(abs(min_mvy - here->mbmi.mv.as_mv.col),
+ abs(max_mvy - here->mbmi.mv.as_mv.col));
+ dissim = MAX(mmvx, mmvy);
+ }
+ }
+
+ /* Store mode info for next resolution encoding */
+ store_mode_info->mode = tmp->mbmi.mode;
+ store_mode_info->ref_frame = tmp->mbmi.ref_frame;
+ store_mode_info->mv.as_int = tmp->mbmi.mv.as_int;
+ store_mode_info->dissim = dissim;
+ tmp++;
+ store_mode_info++;
+ }
+ }
+ }
+ }
+}
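Distilled, the dissimilarity computed above is the largest L-infinity (Chebyshev) distance between the current MB's motion vector and any inter-coded neighbor's; the min/max bookkeeping is just an optimized way to get the same answer. A compact sketch under that reading:

#include <limits.h>
#include <stdlib.h>

/* nbr[i][0] = row, nbr[i][1] = col of the i-th inter-coded neighbor's MV */
static int dissim_sketch(const int (*nbr)[2], int cnt, int row, int col)
{
    int i, d = 0;
    for (i = 0; i < cnt; i++)
    {
        int dx = abs(nbr[i][0] - row);
        int dy = abs(nbr[i][1] - col);
        int m = dx > dy ? dx : dy;    /* Chebyshev distance to this neighbor */
        if (m > d)
            d = m;
    }
    return cnt ? d : INT_MAX;         /* INT_MAX when no inter neighbors */
}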
diff --git a/vp8/common/common_types.h b/vp8/encoder/mr_dissim.h
index 4e6248697..3d2c2035f 100644
--- a/vp8/common/common_types.h
+++ b/vp8/encoder/mr_dissim.h
@@ -9,10 +9,11 @@
*/
-#ifndef __INC_COMMON_TYPES
-#define __INC_COMMON_TYPES
+#ifndef __INC_MR_DISSIM_H
+#define __INC_MR_DISSIM_H
+#include "vpx_config.h"
-#define TRUE 1
-#define FALSE 0
+extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi);
+extern void vp8_cal_dissimilarity(VP8_COMP *cpi);
#endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 1d00e6777..e3f951925 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -23,7 +23,6 @@
#include "ratectrl.h"
#include "vp8/common/quant_common.h"
#include "segmentation.h"
-#include "vp8/common/g_common.h"
#include "vpx_scale/yv12extend.h"
#if CONFIG_POSTPROC
#include "vp8/common/postproc.h"
@@ -36,6 +35,9 @@
#if ARCH_ARM
#include "vpx_ports/arm.h"
#endif
+#if CONFIG_MULTI_RES_ENCODING
+#include "mr_dissim.h"
+#endif
#include <math.h>
#include <stdio.h>
@@ -67,6 +69,7 @@ extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_
#endif
int vp8_estimate_entropy_savings(VP8_COMP *cpi);
+
int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance);
@@ -355,7 +358,7 @@ static void dealloc_compressor_data(VP8_COMP *cpi)
vp8_de_alloc_frame_buffers(&cpi->common);
- vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
+ vp8_yv12_de_alloc_frame_buffer(&cpi->pick_lf_lvl_frame);
vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
#if VP8_TEMPORAL_ALT_REF
vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
@@ -377,42 +380,25 @@ static void dealloc_compressor_data(VP8_COMP *cpi)
vpx_free(cpi->mb.pip);
cpi->mb.pip = 0;
-
-#if !(CONFIG_REALTIME_ONLY)
- vpx_free(cpi->twopass.total_stats);
- cpi->twopass.total_stats = 0;
-
- vpx_free(cpi->twopass.total_left_stats);
- cpi->twopass.total_left_stats = 0;
-
- vpx_free(cpi->twopass.this_frame_stats);
- cpi->twopass.this_frame_stats = 0;
-#endif
}
-static void enable_segmentation(VP8_PTR ptr)
+static void enable_segmentation(VP8_COMP *cpi)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
-
// Set the appropriate feature bit
cpi->mb.e_mbd.segmentation_enabled = 1;
cpi->mb.e_mbd.update_mb_segmentation_map = 1;
cpi->mb.e_mbd.update_mb_segmentation_data = 1;
}
-static void disable_segmentation(VP8_PTR ptr)
+static void disable_segmentation(VP8_COMP *cpi)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
-
// Clear the appropriate feature bit
cpi->mb.e_mbd.segmentation_enabled = 0;
}
// Valid values for a segment are 0 to 3
// Segmentation map is arranged as [Rows][Columns]
-static void set_segmentation_map(VP8_PTR ptr, unsigned char *segmentation_map)
+static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
-
// Copy in the new segmentation map
vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
@@ -429,19 +415,15 @@ static void set_segmentation_map(VP8_PTR ptr, unsigned char *segmentation_map)
// abs_delta = SEGMENT_DELTADATA (use deltas) or abs_delta = SEGMENT_ABSDATA (use the absolute values given).
//
//
-static void set_segment_data(VP8_PTR ptr, signed char *feature_data, unsigned char abs_delta)
+static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
-
cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
}
-static void segmentation_test_function(VP8_PTR ptr)
+static void segmentation_test_function(VP8_COMP *cpi)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
-
unsigned char *seg_map;
signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
@@ -469,10 +451,10 @@ static void segmentation_test_function(VP8_PTR ptr)
}*/
// Set the segmentation Map
- set_segmentation_map(ptr, seg_map);
+ set_segmentation_map(cpi, seg_map);
// Activate segmentation.
- enable_segmentation(ptr);
+ enable_segmentation(cpi);
// Set up the quant segment data
feature_data[MB_LVL_ALT_Q][0] = 0;
@@ -487,7 +469,7 @@ static void segmentation_test_function(VP8_PTR ptr)
// Initialise the feature data structure
// SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
- set_segment_data(ptr, &feature_data[0][0], SEGMENT_DELTADATA);
+ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
// Delete segmentation map
vpx_free(seg_map);
@@ -561,10 +543,10 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
}
// Set the segmentation Map
- set_segmentation_map((VP8_PTR)cpi, seg_map);
+ set_segmentation_map(cpi, seg_map);
// Activate segmentation.
- enable_segmentation((VP8_PTR)cpi);
+ enable_segmentation(cpi);
// Set up the quant segment data
feature_data[MB_LVL_ALT_Q][0] = 0;
@@ -580,7 +562,7 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
// Initialise the feature data structure
// SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
- set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
// Delete segmentation map
vpx_free(seg_map);
@@ -609,6 +591,93 @@ static void set_default_lf_deltas(VP8_COMP *cpi)
cpi->mb.e_mbd.mode_lf_deltas[3] = 4; // Split mv
}
+/* Convenience macros for mapping speed and mode into a continuous
+ * range
+ */
+#define GOOD(x) (x+1)
+#define RT(x) (x+7)
+
+static int speed_map(int speed, int *map)
+{
+ int res;
+
+ do
+ {
+ res = *map++;
+ } while(speed >= *map++);
+ return res;
+}
+
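The map arrays that follow alternate a result value with the speed threshold that advances to the next value, and speed_map() above walks them in pairs. A self-contained trace (the GOOD/RT macros are restated with parentheses for safety in this sketch):

#include <assert.h>
#include <limits.h>

#define GOOD_(x) ((x) + 1)
#define RT_(x)   ((x) + 7)

static int speed_map_sketch(int speed, const int *map)
{
    int res;
    do
        res = *map++;              /* take the current value...            */
    while (speed >= *map++);       /* ...and advance while speed allows it */
    return res;
}

int main(void)
{
    const int znn[] = { 0, GOOD_(2), 1500, GOOD_(3), 2000,
                        RT_(0), 1000, RT_(2), 2000, INT_MAX };
    assert(speed_map_sketch(GOOD_(2), znn) == 1500); /* good-quality speed 2 */
    assert(speed_map_sketch(RT_(1), znn) == 1000);   /* realtime speed 1 */
    return 0;
}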
+static int thresh_mult_map_znn[] = {
+ /* map common to zero, nearest, and near */
+ 0, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(2), 2000, INT_MAX
+};
+
+static int thresh_mult_map_vhpred[] = {
+ 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(1), 2000,
+ RT(7), INT_MAX, INT_MAX
+};
+
+static int thresh_mult_map_bpred[] = {
+ 2000, GOOD(0), 2500, GOOD(2), 5000, GOOD(3), 7500, RT(0), 2500, RT(1), 5000,
+ RT(6), INT_MAX, INT_MAX
+};
+
+static int thresh_mult_map_tm[] = {
+ 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 0, RT(1), 1000, RT(2), 2000,
+ RT(7), INT_MAX, INT_MAX
+};
+
+static int thresh_mult_map_new1[] = {
+ 1000, GOOD(2), 2000, RT(0), 2000, INT_MAX
+};
+
+static int thresh_mult_map_new2[] = {
+ 1000, GOOD(2), 2000, GOOD(3), 2500, GOOD(5), 4000, RT(0), 2000, RT(2), 2500,
+ RT(5), 4000, INT_MAX
+};
+
+static int thresh_mult_map_split1[] = {
+ 2500, GOOD(0), 1700, GOOD(2), 10000, GOOD(3), 25000, GOOD(4), INT_MAX,
+ RT(0), 5000, RT(1), 10000, RT(2), 25000, RT(3), INT_MAX, INT_MAX
+};
+
+static int thresh_mult_map_split2[] = {
+ 5000, GOOD(0), 4500, GOOD(2), 20000, GOOD(3), 50000, GOOD(4), INT_MAX,
+ RT(0), 10000, RT(1), 20000, RT(2), 50000, RT(3), INT_MAX, INT_MAX
+};
+
+static int mode_check_freq_map_zn2[] = {
+ /* {zero,nearest}{2,3} */
+ 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX
+};
+
+static int mode_check_freq_map_vhbpred[] = {
+ 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX
+};
+
+static int mode_check_freq_map_near2[] = {
+ 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(10), 1<<2, RT(11), 1<<3, RT(12), 1<<4,
+ INT_MAX
+};
+
+static int mode_check_freq_map_new1[] = {
+ 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX
+};
+
+static int mode_check_freq_map_new2[] = {
+ 0, GOOD(5), 4, RT(0), 0, RT(3), 4, RT(10), 1<<3, RT(11), 1<<4, RT(12), 1<<5,
+ INT_MAX
+};
+
+static int mode_check_freq_map_split1[] = {
+ 0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX
+};
+
+static int mode_check_freq_map_split2[] = {
+ 0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX
+};
+
void vp8_set_speed_features(VP8_COMP *cpi)
{
SPEED_FEATURES *sf = &cpi->sf;
@@ -617,6 +686,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
int i;
VP8_COMMON *cm = &cpi->common;
int last_improved_quant = sf->improved_quant;
+ int ref_frames;
// Initialise default mode frequency sampling variables
for (i = 0; i < MAX_MODES; i ++)
@@ -650,93 +720,90 @@ void vp8_set_speed_features(VP8_COMP *cpi)
for (i = 0; i < MAX_MODES; i++)
sf->thresh_mult[i] = 0;
+ /* Count enabled references */
+ ref_frames = 1;
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ ref_frames++;
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ ref_frames++;
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ ref_frames++;
+
+ /* Convert speed to continuous range, with clamping */
+ if (Mode == 0)
+ Speed = 0;
+ else if (Mode == 2)
+ Speed = RT(Speed);
+ else
+ {
+ if (Speed > 5)
+ Speed = 5;
+ Speed = GOOD(Speed);
+ }
+
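Restated as a pure function (illustrative only): best-quality mode pins the continuous speed to 0, realtime mode lands on RT(0) = 7 and above, and good-quality speeds saturate at GOOD(5) = 6:

static int continuous_speed(int Mode, int Speed)
{
    if (Mode == 0)
        return 0;                          /* best quality: always slowest */
    if (Mode == 2)
        return Speed + 7;                  /* RT(Speed) */
    return (Speed > 5 ? 5 : Speed) + 1;    /* GOOD(min(Speed, 5)) */
}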
+ sf->thresh_mult[THR_ZERO1] =
+ sf->thresh_mult[THR_NEAREST1] =
+ sf->thresh_mult[THR_NEAR1] =
+ sf->thresh_mult[THR_DC] = 0; /* always */
+
+ sf->thresh_mult[THR_ZERO2] =
+ sf->thresh_mult[THR_ZERO3] =
+ sf->thresh_mult[THR_NEAREST2] =
+ sf->thresh_mult[THR_NEAREST3] =
+ sf->thresh_mult[THR_NEAR2] =
+ sf->thresh_mult[THR_NEAR3] = speed_map(Speed, thresh_mult_map_znn);
+
+ sf->thresh_mult[THR_V_PRED] =
+ sf->thresh_mult[THR_H_PRED] = speed_map(Speed, thresh_mult_map_vhpred);
+ sf->thresh_mult[THR_B_PRED] = speed_map(Speed, thresh_mult_map_bpred);
+ sf->thresh_mult[THR_TM] = speed_map(Speed, thresh_mult_map_tm);
+ sf->thresh_mult[THR_NEW1] = speed_map(Speed, thresh_mult_map_new1);
+ sf->thresh_mult[THR_NEW2] =
+ sf->thresh_mult[THR_NEW3] = speed_map(Speed, thresh_mult_map_new2);
+ sf->thresh_mult[THR_SPLIT1] = speed_map(Speed, thresh_mult_map_split1);
+ sf->thresh_mult[THR_SPLIT2] =
+ sf->thresh_mult[THR_SPLIT3] = speed_map(Speed, thresh_mult_map_split2);
+
+ cpi->mode_check_freq[THR_ZERO1] =
+ cpi->mode_check_freq[THR_NEAREST1] =
+ cpi->mode_check_freq[THR_NEAR1] =
+ cpi->mode_check_freq[THR_TM] =
+ cpi->mode_check_freq[THR_DC] = 0; /* always */
+
+ cpi->mode_check_freq[THR_ZERO2] =
+ cpi->mode_check_freq[THR_ZERO3] =
+ cpi->mode_check_freq[THR_NEAREST2] =
+ cpi->mode_check_freq[THR_NEAREST3] = speed_map(Speed,
+ mode_check_freq_map_zn2);
+
+ cpi->mode_check_freq[THR_NEAR2] =
+ cpi->mode_check_freq[THR_NEAR3] = speed_map(Speed,
+ mode_check_freq_map_near2);
+
+ cpi->mode_check_freq[THR_V_PRED] =
+ cpi->mode_check_freq[THR_H_PRED] =
+ cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed,
+ mode_check_freq_map_vhbpred);
+ cpi->mode_check_freq[THR_NEW1] = speed_map(Speed,
+ mode_check_freq_map_new1);
+ cpi->mode_check_freq[THR_NEW2] =
+ cpi->mode_check_freq[THR_NEW3] = speed_map(Speed,
+ mode_check_freq_map_new2);
+ cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed,
+ mode_check_freq_map_split1);
+ cpi->mode_check_freq[THR_SPLIT2] =
+ cpi->mode_check_freq[THR_SPLIT3] = speed_map(Speed,
+ mode_check_freq_map_split2);
+ Speed = cpi->Speed;
switch (Mode)
{
#if !(CONFIG_REALTIME_ONLY)
case 0: // best quality mode
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_ZEROG ] = 0;
- sf->thresh_mult[THR_ZEROA ] = 0;
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_NEARG ] = 0;
- sf->thresh_mult[THR_NEARA ] = 0;
-
- sf->thresh_mult[THR_DC ] = 0;
-
- sf->thresh_mult[THR_V_PRED ] = 1000;
- sf->thresh_mult[THR_H_PRED ] = 1000;
- sf->thresh_mult[THR_B_PRED ] = 2000;
- sf->thresh_mult[THR_TM ] = 1000;
-
- sf->thresh_mult[THR_NEWMV ] = 1000;
- sf->thresh_mult[THR_NEWG ] = 1000;
- sf->thresh_mult[THR_NEWA ] = 1000;
-
- sf->thresh_mult[THR_SPLITMV ] = 2500;
- sf->thresh_mult[THR_SPLITG ] = 5000;
- sf->thresh_mult[THR_SPLITA ] = 5000;
-
-
sf->first_step = 0;
sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
break;
case 1:
case 3:
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_DC ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_V_PRED ] = 1000;
- sf->thresh_mult[THR_H_PRED ] = 1000;
- sf->thresh_mult[THR_B_PRED ] = 2500;
- sf->thresh_mult[THR_TM ] = 1000;
-
- sf->thresh_mult[THR_NEARESTG ] = 1000;
- sf->thresh_mult[THR_NEARESTA ] = 1000;
-
- sf->thresh_mult[THR_ZEROG ] = 1000;
- sf->thresh_mult[THR_ZEROA ] = 1000;
- sf->thresh_mult[THR_NEARG ] = 1000;
- sf->thresh_mult[THR_NEARA ] = 1000;
-
-#if 1
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_ZEROG ] = 0;
- sf->thresh_mult[THR_ZEROA ] = 0;
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_NEARG ] = 0;
- sf->thresh_mult[THR_NEARA ] = 0;
-
-// sf->thresh_mult[THR_DC ] = 0;
-
-// sf->thresh_mult[THR_V_PRED ] = 1000;
-// sf->thresh_mult[THR_H_PRED ] = 1000;
-// sf->thresh_mult[THR_B_PRED ] = 2000;
-// sf->thresh_mult[THR_TM ] = 1000;
-
- sf->thresh_mult[THR_NEWMV ] = 1000;
- sf->thresh_mult[THR_NEWG ] = 1000;
- sf->thresh_mult[THR_NEWA ] = 1000;
-
- sf->thresh_mult[THR_SPLITMV ] = 1700;
- sf->thresh_mult[THR_SPLITG ] = 4500;
- sf->thresh_mult[THR_SPLITA ] = 4500;
-#else
- sf->thresh_mult[THR_NEWMV ] = 1500;
- sf->thresh_mult[THR_NEWG ] = 1500;
- sf->thresh_mult[THR_NEWA ] = 1500;
-
- sf->thresh_mult[THR_SPLITMV ] = 5000;
- sf->thresh_mult[THR_SPLITG ] = 10000;
- sf->thresh_mult[THR_SPLITA ] = 10000;
-#endif
-
if (Speed > 0)
{
/* Disable coefficient optimization above speed 0 */
@@ -745,83 +812,10 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->no_skip_block4x4_search = 0;
sf->first_step = 1;
-
- cpi->mode_check_freq[THR_SPLITG] = 2;
- cpi->mode_check_freq[THR_SPLITA] = 2;
- cpi->mode_check_freq[THR_SPLITMV] = 0;
- }
-
- if (Speed > 1)
- {
- cpi->mode_check_freq[THR_SPLITG] = 4;
- cpi->mode_check_freq[THR_SPLITA] = 4;
- cpi->mode_check_freq[THR_SPLITMV] = 2;
-
- sf->thresh_mult[THR_TM ] = 1500;
- sf->thresh_mult[THR_V_PRED ] = 1500;
- sf->thresh_mult[THR_H_PRED ] = 1500;
- sf->thresh_mult[THR_B_PRED ] = 5000;
-
- if (cpi->ref_frame_flags & VP8_LAST_FLAG)
- {
- sf->thresh_mult[THR_NEWMV ] = 2000;
- sf->thresh_mult[THR_SPLITMV ] = 10000;
- }
-
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- sf->thresh_mult[THR_NEARESTG ] = 1500;
- sf->thresh_mult[THR_ZEROG ] = 1500;
- sf->thresh_mult[THR_NEARG ] = 1500;
- sf->thresh_mult[THR_NEWG ] = 2000;
- sf->thresh_mult[THR_SPLITG ] = 20000;
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- sf->thresh_mult[THR_NEARESTA ] = 1500;
- sf->thresh_mult[THR_ZEROA ] = 1500;
- sf->thresh_mult[THR_NEARA ] = 1500;
- sf->thresh_mult[THR_NEWA ] = 2000;
- sf->thresh_mult[THR_SPLITA ] = 20000;
- }
}
if (Speed > 2)
{
- cpi->mode_check_freq[THR_SPLITG] = 15;
- cpi->mode_check_freq[THR_SPLITA] = 15;
- cpi->mode_check_freq[THR_SPLITMV] = 7;
-
- sf->thresh_mult[THR_TM ] = 2000;
- sf->thresh_mult[THR_V_PRED ] = 2000;
- sf->thresh_mult[THR_H_PRED ] = 2000;
- sf->thresh_mult[THR_B_PRED ] = 7500;
-
- if (cpi->ref_frame_flags & VP8_LAST_FLAG)
- {
- sf->thresh_mult[THR_NEWMV ] = 2000;
- sf->thresh_mult[THR_SPLITMV ] = 25000;
- }
-
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- sf->thresh_mult[THR_NEARESTG ] = 2000;
- sf->thresh_mult[THR_ZEROG ] = 2000;
- sf->thresh_mult[THR_NEARG ] = 2000;
- sf->thresh_mult[THR_NEWG ] = 2500;
- sf->thresh_mult[THR_SPLITG ] = 50000;
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- sf->thresh_mult[THR_NEARESTA ] = 2000;
- sf->thresh_mult[THR_ZEROA ] = 2000;
- sf->thresh_mult[THR_NEARA ] = 2000;
- sf->thresh_mult[THR_NEWA ] = 2500;
- sf->thresh_mult[THR_SPLITA ] = 50000;
- }
-
sf->improved_quant = 0;
sf->improved_dct = 0;
@@ -833,18 +827,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (Speed > 3)
{
- sf->thresh_mult[THR_SPLITA ] = INT_MAX;
- sf->thresh_mult[THR_SPLITG ] = INT_MAX;
- sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
-
- cpi->mode_check_freq[THR_V_PRED] = 0;
- cpi->mode_check_freq[THR_H_PRED] = 0;
- cpi->mode_check_freq[THR_B_PRED] = 0;
- cpi->mode_check_freq[THR_NEARG] = 0;
- cpi->mode_check_freq[THR_NEWG] = 0;
- cpi->mode_check_freq[THR_NEARA] = 0;
- cpi->mode_check_freq[THR_NEWA] = 0;
-
sf->auto_filter = 1;
sf->recode_loop = 0; // recode loop off
sf->RD = 0; // Turn rd off
@@ -854,38 +836,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (Speed > 4)
{
sf->auto_filter = 0; // Faster selection of loop filter
-
- cpi->mode_check_freq[THR_V_PRED] = 2;
- cpi->mode_check_freq[THR_H_PRED] = 2;
- cpi->mode_check_freq[THR_B_PRED] = 2;
-
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- cpi->mode_check_freq[THR_NEARG] = 2;
- cpi->mode_check_freq[THR_NEWG] = 4;
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- cpi->mode_check_freq[THR_NEARA] = 2;
- cpi->mode_check_freq[THR_NEWA] = 4;
- }
-
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- sf->thresh_mult[THR_NEARESTG ] = 2000;
- sf->thresh_mult[THR_ZEROG ] = 2000;
- sf->thresh_mult[THR_NEARG ] = 2000;
- sf->thresh_mult[THR_NEWG ] = 4000;
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- sf->thresh_mult[THR_NEARESTA ] = 2000;
- sf->thresh_mult[THR_ZEROA ] = 2000;
- sf->thresh_mult[THR_NEARA ] = 2000;
- sf->thresh_mult[THR_NEWA ] = 4000;
- }
}
break;
@@ -895,67 +845,10 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->recode_loop = 0;
sf->auto_filter = 1;
sf->iterative_sub_pixel = 1;
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_DC ] = 0;
- sf->thresh_mult[THR_TM ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_V_PRED ] = 1000;
- sf->thresh_mult[THR_H_PRED ] = 1000;
- sf->thresh_mult[THR_B_PRED ] = 2500;
- sf->thresh_mult[THR_NEARESTG ] = 1000;
- sf->thresh_mult[THR_ZEROG ] = 1000;
- sf->thresh_mult[THR_NEARG ] = 1000;
- sf->thresh_mult[THR_NEARESTA ] = 1000;
- sf->thresh_mult[THR_ZEROA ] = 1000;
- sf->thresh_mult[THR_NEARA ] = 1000;
- sf->thresh_mult[THR_NEWMV ] = 2000;
- sf->thresh_mult[THR_NEWG ] = 2000;
- sf->thresh_mult[THR_NEWA ] = 2000;
- sf->thresh_mult[THR_SPLITMV ] = 5000;
- sf->thresh_mult[THR_SPLITG ] = 10000;
- sf->thresh_mult[THR_SPLITA ] = 10000;
sf->search_method = NSTEP;
if (Speed > 0)
{
- cpi->mode_check_freq[THR_SPLITG] = 4;
- cpi->mode_check_freq[THR_SPLITA] = 4;
- cpi->mode_check_freq[THR_SPLITMV] = 2;
-
- sf->thresh_mult[THR_DC ] = 0;
- sf->thresh_mult[THR_TM ] = 1000;
- sf->thresh_mult[THR_V_PRED ] = 2000;
- sf->thresh_mult[THR_H_PRED ] = 2000;
- sf->thresh_mult[THR_B_PRED ] = 5000;
-
- if (cpi->ref_frame_flags & VP8_LAST_FLAG)
- {
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_NEWMV ] = 2000;
- sf->thresh_mult[THR_SPLITMV ] = 10000;
- }
-
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- sf->thresh_mult[THR_NEARESTG ] = 1000;
- sf->thresh_mult[THR_ZEROG ] = 1000;
- sf->thresh_mult[THR_NEARG ] = 1000;
- sf->thresh_mult[THR_NEWG ] = 2000;
- sf->thresh_mult[THR_SPLITG ] = 20000;
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- sf->thresh_mult[THR_NEARESTA ] = 1000;
- sf->thresh_mult[THR_ZEROA ] = 1000;
- sf->thresh_mult[THR_NEARA ] = 1000;
- sf->thresh_mult[THR_NEWA ] = 2000;
- sf->thresh_mult[THR_SPLITA ] = 20000;
- }
-
sf->improved_quant = 0;
sf->improved_dct = 0;
@@ -964,133 +857,28 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->first_step = 1;
}
- if (Speed > 1)
- {
- cpi->mode_check_freq[THR_SPLITMV] = 7;
- cpi->mode_check_freq[THR_SPLITG] = 15;
- cpi->mode_check_freq[THR_SPLITA] = 15;
-
- sf->thresh_mult[THR_TM ] = 2000;
- sf->thresh_mult[THR_V_PRED ] = 2000;
- sf->thresh_mult[THR_H_PRED ] = 2000;
- sf->thresh_mult[THR_B_PRED ] = 5000;
-
- if (cpi->ref_frame_flags & VP8_LAST_FLAG)
- {
- sf->thresh_mult[THR_NEWMV ] = 2000;
- sf->thresh_mult[THR_SPLITMV ] = 25000;
- }
-
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- sf->thresh_mult[THR_NEARESTG ] = 2000;
- sf->thresh_mult[THR_ZEROG ] = 2000;
- sf->thresh_mult[THR_NEARG ] = 2000;
- sf->thresh_mult[THR_NEWG ] = 2500;
- sf->thresh_mult[THR_SPLITG ] = 50000;
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- sf->thresh_mult[THR_NEARESTA ] = 2000;
- sf->thresh_mult[THR_ZEROA ] = 2000;
- sf->thresh_mult[THR_NEARA ] = 2000;
- sf->thresh_mult[THR_NEWA ] = 2500;
- sf->thresh_mult[THR_SPLITA ] = 50000;
- }
-
- }
-
if (Speed > 2)
- {
sf->auto_filter = 0; // Faster selection of loop filter
- cpi->mode_check_freq[THR_V_PRED] = 2;
- cpi->mode_check_freq[THR_H_PRED] = 2;
- cpi->mode_check_freq[THR_B_PRED] = 2;
-
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- cpi->mode_check_freq[THR_NEARG] = 2;
- cpi->mode_check_freq[THR_NEWG] = 4;
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- cpi->mode_check_freq[THR_NEARA] = 2;
- cpi->mode_check_freq[THR_NEWA] = 4;
- }
-
- sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
- sf->thresh_mult[THR_SPLITG ] = INT_MAX;
- sf->thresh_mult[THR_SPLITA ] = INT_MAX;
-
- }
-
if (Speed > 3)
{
sf->RD = 0;
-
sf->auto_filter = 1;
}
if (Speed > 4)
{
sf->auto_filter = 0; // Faster selection of loop filter
-
sf->search_method = HEX;
- //sf->search_method = DIAMOND;
-
sf->iterative_sub_pixel = 0;
-
- cpi->mode_check_freq[THR_V_PRED] = 4;
- cpi->mode_check_freq[THR_H_PRED] = 4;
- cpi->mode_check_freq[THR_B_PRED] = 4;
-
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- cpi->mode_check_freq[THR_NEARG] = 2;
- cpi->mode_check_freq[THR_NEWG] = 4;
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- cpi->mode_check_freq[THR_NEARA] = 2;
- cpi->mode_check_freq[THR_NEWA] = 4;
- }
-
- sf->thresh_mult[THR_TM ] = 2000;
- sf->thresh_mult[THR_B_PRED ] = 5000;
-
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- sf->thresh_mult[THR_NEARESTG ] = 2000;
- sf->thresh_mult[THR_ZEROG ] = 2000;
- sf->thresh_mult[THR_NEARG ] = 2000;
- sf->thresh_mult[THR_NEWG ] = 4000;
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- sf->thresh_mult[THR_NEARESTA ] = 2000;
- sf->thresh_mult[THR_ZEROA ] = 2000;
- sf->thresh_mult[THR_NEARA ] = 2000;
- sf->thresh_mult[THR_NEWA ] = 4000;
- }
- }
-
- if (Speed > 5)
- {
- // Disable split MB intra prediction mode
- sf->thresh_mult[THR_B_PRED] = INT_MAX;
}
if (Speed > 6)
{
- unsigned int i, sum = 0;
+ unsigned int sum = 0;
unsigned int total_mbs = cm->MBs;
- int thresh;
- int total_skip;
+ int i, thresh;
+ unsigned int total_skip;
int min = 2000;
@@ -1122,109 +910,53 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (thresh < 2000)
thresh = 2000;
- if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ if (ref_frames > 1)
{
- sf->thresh_mult[THR_NEWMV] = thresh;
- sf->thresh_mult[THR_NEARESTMV ] = thresh >> 1;
- sf->thresh_mult[THR_NEARMV ] = thresh >> 1;
+ sf->thresh_mult[THR_NEW1 ] = thresh;
+ sf->thresh_mult[THR_NEAREST1 ] = thresh >> 1;
+ sf->thresh_mult[THR_NEAR1 ] = thresh >> 1;
}
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ if (ref_frames > 2)
{
- sf->thresh_mult[THR_NEWG] = thresh << 1;
- sf->thresh_mult[THR_NEARESTG ] = thresh;
- sf->thresh_mult[THR_NEARG ] = thresh;
+ sf->thresh_mult[THR_NEW2] = thresh << 1;
+ sf->thresh_mult[THR_NEAREST2 ] = thresh;
+ sf->thresh_mult[THR_NEAR2 ] = thresh;
}
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ if (ref_frames > 3)
{
- sf->thresh_mult[THR_NEWA] = thresh << 1;
- sf->thresh_mult[THR_NEARESTA ] = thresh;
- sf->thresh_mult[THR_NEARA ] = thresh;
+ sf->thresh_mult[THR_NEW3] = thresh << 1;
+ sf->thresh_mult[THR_NEAREST3 ] = thresh;
+ sf->thresh_mult[THR_NEAR3 ] = thresh;
}
- // Disable other intra prediction modes
- sf->thresh_mult[THR_TM] = INT_MAX;
- sf->thresh_mult[THR_V_PRED] = INT_MAX;
- sf->thresh_mult[THR_H_PRED] = INT_MAX;
-
sf->improved_mv_pred = 0;
}
if (Speed > 8)
- {
sf->quarter_pixel_search = 0;
- }
- if (Speed > 9)
+ if(cm->version == 0)
{
- int Tmp = cpi->Speed - 8;
-
- if (Tmp > 4)
- Tmp = 4;
+ cm->filter_type = NORMAL_LOOPFILTER;
- if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
- {
- cpi->mode_check_freq[THR_ZEROG] = 1 << (Tmp - 1);
- cpi->mode_check_freq[THR_NEARESTG] = 1 << (Tmp - 1);
- cpi->mode_check_freq[THR_NEARG] = 1 << Tmp;
- cpi->mode_check_freq[THR_NEWG] = 1 << (Tmp + 1);
- }
-
- if (cpi->ref_frame_flags & VP8_ALT_FLAG)
- {
- cpi->mode_check_freq[THR_ZEROA] = 1 << (Tmp - 1);
- cpi->mode_check_freq[THR_NEARESTA] = 1 << (Tmp - 1);
- cpi->mode_check_freq[THR_NEARA] = 1 << Tmp;
- cpi->mode_check_freq[THR_NEWA] = 1 << (Tmp + 1);
- }
-
- cpi->mode_check_freq[THR_NEWMV] = 1 << (Tmp - 1);
+ if (Speed >= 14)
+ cm->filter_type = SIMPLE_LOOPFILTER;
}
-
- cm->filter_type = NORMAL_LOOPFILTER;
-
- if (Speed >= 14)
+ else
+ {
cm->filter_type = SIMPLE_LOOPFILTER;
+ }
+ // This has a big hit on quality. Last resort
if (Speed >= 15)
- {
- sf->half_pixel_search = 0; // This has a big hit on quality. Last resort
- }
+ sf->half_pixel_search = 0;
vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins));
}; /* switch */
- /* disable frame modes if flags not set */
- if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
- {
- sf->thresh_mult[THR_NEWMV ] = INT_MAX;
- sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
- sf->thresh_mult[THR_ZEROMV ] = INT_MAX;
- sf->thresh_mult[THR_NEARMV ] = INT_MAX;
- sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
- }
-
- if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
- {
- sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
- sf->thresh_mult[THR_ZEROG ] = INT_MAX;
- sf->thresh_mult[THR_NEARG ] = INT_MAX;
- sf->thresh_mult[THR_NEWG ] = INT_MAX;
- sf->thresh_mult[THR_SPLITG ] = INT_MAX;
- }
-
- if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
- {
- sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
- sf->thresh_mult[THR_ZEROA ] = INT_MAX;
- sf->thresh_mult[THR_NEARA ] = INT_MAX;
- sf->thresh_mult[THR_NEWA ] = INT_MAX;
- sf->thresh_mult[THR_SPLITA ] = INT_MAX;
- }
-
-
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
if ( cpi->pass == 1 )
@@ -1306,6 +1038,9 @@ void vp8_set_speed_features(VP8_COMP *cpi)
frames_at_speed[cpi->Speed]++;
#endif
}
+#undef GOOD
+#undef RT
+
static void alloc_raw_frame_buffers(VP8_COMP *cpi)
{
int width = (cpi->oxcf.Width + 15) & ~15;
@@ -1365,7 +1100,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
height += 16 - (height & 0xf);
- if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
+ if (vp8_yv12_alloc_frame_buffer(&cpi->pick_lf_lvl_frame,
width, height, VP8BORDERINPIXELS))
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate last frame buffer");
@@ -1406,25 +1141,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
vpx_calloc(sizeof(unsigned int),
cm->mb_rows * cm->mb_cols));
-#if !(CONFIG_REALTIME_ONLY)
- vpx_free(cpi->twopass.total_stats);
-
- cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
- vpx_free(cpi->twopass.total_left_stats);
- cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
- vpx_free(cpi->twopass.this_frame_stats);
-
- cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
- if( !cpi->twopass.total_stats ||
- !cpi->twopass.total_left_stats ||
- !cpi->twopass.this_frame_stats)
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate firstpass stats");
-#endif
-
#if CONFIG_MULTITHREAD
if (width < 640)
cpi->mt_sync_range = 1;
@@ -1436,7 +1152,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
cpi->mt_sync_range = 16;
#endif
- vpx_free(cpi->tplist);
+ vpx_free(cpi->tplist);
CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
}
@@ -1470,8 +1186,8 @@ void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
if(framerate < .1)
framerate = 30;
- cpi->oxcf.frame_rate = framerate;
- cpi->output_frame_rate = cpi->oxcf.frame_rate;
+ cpi->frame_rate = framerate;
+ cpi->output_frame_rate = framerate;
cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
cpi->output_frame_rate);
cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
@@ -1513,9 +1229,8 @@ rescale(int val, int num, int denom)
}
-static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
+static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
VP8_COMMON *cm = &cpi->common;
cpi->oxcf = *oxcf;
@@ -1527,8 +1242,20 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cm->version = oxcf->Version;
vp8_setup_version(cm);
+ /* frame rate is not available on the first frame, as it's derived from
+ * the observed timestamps. The actual value used here doesn't matter
+ * too much, as it will adapt quickly. If the reciprocal of the timebase
+ * seems like a reasonable framerate, then use that as a guess, otherwise
+ * use 30.
+ */
+ cpi->frame_rate = (double)(oxcf->timebase.den) /
+ (double)(oxcf->timebase.num);
+
+ if (cpi->frame_rate > 180)
+ cpi->frame_rate = 30;
+
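Two hypothetical timebases run through that guess: 1/30 gives a plausible 30 fps and is kept, while 1/90000 (a 90 kHz media clock) yields 90000 fps, trips the > 180 check, and falls back to 30:

#include <stdio.h>

int main(void)
{
    const struct { int num, den; } tb[2] = { { 1, 30 }, { 1, 90000 } };
    int i;
    for (i = 0; i < 2; i++)
    {
        double fr = (double)tb[i].den / (double)tb[i].num;
        if (fr > 180)
            fr = 30;   /* implausible as a frame rate: use the default */
        printf("%d/%d -> %.1f fps\n", tb[i].num, tb[i].den, fr);
    }
    return 0;
}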
// change includes all joint functionality
- vp8_change_config(ptr, oxcf);
+ vp8_change_config(cpi, oxcf);
// Initialize active best and worst q and average q values.
cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
@@ -1550,8 +1277,8 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
// Temporal scalability
if (cpi->oxcf.number_of_layers > 1)
{
- int i;
- int prev_layer_frame_rate=0;
+ unsigned int i;
+ double prev_layer_frame_rate=0;
for (i=0; i<cpi->oxcf.number_of_layers; i++)
{
@@ -1619,9 +1346,8 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
}
-void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
+void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
VP8_COMMON *cm = &cpi->common;
if (!cpi)
@@ -1787,7 +1513,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cpi->oxcf.target_bandwidth, 1000);
// Set up frame rate and related parameters rate control values.
- vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+ vp8_new_frame_rate(cpi, cpi->frame_rate);
// Set absolute upper and lower quality limits
cpi->worst_quality = cpi->oxcf.worst_allowed_q;
@@ -1813,7 +1539,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
}
- cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+ cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0;
cpi->cq_target_quality = cpi->oxcf.cq_level;
@@ -1912,19 +1638,14 @@ static void cal_mvsadcosts(int *mvsadcost[2])
while (++i <= mvfp_max);
}
-VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
+struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
{
int i;
- volatile union
- {
- VP8_COMP *cpi;
- VP8_PTR ptr;
- } ctx;
VP8_COMP *cpi;
VP8_COMMON *cm;
- cpi = ctx.cpi = vpx_memalign(32, sizeof(VP8_COMP));
+ cpi = vpx_memalign(32, sizeof(VP8_COMP));
// Check that the CPI instance is valid
if (!cpi)
return 0;
@@ -1935,10 +1656,8 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
if (setjmp(cm->error.jmp))
{
- VP8_PTR ptr = ctx.ptr;
-
- ctx.cpi->common.error.setjmp = 0;
- vp8_remove_compressor(&ptr);
+ cpi->common.error.setjmp = 0;
+ vp8_remove_compressor(&cpi);
return 0;
}
@@ -1949,7 +1668,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
vp8_create_common(&cpi->common);
vp8_cmachine_specific_config(cpi);
- init_config((VP8_PTR)cpi, oxcf);
+ init_config(cpi, oxcf);
memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
cpi->common.current_video_frame = 0;
@@ -2028,7 +1747,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->cyclic_refresh_map = (signed char *) NULL;
// Test function for segmentation
- //segmentation_test_function((VP8_PTR) cpi);
+ //segmentation_test_function( cpi);
#ifdef ENTROPY_STATS
init_context_counters();
@@ -2039,11 +1758,11 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->frames_since_key = 8; // Give a sensible default for the first frame.
cpi->key_frame_frequency = cpi->oxcf.key_freq;
- cpi->this_key_frame_forced = FALSE;
- cpi->next_key_frame_forced = FALSE;
+ cpi->this_key_frame_forced = 0;
+ cpi->next_key_frame_forced = 0;
- cpi->source_alt_ref_pending = FALSE;
- cpi->source_alt_ref_active = FALSE;
+ cpi->source_alt_ref_pending = 0;
+ cpi->source_alt_ref_active = 0;
cpi->common.refresh_alt_ref_frame = 0;
cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
@@ -2241,14 +1960,21 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
vp8_loop_filter_init(cm);
cpi->common.error.setjmp = 0;
- return (VP8_PTR) cpi;
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* Calculate # of MBs in a row in lower-resolution level image. */
+ if (cpi->oxcf.mr_encoder_id > 0)
+ vp8_cal_low_res_mb_cols(cpi);
+#endif
+
+ return cpi;
}
-void vp8_remove_compressor(VP8_PTR *ptr)
+void vp8_remove_compressor(VP8_COMP **ptr)
{
- VP8_COMP *cpi = (VP8_COMP *)(*ptr);
+ VP8_COMP *cpi = *ptr;
if (!cpi)
return;
@@ -2408,7 +2134,7 @@ void vp8_remove_compressor(VP8_PTR *ptr)
{
extern int count_mb_seg[4];
FILE *f = fopen("modes.stt", "a");
- double dr = (double)cpi->oxcf.frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+ double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
fprintf(f, "intra_mode in Intra Frames:\n");
fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
@@ -2659,20 +2385,16 @@ static void generate_psnr_packet(VP8_COMP *cpi)
}
-int vp8_use_as_reference(VP8_PTR ptr, int ref_frame_flags)
+int vp8_use_as_reference(VP8_COMP *cpi, int ref_frame_flags)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
-
if (ref_frame_flags > 7)
return -1 ;
cpi->ref_frame_flags = ref_frame_flags;
return 0;
}
-int vp8_update_reference(VP8_PTR ptr, int ref_frame_flags)
+int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
-
if (ref_frame_flags > 7)
return -1 ;
@@ -2692,9 +2414,8 @@ int vp8_update_reference(VP8_PTR ptr, int ref_frame_flags)
return 0;
}
-int vp8_get_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+int vp8_get_reference(VP8_COMP *cpi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
VP8_COMMON *cm = &cpi->common;
int ref_fb_idx;
@@ -2711,9 +2432,8 @@ int vp8_get_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONF
return 0;
}
-int vp8_set_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+int vp8_set_reference(VP8_COMP *cpi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
{
- VP8_COMP *cpi = (VP8_COMP *)(ptr);
VP8_COMMON *cm = &cpi->common;
int ref_fb_idx;
@@ -2731,9 +2451,8 @@ int vp8_set_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONF
return 0;
}
-int vp8_update_entropy(VP8_PTR comp, int update)
+int vp8_update_entropy(VP8_COMP *cpi, int update)
{
- VP8_COMP *cpi = (VP8_COMP *) comp;
VP8_COMMON *cm = &cpi->common;
cm->refresh_entropy_probs = update;
@@ -2889,10 +2608,10 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi)
cpi->common.frames_since_golden = 0;
// Clear the alternate reference update pending flag.
- cpi->source_alt_ref_pending = FALSE;
+ cpi->source_alt_ref_pending = 0;
// Set the alternate reference frame active flag
- cpi->source_alt_ref_active = TRUE;
+ cpi->source_alt_ref_active = 1;
}
@@ -2955,12 +2674,12 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
if (cpi->oxcf.fixed_q >= 0 &&
cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame)
{
- cpi->source_alt_ref_pending = TRUE;
+ cpi->source_alt_ref_pending = 1;
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
}
if (!cpi->source_alt_ref_pending)
- cpi->source_alt_ref_active = FALSE;
+ cpi->source_alt_ref_active = 0;
// Decrement count down till next gf
if (cpi->frames_till_gf_update_due > 0)
@@ -2994,8 +2713,7 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
{
VP8_COMMON *cm = &cpi->common;
-#if 0
- const int *const rfct = cpi->recent_ref_frame_usage;
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
const int rf_intra = rfct[INTRA_FRAME];
const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
@@ -3007,100 +2725,10 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
}
else if (!(rf_intra + rf_inter))
{
- // This is a trap in case this function is called with cpi->recent_ref_frame_usage[] blank.
cpi->prob_intra_coded = 63;
cpi->prob_last_coded = 128;
cpi->prob_gf_coded = 128;
}
- else
- {
- cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
-
- if (cpi->prob_intra_coded < 1)
- cpi->prob_intra_coded = 1;
-
- if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active)
- {
- cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
-
- if (cpi->prob_last_coded < 1)
- cpi->prob_last_coded = 1;
-
- cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
- ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
-
- if (cpi->prob_gf_coded < 1)
- cpi->prob_gf_coded = 1;
- }
- }
-
-#else
- const int *const rfct = cpi->count_mb_ref_frame_usage;
- const int rf_intra = rfct[INTRA_FRAME];
- const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
-
- if (cm->frame_type == KEY_FRAME)
- {
- cpi->prob_intra_coded = 255;
- cpi->prob_last_coded = 128;
- cpi->prob_gf_coded = 128;
- }
- else if (!(rf_intra + rf_inter))
- {
- if (cpi->oxcf.number_of_layers > 1)
- {
- if (cpi->ref_frame_flags == VP8_LAST_FLAG)
- {
- cpi->prob_intra_coded = 63;
- cpi->prob_last_coded = 255;
- cpi->prob_gf_coded = 128;
- }
- else if (cpi->ref_frame_flags == VP8_GOLD_FLAG)
- {
- cpi->prob_intra_coded = 63;
- cpi->prob_last_coded = 1;
- cpi->prob_gf_coded = 255;
- }
- else if (cpi->ref_frame_flags == VP8_ALT_FLAG)
- {
- cpi->prob_intra_coded = 63;
- cpi->prob_last_coded = 1;
- cpi->prob_gf_coded = 1;
- }
- else
- {
- cpi->prob_intra_coded = 63;
- cpi->prob_last_coded = 128;
- cpi->prob_gf_coded = 128;
- }
- }
- else
- {
- // This is a trap in case this function is called with
- // cpi->recent_ref_frame_usage[] blank.
- cpi->prob_intra_coded = 63;
- cpi->prob_last_coded = 128;
- cpi->prob_gf_coded = 128;
- }
- }
- else
- {
- cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
-
- if (cpi->prob_intra_coded < 1)
- cpi->prob_intra_coded = 1;
-
- cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
-
- if (cpi->prob_last_coded < 1)
- cpi->prob_last_coded = 1;
-
- cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
- ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
-
- if (cpi->prob_gf_coded < 1)
- cpi->prob_gf_coded = 1;
- }
// update reference frame costs since we can do better than what we got last frame.
if (cpi->oxcf.number_of_layers == 1)
@@ -3114,7 +2742,6 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
else if (cpi->common.frames_since_golden == 0)
{
cpi->prob_last_coded = 214;
- cpi->prob_gf_coded = 1;
}
else if (cpi->common.frames_since_golden == 1)
{
@@ -3123,14 +2750,14 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
}
else if (cpi->source_alt_ref_active)
{
- //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden;
cpi->prob_gf_coded -= 20;
if (cpi->prob_gf_coded < 10)
cpi->prob_gf_coded = 10;
}
+ if (!cpi->source_alt_ref_active)
+ cpi->prob_gf_coded = 255;
}
-#endif
}
@@ -3139,12 +2766,12 @@ static int decide_key_frame(VP8_COMP *cpi)
{
VP8_COMMON *cm = &cpi->common;
- int code_key_frame = FALSE;
+ int code_key_frame = 0;
cpi->kf_boost = 0;
if (cpi->Speed > 11)
- return FALSE;
+ return 0;
// Clear down mmx registers
vp8_clear_system_state(); //__asm emms;
@@ -3186,10 +2813,10 @@ static int decide_key_frame(VP8_COMP *cpi)
&& (change > .25 || change2 > .25))
{
/*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/
- return TRUE;
+ return 1;
}
- return FALSE;
+ return 0;
}
@@ -3199,7 +2826,7 @@ static int decide_key_frame(VP8_COMP *cpi)
((cpi->this_frame_percent_intra > 95) &&
(cpi->this_frame_percent_intra >= (cpi->last_frame_percent_intra + 5))))
{
- code_key_frame = TRUE;
+ code_key_frame = 1;
}
// in addition if the following are true and this is not a golden frame then code a key frame
// Note that on golden frames there often seems to be a pop in intra usage anyway, hence this
@@ -3212,7 +2839,7 @@ static int decide_key_frame(VP8_COMP *cpi)
(cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 10))))
{
if (!cm->refresh_golden_frame)
- code_key_frame = TRUE;
+ code_key_frame = 1;
}
return code_key_frame;
@@ -3268,11 +2895,11 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
// Function to test for conditions that indicate we should loop
// back and recode a frame.
-static BOOL recode_loop_test( VP8_COMP *cpi,
+static int recode_loop_test( VP8_COMP *cpi,
int high_limit, int low_limit,
int q, int maxq, int minq )
{
- BOOL force_recode = FALSE;
+ int force_recode = 0;
VP8_COMMON *cm = &cpi->common;
// Is frame recode allowed at all
@@ -3288,7 +2915,7 @@ static BOOL recode_loop_test( VP8_COMP *cpi,
if ( ((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
((cpi->projected_frame_size < low_limit) && (q > minq)) )
{
- force_recode = TRUE;
+ force_recode = 1;
}
// Special Constrained quality tests
else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
@@ -3298,14 +2925,14 @@ static BOOL recode_loop_test( VP8_COMP *cpi,
(cpi->projected_frame_size <
((cpi->this_frame_target * 7) >> 3)))
{
- force_recode = TRUE;
+ force_recode = 1;
}
// Severe undershoot and between auto and user cq level
else if ( (q > cpi->oxcf.cq_level) &&
(cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
(cpi->active_best_quality > cpi->oxcf.cq_level))
{
- force_recode = TRUE;
+ force_recode = 1;
cpi->active_best_quality = cpi->oxcf.cq_level;
}
}
@@ -3456,7 +3083,7 @@ static void encode_frame_to_data_rate
int frame_over_shoot_limit;
int frame_under_shoot_limit;
- int Loop = FALSE;
+ int Loop = 0;
int loop_count;
int this_q;
int last_zbin_oq;
@@ -3468,10 +3095,10 @@ static void encode_frame_to_data_rate
int top_index;
int bottom_index;
VP8_COMMON *cm = &cpi->common;
- int active_worst_qchanged = FALSE;
+ int active_worst_qchanged = 0;
- int overshoot_seen = FALSE;
- int undershoot_seen = FALSE;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
int drop_mark = cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100;
int drop_mark75 = drop_mark * 2 / 3;
int drop_mark50 = drop_mark / 4;
@@ -3482,7 +3109,7 @@ static void encode_frame_to_data_rate
vp8_clear_system_state();
// Test code for segmentation of gf/arf (0,0)
- //segmentation_test_function((VP8_PTR) cpi);
+ //segmentation_test_function( cpi);
if (cpi->compressor_speed == 2)
{
@@ -3522,12 +3149,12 @@ static void encode_frame_to_data_rate
// Enable or disable mode based tweaking of the zbin
// For 2 Pass Only used where GF/ARF prediction quality
// is above a threshold
- cpi->zbin_mode_boost_enabled = TRUE;
+ cpi->zbin_mode_boost_enabled = 1;
if (cpi->pass == 2)
{
if ( cpi->gfu_boost <= 400 )
{
- cpi->zbin_mode_boost_enabled = FALSE;
+ cpi->zbin_mode_boost_enabled = 0;
}
}
@@ -3568,7 +3195,7 @@ static void encode_frame_to_data_rate
}
// The alternate reference frame cannot be active for a key frame
- cpi->source_alt_ref_active = FALSE;
+ cpi->source_alt_ref_active = 0;
// Reset the RD threshold multipliers to default of * 1 (128)
for (i = 0; i < MAX_MODES; i++)
@@ -3580,9 +3207,9 @@ static void encode_frame_to_data_rate
// Test code for segmentation
//if ( (cm->frame_type == KEY_FRAME) || ((cm->current_video_frame % 2) == 0))
//if ( (cm->current_video_frame % 2) == 0 )
- // enable_segmentation((VP8_PTR)cpi);
+ // enable_segmentation(cpi);
//else
- // disable_segmentation((VP8_PTR)cpi);
+ // disable_segmentation(cpi);
#if 0
// Experimental code for lagged compress and one pass
@@ -3676,7 +3303,7 @@ static void encode_frame_to_data_rate
if (cpi->oxcf.number_of_layers > 1)
{
- int i;
+ unsigned int i;
// Propagate bits saved by dropping the frame to higher layers
for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
@@ -4080,7 +3707,7 @@ static void encode_frame_to_data_rate
vp8_pick_frame_size(cpi);
// Clear the Alt reference frame active flag when we have a key frame
- cpi->source_alt_ref_active = FALSE;
+ cpi->source_alt_ref_active = 0;
// Reset the loop filter deltas and segmentation map
setup_features(cpi);
@@ -4105,7 +3732,7 @@ static void encode_frame_to_data_rate
q_high = cpi->active_worst_quality;
loop_count++;
- Loop = TRUE;
+ Loop = 1;
continue;
}
@@ -4133,10 +3760,10 @@ static void encode_frame_to_data_rate
}
// If we have updated the active max Q do not call vp8_update_rate_correction_factors() this loop.
- active_worst_qchanged = TRUE;
+ active_worst_qchanged = 1;
}
else
- active_worst_qchanged = FALSE;
+ active_worst_qchanged = 0;
#if !(CONFIG_REALTIME_ONLY)
// Special case handling for forced key frames
@@ -4172,7 +3799,7 @@ static void encode_frame_to_data_rate
else if (Q < q_low)
Q = q_low;
- Loop = ((Q != last_q)) ? TRUE : FALSE;
+ Loop = Q != last_q;
}
// Is the projected frame size out of range and are we allowed to attempt to recode.
@@ -4229,7 +3856,7 @@ static void encode_frame_to_data_rate
}
}
- overshoot_seen = TRUE;
+ overshoot_seen = 1;
}
// Frame is too small
else
@@ -4279,7 +3906,7 @@ static void encode_frame_to_data_rate
}
}
- undershoot_seen = TRUE;
+ undershoot_seen = 1;
}
// Clamp Q to upper and lower limits:
@@ -4291,18 +3918,18 @@ static void encode_frame_to_data_rate
// Clamp cpi->zbin_over_quant
cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? zbin_oq_high : cpi->zbin_over_quant;
- //Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
- Loop = ((Q != last_q)) ? TRUE : FALSE;
+ //Loop = (Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant);
+ Loop = Q != last_q;
last_zbin_oq = cpi->zbin_over_quant;
}
else
#endif
- Loop = FALSE;
+ Loop = 0;
if (cpi->is_src_frame_alt_ref)
- Loop = FALSE;
+ Loop = 0;
- if (Loop == TRUE)
+ if (Loop == 1)
{
vp8_restore_coding_context(cpi);
loop_count++;
@@ -4311,7 +3938,7 @@ static void encode_frame_to_data_rate
#endif
}
}
- while (Loop == TRUE);
+ while (Loop == 1);
#if 0
// Experimental code for lagged and one pass
@@ -4345,13 +3972,20 @@ static void encode_frame_to_data_rate
IF_RTCD(&cpi->rtcd.variance));
}
- // This frame's MVs are saved and will be used in next frame's MV prediction.
- // Last frame has one more line(add to bottom) and one more column(add to right) than cm->mip. The edge elements are initialized to 0.
- if(cm->show_frame) //do not save for altref frame
+ /* This frame's MVs are saved and will be used in the next frame's MV
+ * predictor. Last frame has one more line (added to the bottom) and one
+ * more column (added to the right) than cm->mip. The edge elements are
+ * initialized to 0.
+ */
+#if CONFIG_MULTI_RES_ENCODING
+ if(!cpi->oxcf.mr_encoder_id && cm->show_frame)
+#else
+ if(cm->show_frame) /* do not save for altref frame */
+#endif
{
int mb_row;
int mb_col;
- MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays.
+ /* Point to beginning of allocated MODE_INFO arrays. */
+ MODE_INFO *tmp = cm->mip;
if(cm->frame_type != KEY_FRAME)
{
@@ -4370,6 +4004,10 @@ static void encode_frame_to_data_rate
}
}
+#if CONFIG_MULTI_RES_ENCODING
+ vp8_cal_dissimilarity(cpi);
+#endif
+
// Update the GF usage maps.
// This is done after completing the compression of a frame when all
// modes etc. are finalized but before loop filter
@@ -4442,7 +4080,7 @@ static void encode_frame_to_data_rate
if (cpi->oxcf.number_of_layers > 1)
{
- int i;
+ unsigned int i;
for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
cpi->layer_context[i].total_byte_count += (*size);
}
@@ -4509,7 +4147,7 @@ static void encode_frame_to_data_rate
(cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) &&
(cpi->projected_frame_size > (4 * cpi->this_frame_target)))
{
- cpi->drop_frame = TRUE;
+ cpi->drop_frame = 1;
}
#endif
@@ -4553,7 +4191,7 @@ static void encode_frame_to_data_rate
// Propagate values to higher temporal layers
if (cpi->oxcf.number_of_layers > 1)
{
- int i;
+ unsigned int i;
for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
{
@@ -4856,7 +4494,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
{
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
*cpi->oxcf.two_pass_vbrmin_section / 100);
- cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate);
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate);
}
}
#endif
@@ -4868,12 +4506,11 @@ extern void vp8_pop_neon(int64_t *store);
#endif
-int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time)
+int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time)
{
#if HAVE_ARMV7
int64_t store_reg[8];
#endif
- VP8_COMP *cpi = (VP8_COMP *) ptr;
VP8_COMMON *cm = &cpi->common;
struct vpx_usec_timer timer;
int res = 0;
@@ -4922,13 +4559,12 @@ static int frame_is_reference(const VP8_COMP *cpi)
}
-int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush)
+int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush)
{
#if HAVE_ARMV7
int64_t store_reg[8];
#endif
- VP8_COMP *cpi = (VP8_COMP *) ptr;
- VP8_COMMON *cm = &cpi->common;
+ VP8_COMMON *cm;
struct vpx_usec_timer tsctimer;
struct vpx_usec_timer ticktimer;
struct vpx_usec_timer cmptimer;
@@ -4937,12 +4573,14 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
if (!cpi)
return -1;
- if (setjmp(cpi->common.error.jmp)){
+ cm = &cpi->common;
+
+ if (setjmp(cpi->common.error.jmp))
+ {
cpi->common.error.setjmp = 0;
return VPX_CODEC_CORRUPT_FRAME;
}
- cpi->bc.error = &cpi->common.error;
cpi->common.error.setjmp = 1;
#if HAVE_ARMV7
@@ -4979,7 +4617,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
cm->refresh_golden_frame = 0;
cm->refresh_last_frame = 0;
cm->show_frame = 0;
- cpi->source_alt_ref_pending = FALSE; // Clear Pending alt Ref flag.
+ cpi->source_alt_ref_pending = 0; // Clear Pending alt Ref flag.
cpi->is_src_frame_alt_ref = 0;
}
}
@@ -5092,7 +4730,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
if(interval > 10000000.0)
interval = 10000000;
- avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
+ avg_duration = 10000000.0 / cpi->frame_rate;
avg_duration *= (interval - avg_duration + this_duration);
avg_duration /= interval;
@@ -5200,6 +4838,17 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
}
+ // Save the contexts separately for alt ref, gold and last.
+ // (TODO jbb: optimize this with pointers to avoid extra copies.)
+ if(cm->refresh_alt_ref_frame)
+ vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
+
+ if(cm->refresh_golden_frame)
+ vpx_memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
+
+ if(cm->refresh_last_frame)
+ vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
+
// if its a dropped frame honor the requests on subsequent frames
if (*size > 0)
{
@@ -5400,10 +5049,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
return 0;
}
-int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags)
+int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags)
{
- VP8_COMP *cpi = (VP8_COMP *) comp;
-
if (cpi->common.refresh_alt_ref_frame)
return -1;
else
@@ -5432,9 +5079,8 @@ int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflag
}
}
-int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4])
+int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4])
{
- VP8_COMP *cpi = (VP8_COMP *) comp;
signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
@@ -5442,15 +5088,15 @@ int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned
if (!map)
{
- disable_segmentation((VP8_PTR)cpi);
+ disable_segmentation(cpi);
return 0;
}
// Set the segmentation Map
- set_segmentation_map((VP8_PTR)cpi, map);
+ set_segmentation_map(cpi, map);
// Activate segmentation.
- enable_segmentation((VP8_PTR)cpi);
+ enable_segmentation(cpi);
// Set up the quant segment data
feature_data[MB_LVL_ALT_Q][0] = delta_q[0];
@@ -5471,15 +5117,13 @@ int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned
// Initialise the feature data structure
// SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
- set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
return 0;
}
-int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols)
+int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols)
{
- VP8_COMP *cpi = (VP8_COMP *) comp;
-
if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols)
{
if (map)
@@ -5499,10 +5143,8 @@ int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsi
}
}
-int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode)
+int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode)
{
- VP8_COMP *cpi = (VP8_COMP *) comp;
-
if (horiz_mode <= ONETWO)
cpi->common.horiz_scale = horiz_mode;
else
@@ -5544,8 +5186,7 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const
}
-int vp8_get_quantizer(VP8_PTR c)
+int vp8_get_quantizer(VP8_COMP *cpi)
{
- VP8_COMP *cpi = (VP8_COMP *) c;
return cpi->common.base_qindex;
}
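
The hunks above save a separate entropy context per reference buffer (lfc_a, lfc_g, lfc_n) after each refresh, and the rdopt.c hunks further below select among them when building token costs. A minimal sketch of that round trip, using the types and fields this patch introduces (the helper names are hypothetical; the copies mirror the patch's own vpx_memcpy calls):

/* Save side: after encoding, remember the updated statistics for each
 * reference buffer this frame refreshes. */
static void save_ref_contexts(VP8_COMP *cpi, VP8_COMMON *cm)
{
    if (cm->refresh_alt_ref_frame)
        vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
    if (cm->refresh_golden_frame)
        vpx_memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
    if (cm->refresh_last_frame)
        vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
}

/* Select side: pick the saved context that matches the frame type being
 * coded now, falling back to the last-frame context. */
static FRAME_CONTEXT *select_ref_context(VP8_COMP *cpi)
{
    if (cpi->common.refresh_alt_ref_frame)
        return &cpi->lfc_a;
    if (cpi->common.refresh_golden_frame)
        return &cpi->lfc_g;
    return &cpi->lfc_n;
}

On key frames, vp8_setup_key_frame() (see the ratectrl.c hunks below) seeds all three contexts from the default coefficient probabilities so every reference starts from the same baseline.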
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index a0828a479..46951e3b9 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -58,6 +58,9 @@
#define MAX_PERIODICITY 16
+#define MAX(x,y) (((x)>(y))?(x):(y))
+#define MIN(x,y) (((x)<(y))?(x):(y))
+
typedef struct
{
int kf_indicated;
@@ -133,32 +136,32 @@ typedef struct
typedef enum
{
- THR_ZEROMV = 0,
+ THR_ZERO1 = 0,
THR_DC = 1,
- THR_NEARESTMV = 2,
- THR_NEARMV = 3,
+ THR_NEAREST1 = 2,
+ THR_NEAR1 = 3,
- THR_ZEROG = 4,
- THR_NEARESTG = 5,
+ THR_ZERO2 = 4,
+ THR_NEAREST2 = 5,
- THR_ZEROA = 6,
- THR_NEARESTA = 7,
+ THR_ZERO3 = 6,
+ THR_NEAREST3 = 7,
- THR_NEARG = 8,
- THR_NEARA = 9,
+ THR_NEAR2 = 8,
+ THR_NEAR3 = 9,
THR_V_PRED = 10,
THR_H_PRED = 11,
THR_TM = 12,
- THR_NEWMV = 13,
- THR_NEWG = 14,
- THR_NEWA = 15,
+ THR_NEW1 = 13,
+ THR_NEW2 = 14,
+ THR_NEW3 = 15,
- THR_SPLITMV = 16,
- THR_SPLITG = 17,
- THR_SPLITA = 18,
+ THR_SPLIT1 = 16,
+ THR_SPLIT2 = 17,
+ THR_SPLIT3 = 18,
THR_B_PRED = 19,
}
@@ -256,7 +259,7 @@ typedef struct
int buffer_level;
int bits_off_target;
- long long total_actual_bits;
+ int64_t total_actual_bits;
int total_target_vs_actual;
int worst_quality;
@@ -276,7 +279,7 @@ typedef struct
int zbin_over_quant;
int inter_frame_target;
- INT64 total_byte_count;
+ int64_t total_byte_count;
int filter_level;
@@ -314,8 +317,7 @@ typedef struct VP8_COMP
MACROBLOCK mb;
VP8_COMMON common;
- vp8_writer bc, bc2;
- // bool_writer *bc2;
+ vp8_writer bc[9]; // one boolcoder for each partition
VP8_CONFIG oxcf;
@@ -337,7 +339,7 @@ typedef struct VP8_COMP
int gold_is_alt; // don't do both alt and gold search ( just do gold).
//int refresh_alt_ref_frame;
- YV12_BUFFER_CONFIG last_frame_uf;
+ YV12_BUFFER_CONFIG pick_lf_lvl_frame;
TOKENEXTRA *tok;
unsigned int tok_count;
@@ -418,6 +420,7 @@ typedef struct VP8_COMP
int buffered_mode;
+ double frame_rate;
int64_t buffer_level;
int bits_off_target;
@@ -565,16 +568,21 @@ typedef struct VP8_COMP
int base_skip_false_prob[128];
+ FRAME_CONTEXT lfc_n; /* last frame entropy */
+ FRAME_CONTEXT lfc_a; /* last alt ref entropy */
+ FRAME_CONTEXT lfc_g; /* last gold ref entropy */
+
+
struct twopass_rc
{
unsigned int section_intra_rating;
double section_max_qfactor;
unsigned int next_iiratio;
unsigned int this_iiratio;
- FIRSTPASS_STATS *total_stats;
- FIRSTPASS_STATS *this_frame_stats;
+ FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
- FIRSTPASS_STATS *total_left_stats;
+ FIRSTPASS_STATS total_left_stats;
int first_pass_done;
int64_t bits_left;
int64_t clip_bits_total;
@@ -665,8 +673,8 @@ typedef struct VP8_COMP
unsigned int current_layer;
LAYER_CONTEXT layer_context[MAX_LAYERS];
- long long frames_in_layer[MAX_LAYERS];
- long long bytes_in_layer[MAX_LAYERS];
+ int64_t frames_in_layer[MAX_LAYERS];
+ int64_t bytes_in_layer[MAX_LAYERS];
double sum_psnr[MAX_LAYERS];
double sum_psnr_p[MAX_LAYERS];
double total_error2[MAX_LAYERS];
@@ -679,6 +687,11 @@ typedef struct VP8_COMP
double total_ssimg_v_in_layer[MAX_LAYERS];
double total_ssimg_all_in_layer[MAX_LAYERS];
+#if CONFIG_MULTI_RES_ENCODING
+ /* Number of MBs per row at lower-resolution level */
+ int mr_low_res_mb_cols;
+#endif
+
} VP8_COMP;
void control_data_rate(VP8_COMP *cpi);
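
The THR_* renaming above pairs with the integer vp8_ref_frame_order[] table in the rdopt.c hunks below: mode thresholds are now keyed by reference-frame priority slot (1, 2, 3) rather than a fixed LAST/GOLDEN/ARF identity. A sketch of the indirection, condensed into a hypothetical helper (the mode loops in pickinter.c/rdopt.c do this inline):

static int mode_ref_frame(VP8_COMP *cpi, int mode_index)
{
    int ref_frame_map[4];
    int i = 0;

    /* Slot 0 is always intra; enabled references fill slots 1..3 in
     * LAST, GOLDEN, ALTREF order; unused slots are disabled with -1. */
    ref_frame_map[i++] = INTRA_FRAME;
    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
        ref_frame_map[i++] = LAST_FRAME;
    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
        ref_frame_map[i++] = GOLDEN_FRAME;
    if (cpi->ref_frame_flags & VP8_ALT_FLAG)
        ref_frame_map[i++] = ALTREF_FRAME;
    for (; i < 4; i++)
        ref_frame_map[i] = -1;

    /* vp8_ref_frame_order[mode_index] is now a slot number (0..3), so
     * THR_NEW1 pairs with whichever reference landed in slot 1, and a
     * mode whose slot is empty returns -1 and is skipped. */
    return ref_frame_map[vp8_ref_frame_order[mode_index]];
}

With this scheme, disabling LAST no longer disables the slot-1 modes: GOLDEN simply moves up into slot 1 and inherits the THR_*1 thresholds.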
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 4d8734137..46f53a18d 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -21,7 +21,6 @@
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra.h"
#include "vp8/common/reconintra4x4.h"
-#include "vp8/common/g_common.h"
#include "variance.h"
#include "mcomp.h"
#include "rdopt.h"
@@ -39,7 +38,7 @@ extern int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd);
extern unsigned int cnt_pm;
#endif
-extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];
+extern const int vp8_ref_frame_order[MAX_MODES];
extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
@@ -402,9 +401,68 @@ static void update_mvcount(VP8_COMP *cpi, MACROBLOCKD *xd, int_mv *best_ref_mv)
}
}
+
+#if CONFIG_MULTI_RES_ENCODING
+static
+void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, int *dissim,
+ int *parent_ref_frame,
+ MB_PREDICTION_MODE *parent_mode,
+ int_mv *parent_ref_mv, int mb_row, int mb_col)
+{
+ LOWER_RES_INFO* store_mode_info
+ = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info;
+ unsigned int parent_mb_index;
+ //unsigned int parent_mb_index = map_640x480_to_320x240[mb_row][mb_col];
+
+ /* Consider different down_sampling_factor. */
+ {
+ /* TODO: Removed the loop that supports special down_sampling_factor
+ * such as 2, 4, 8. Will revisit it if needed.
+ * Should also try using a look-up table to see if it helps
+ * performance. */
+ int round = cpi->oxcf.mr_down_sampling_factor.num/2;
+ int parent_mb_row, parent_mb_col;
+
+ parent_mb_row = (mb_row*cpi->oxcf.mr_down_sampling_factor.den+round)
+ /cpi->oxcf.mr_down_sampling_factor.num;
+ parent_mb_col = (mb_col*cpi->oxcf.mr_down_sampling_factor.den+round)
+ /cpi->oxcf.mr_down_sampling_factor.num;
+ parent_mb_index = parent_mb_row*cpi->mr_low_res_mb_cols + parent_mb_col;
+ }
+
+ /* Read lower-resolution mode & motion result from memory.*/
+ *parent_ref_frame = store_mode_info[parent_mb_index].ref_frame;
+ *parent_mode = store_mode_info[parent_mb_index].mode;
+ *dissim = store_mode_info[parent_mb_index].dissim;
+
+ /* For the highest-resolution encoder, halve the dissim value, trading
+ * a little quality for speed. */
+ if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1))
+ *dissim>>=1;
+
+ if(*parent_ref_frame != INTRA_FRAME)
+ {
+ /* Consider different down_sampling_factor.
+ * The result can be rounded to be more precise, but it takes more time.
+ */
+ //int round = cpi->oxcf.mr_down_sampling_factor.den/2;
+ (*parent_ref_mv).as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row
+ *cpi->oxcf.mr_down_sampling_factor.num
+ /cpi->oxcf.mr_down_sampling_factor.den;
+ (*parent_ref_mv).as_mv.col = store_mode_info[parent_mb_index].mv.as_mv.col
+ *cpi->oxcf.mr_down_sampling_factor.num
+ /cpi->oxcf.mr_down_sampling_factor.den;
+
+ vp8_clamp_mv2(parent_ref_mv, xd);
+ }
+}
+#endif
+
+
void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int recon_uvoffset, int *returnrate,
- int *returndistortion, int *returnintra)
+ int *returndistortion, int *returnintra, int mb_row,
+ int mb_col)
{
BLOCK *b = &x->block[0];
BLOCKD *d = &x->e_mbd.block[0];
@@ -422,34 +480,67 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int rate;
int rate2;
int distortion2;
- int bestsme;
- //int all_rds[MAX_MODES]; // Experimental debug code.
+ int bestsme = INT_MAX;
int best_mode_index = 0;
unsigned int sse = INT_MAX, best_sse = INT_MAX;
int_mv mvp;
+
int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
int saddone=0;
int sr=0; //search range obtained from mv_pred(). It uses step_param levels (0-7).
- int_mv nearest_mv[4];
- int_mv near_mv[4];
- int_mv frame_best_ref_mv[4];
- int MDCounts[4][4];
unsigned char *y_buffer[4];
unsigned char *u_buffer[4];
unsigned char *v_buffer[4];
+ int i;
+ int ref_frame_map[4];
+ int sign_bias = 0;
- int skip_mode[4] = {0, 0, 0, 0};
- int found_near_mvs[4] = {0, 0, 0, 0};
+ int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode,
+ when Speed >= 15, no sub-pixel search. */
- int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode, when Speed >= 15, no sub-pixel search. */
+#if CONFIG_MULTI_RES_ENCODING
+ int dissim = INT_MAX;
+ int parent_ref_frame = 0;
+ int_mv parent_ref_mv;
+ MB_PREDICTION_MODE parent_mode = 0;
+
+ if (cpi->oxcf.mr_encoder_id)
+ get_lower_res_motion_info(cpi, xd, &dissim, &parent_ref_frame,
+ &parent_mode, &parent_ref_mv, mb_row, mb_col);
+#endif
vpx_memset(mode_mv, 0, sizeof(mode_mv));
- vpx_memset(nearest_mv, 0, sizeof(nearest_mv));
- vpx_memset(near_mv, 0, sizeof(near_mv));
vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+ /* Setup search priorities */
+ i=0;
+ ref_frame_map[i++] = INTRA_FRAME;
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ ref_frame_map[i++] = LAST_FRAME;
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ ref_frame_map[i++] = GOLDEN_FRAME;
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG) // &&(cpi->source_alt_ref_active || cpi->oxcf.number_of_layers > 1)
+ ref_frame_map[i++] = ALTREF_FRAME;
+ for(; i<4; i++)
+ ref_frame_map[i] = -1;
+
+ /* Check to see if there is at least one valid reference frame for which
+ * we need to calculate near_mvs.
+ */
+ if (ref_frame_map[1] > 0)
+ {
+ vp8_find_near_mvs(&x->e_mbd,
+ x->e_mbd.mode_info_context,
+ &mode_mv[NEARESTMV], &mode_mv[NEARMV],
+ &best_ref_mv,
+ mdcounts,
+ ref_frame_map[1],
+ cpi->common.ref_frame_sign_bias);
+
+ sign_bias = cpi->common.ref_frame_sign_bias[ref_frame_map[1]];
+ }
// set up all the refframe dependent pointers.
if (cpi->ref_frame_flags & VP8_LAST_FLAG)
@@ -459,8 +550,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset;
}
- else
- skip_mode[LAST_FRAME] = 1;
if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
{
@@ -469,21 +558,16 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset;
}
- else
- skip_mode[GOLDEN_FRAME] = 1;
- if ((cpi->ref_frame_flags & VP8_ALT_FLAG) &&
- (cpi->source_alt_ref_active || cpi->oxcf.number_of_layers > 1))
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
{
YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];
y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset;
}
- else
- skip_mode[ALTREF_FRAME] = 1;
- cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame
+ cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame
*returnintra = INT_MAX;
x->skip = 0;
@@ -496,50 +580,98 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
{
int frame_cost;
int this_rd = INT_MAX;
+ int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
if (best_rd <= cpi->rd_threshes[mode_index])
continue;
- x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index];
-
- if (skip_mode[x->e_mbd.mode_info_context->mbmi.ref_frame])
+ if (this_ref_frame < 0)
continue;
- // Check to see if the testing frequency for this mode is at its max
- // If so then prevent it from being tested and increase the threshold for its testing
- if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+
+#if CONFIG_MULTI_RES_ENCODING
+ if (cpi->oxcf.mr_encoder_id)
+ {
+ /* If parent MB is intra, child MB is intra. */
+ if (!parent_ref_frame && this_ref_frame)
+ continue;
+
+ /* If parent MB is inter, and it is unlikely there are multiple
+ * objects in parent MB, we use parent ref frame as child MB's
+ * ref frame. */
+ if (parent_ref_frame && dissim < 8
+ && parent_ref_frame != this_ref_frame)
+ continue;
+ }
+#endif
+
+ // everything but intra
+ if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+ {
+ x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
+ x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
+ x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+ if (sign_bias !=
+ cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame])
+ {
+ mode_mv[NEARESTMV].as_mv.row *= -1;
+ mode_mv[NEARESTMV].as_mv.col *= -1;
+ mode_mv[NEARMV].as_mv.row *= -1;
+ mode_mv[NEARMV].as_mv.col *= -1;
+ best_ref_mv.as_mv.row *= -1;
+ best_ref_mv.as_mv.col *= -1;
+ sign_bias
+ = cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame];
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+ if (cpi->oxcf.mr_encoder_id)
+ {
+ if (vp8_mode_order[mode_index] == NEARESTMV &&
+ mode_mv[NEARESTMV].as_int ==0)
+ continue;
+ if (vp8_mode_order[mode_index] == NEARMV &&
+ mode_mv[NEARMV].as_int ==0)
+ continue;
+
+ if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV
+ && best_ref_mv.as_int==0) //&& dissim==0
+ continue;
+ else if(vp8_mode_order[mode_index] == NEWMV && dissim==0
+ && best_ref_mv.as_int==parent_ref_mv.as_int)
+ continue;
+ }
+#endif
+ }
+
+ /* Check to see if the testing frequency for this mode is at its max.
+ * If so, prevent it from being tested and increase the threshold for
+ * its testing */
+ if (cpi->mode_test_hit_counts[mode_index] &&
+ (cpi->mode_check_freq[mode_index] > 1))
{
- //if ( (cpi->mbs_tested_so_far / cpi->mode_test_hit_counts[mode_index]) <= cpi->mode_check_freq[mode_index] )
- if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index]))
+ if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] *
+ cpi->mode_test_hit_counts[mode_index]))
{
- // Increase the threshold for coding this mode to make it less likely to be chosen
+ /* Increase the threshold for coding this mode to make it less
+ * likely to be chosen */
cpi->rd_thresh_mult[mode_index] += 4;
if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
- cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
-
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
continue;
}
}
- // If nearby MVs haven't been found for this reference frame then do it now.
- if (x->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
- !found_near_mvs[x->e_mbd.mode_info_context->mbmi.ref_frame])
- {
- int ref_frame = x->e_mbd.mode_info_context->mbmi.ref_frame;
- vp8_find_near_mvs(&x->e_mbd,
- x->e_mbd.mode_info_context,
- &nearest_mv[ref_frame], &near_mv[ref_frame],
- &frame_best_ref_mv[ref_frame],
- MDCounts[ref_frame],
- ref_frame,
- cpi->common.ref_frame_sign_bias);
- found_near_mvs[ref_frame] = 1;
- }
-
- // We have now reached the point where we are going to test the current mode so increment the counter for the number of times it has been tested
+ /* We have now reached the point where we are going to test the current
+ * mode, so increment the counter for the number of times it has been
+ * tested */
cpi->mode_test_hit_counts[mode_index] ++;
rate2 = 0;
@@ -547,42 +679,28 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
this_mode = vp8_mode_order[mode_index];
- // Experimental debug code.
- //all_rds[mode_index] = -1;
-
x->e_mbd.mode_info_context->mbmi.mode = this_mode;
x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
- // Work out the cost assosciated with selecting the reference frame
+ /* Work out the cost associated with selecting the reference frame */
frame_cost =
x->e_mbd.ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
rate2 += frame_cost;
- // everything but intra
- if (x->e_mbd.mode_info_context->mbmi.ref_frame)
- {
- x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
- x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
- x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
- mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
- mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
- best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
- memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
- }
-
- // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
- // unless ARNR filtering is enabled in which case we want
- // an unfiltered alternative
+ /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ * unless ARNR filtering is enabled in which case we want
+ * an unfiltered alternative */
if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
{
- if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
+ if (this_mode != ZEROMV ||
+ x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
continue;
}
switch (this_mode)
{
case B_PRED:
- // Pass best so far to pick_intra4x4mby_modes to use as breakout
+ /* Pass best so far to pick_intra4x4mby_modes to use as breakout */
distortion2 = best_sse;
pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2);
@@ -641,10 +759,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int sadpb = x->sadperbit16;
int_mv mvp_full;
- int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7)?1:0);
- int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7)?1:0);
- int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL;
- int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL;
+ int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+ int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+ int col_max = (best_ref_mv.as_mv.col>>3)
+ + MAX_FULL_PEL_VAL;
+ int row_max = (best_ref_mv.as_mv.row>>3)
+ + MAX_FULL_PEL_VAL;
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
@@ -656,110 +776,156 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
// Further step/diamond searches as necessary
step_param = cpi->sf.first_step + speed_adjust;
- if(cpi->sf.improved_mv_pred)
+#if CONFIG_MULTI_RES_ENCODING
+ if (cpi->oxcf.mr_encoder_id)
{
- if(!saddone)
- {
- vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
- saddone = 1;
- }
-
- vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
- x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
-
- sr += speed_adjust;
- //adjust search range according to sr from mv prediction
- if(sr > step_param)
- step_param = sr;
-
- mvp_full.as_mv.col = mvp.as_mv.col>>3;
- mvp_full.as_mv.row = mvp.as_mv.row>>3;
-
+ // Use parent MV as predictor. Adjust search range accordingly.
+ mvp.as_int = parent_ref_mv.as_int;
+ mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3;
+ mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3;
+
+ if(dissim <=32) step_param += 3;
+ else if(dissim <=128) step_param += 2;
+ else step_param += 1;
}else
+#endif
{
- mvp.as_int = best_ref_mv.as_int;
- mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3;
- mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3;
- }
+ if(cpi->sf.improved_mv_pred)
+ {
+ if(!saddone)
+ {
+ vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+ saddone = 1;
+ }
- // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
- if (x->mv_col_min < col_min )
- x->mv_col_min = col_min;
- if (x->mv_col_max > col_max )
- x->mv_col_max = col_max;
- if (x->mv_row_min < row_min )
- x->mv_row_min = row_min;
- if (x->mv_row_max > row_max )
- x->mv_row_max = row_max;
+ vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context,
+ &mvp,x->e_mbd.mode_info_context->mbmi.ref_frame,
+ cpi->common.ref_frame_sign_bias, &sr,
+ &near_sadidx[0]);
- further_steps = (cpi->Speed >= 8)? 0: (cpi->sf.max_step_search_steps - 1 - step_param);
+ sr += speed_adjust;
+ //adjust search range according to sr from mv prediction
+ if(sr > step_param)
+ step_param = sr;
- if (cpi->sf.search_method == HEX)
- {
- bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param,
- sadpb, &cpi->fn_ptr[BLOCK_16X16],
- x->mvsadcost, x->mvcost, &best_ref_mv);
- mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ mvp_full.as_mv.col = mvp.as_mv.col>>3;
+ mvp_full.as_mv.row = mvp.as_mv.row>>3;
+ }else
+ {
+ mvp.as_int = best_ref_mv.as_int;
+ mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3;
+ mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3;
+ }
}
- else
+
+#if CONFIG_MULTI_RES_ENCODING
+ if (cpi->oxcf.mr_encoder_id && dissim <= 2 &&
+ MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
+ abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4)
{
- bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv,
- step_param, sadpb, &num00,
- &cpi->fn_ptr[BLOCK_16X16],
- x->mvcost, &best_ref_mv);
- mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ d->bmi.mv.as_int = mvp_full.as_int;
+ mode_mv[NEWMV].as_int = mvp_full.as_int;
- // Further step/diamond searches as necessary
- n = 0;
- //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ cpi->mb.mvcost,
+ &distortion2,&sse);
+ }else
+#endif
+ {
+ /* Get intersection of UMV window and valid MV window to
+ * reduce # of checks in diamond search. */
+ if (x->mv_col_min < col_min )
+ x->mv_col_min = col_min;
+ if (x->mv_col_max > col_max )
+ x->mv_col_max = col_max;
+ if (x->mv_row_min < row_min )
+ x->mv_row_min = row_min;
+ if (x->mv_row_max > row_max )
+ x->mv_row_max = row_max;
+
+ further_steps = (cpi->Speed >= 8)?
+ 0: (cpi->sf.max_step_search_steps - 1 - step_param);
+
+ if (cpi->sf.search_method == HEX)
+ {
+#if CONFIG_MULTI_RES_ENCODING
+ /* TODO: In higher-res pick_inter_mode, step_param is used to
+ * modify the hex search range. Here, set step_param to 0 so the
+ * behavior in the lowest-resolution encoder is unchanged.
+ * Will improve it later.
+ */
+ if (!cpi->oxcf.mr_encoder_id)
+ step_param = 0;
+#endif
+ bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv,
+ step_param, sadpb,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvsadcost, x->mvcost, &best_ref_mv);
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+ &d->bmi.mv, step_param, sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
- n = num00;
- num00 = 0;
+ // Further step/diamond searches as necessary
+ n = 0;
+ //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
- while (n < further_steps)
- {
- n++;
+ n = num00;
+ num00 = 0;
- if (num00)
- num00--;
- else
+ while (n < further_steps)
{
- thissme =
- cpi->diamond_search_sad(x, b, d, &mvp_full,
- &d->bmi.mv,
- step_param + n,
- sadpb, &num00,
- &cpi->fn_ptr[BLOCK_16X16],
- x->mvcost, &best_ref_mv);
- if (thissme < bestsme)
- {
- bestsme = thissme;
- mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
- }
+ n++;
+
+ if (num00)
+ num00--;
else
{
- d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+ thissme =
+ cpi->diamond_search_sad(x, b, d, &mvp_full,
+ &d->bmi.mv,
+ step_param + n,
+ sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+ }
}
}
}
- }
- x->mv_col_min = tmp_col_min;
- x->mv_col_max = tmp_col_max;
- x->mv_row_min = tmp_row_min;
- x->mv_row_max = tmp_row_max;
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
- if (bestsme < INT_MAX)
- cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv,
- x->errorperbit,
+ if (bestsme < INT_MAX)
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv,
+ &best_ref_mv, x->errorperbit,
&cpi->fn_ptr[BLOCK_16X16],
cpi->mb.mvcost,
&distortion2,&sse);
+ }
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
// mv cost;
- rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128);
+ rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
+ cpi->mb.mvcost, 128);
}
case NEARESTMV:
@@ -770,18 +936,23 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
case ZEROMV:
- // Trap vectors that reach beyond the UMV borders
- // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point
- // because of the lack of break statements in the previous two cases.
- if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
- ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+ /* Trap vectors that reach beyond the UMV borders
+ * Note that ALL New MV, Nearest MV, Near MV and Zero MV code drops
+ * through to this point because of the lack of break statements
+ * in the previous two cases.
+ */
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+ ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
continue;
rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
x->e_mbd.mode_info_context->mbmi.mv.as_int =
mode_mv[this_mode].as_int;
- /* Exit early and don't compute the distortion if this macroblock is marked inactive. */
+ /* Exit early and don't compute the distortion if this macroblock
+ * is marked inactive. */
if (cpi->active_map_enabled && x->active_ptr[0] == 0)
{
sse = 0;
@@ -816,9 +987,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
break;
}
- // Experimental debug code.
- //all_rds[mode_index] = this_rd;
-
if (this_rd < best_rd || x->skip)
{
// Note index of best mode
@@ -828,14 +996,23 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
*returndistortion = distortion2;
best_sse = sse;
best_rd = this_rd;
- vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
-
- // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
- cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
- cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
+
+ /* Testing this mode gave rise to an improvement in best error
+ * score. Lower threshold a bit for next time
+ */
+ cpi->rd_thresh_mult[mode_index] =
+ (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+ cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
}
- // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
+ /* If the mode did not help improve the best error case then raise the
+ * threshold for testing that mode next time around.
+ */
else
{
cpi->rd_thresh_mult[mode_index] += 4;
@@ -843,7 +1020,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
- cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
}
if (x->skip)
@@ -855,8 +1034,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
{
int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3);
- cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
- cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+ cpi->rd_thresh_mult[best_mode_index] =
+ (cpi->rd_thresh_mult[best_mode_index]
+ >= (MIN_THRESHMULT + best_adjustment)) ?
+ cpi->rd_thresh_mult[best_mode_index] - best_adjustment :
+ MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] =
+ (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+ cpi->rd_thresh_mult[best_mode_index];
}
@@ -879,15 +1064,17 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
- (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+ (cpi->common.mb_no_coeff_skip);
x->e_mbd.mode_info_context->mbmi.partitioning = 0;
return;
}
- /* set to the best mb mode, this copy can be skip if x->skip since it already has the right content */
+ /* set to the best mb mode; this copy can be skipped if x->skip, since
+ * it already has the right content */
if (!x->skip)
- vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+ vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
+ sizeof(MB_MODE_INFO));
if (best_mbmode.mode <= B_PRED)
{
@@ -895,7 +1082,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
pick_intra_mbuv_mode(x);
}
- update_mvcount(cpi, &x->e_mbd, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);
+ if (sign_bias
+ != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+ {
+ best_ref_mv.as_mv.row *= -1;
+ best_ref_mv.as_mv.col *= -1;
+ }
+
+ update_mvcount(cpi, &x->e_mbd, &best_ref_mv);
}
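
get_lower_res_motion_info() above locates the co-located macroblock of the lower-resolution pass with a round-to-nearest division and scales its motion vector back up before clamping. A worked sketch with illustrative values (a 2:1 downsample, i.e. mr_down_sampling_factor num=2, den=1, and an assumed 40-MB-wide low-res frame):

/* Illustrative inputs; field names follow the patch. */
int num = 2, den = 1;            /* mr_down_sampling_factor.num/.den */
int mr_low_res_mb_cols = 40;     /* assumed low-res frame width in MBs */
int mb_row = 5, mb_col = 9;      /* child MB position */

int round = num / 2;             /* = 1: round to nearest, not truncate */
int parent_mb_row = (mb_row * den + round) / num;   /* (5*1+1)/2 = 3 */
int parent_mb_col = (mb_col * den + round) / num;   /* (9*1+1)/2 = 5 */
int parent_mb_index = parent_mb_row * mr_low_res_mb_cols + parent_mb_col;
/* -> entry 3*40 + 5 = 125 of the shared LOWER_RES_INFO array */

/* The parent's MV is then scaled back up by num/den (here, doubled)
 * and clamped by vp8_clamp_mv2() before use as the search predictor. */

Relatedly, the new search-window clamp ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL computes ceil(col/8) with a single shift; for the two's-complement arithmetic shifts used here it is equivalent to the old (col>>3) + ((col & 7) ? 1 : 0) form.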
diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
index 1c5d6a6e6..3d83782b5 100644
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h
@@ -14,6 +14,10 @@
#include "vpx_config.h"
#include "vp8/common/onyxc_int.h"
-extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+ int recon_uvoffset, int *returnrate,
+ int *returndistortion, int *returnintra,
+ int mb_row, int mb_col);
extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
+
#endif
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index c1e5f7797..2449ae540 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -152,9 +152,10 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
int filt_val;
int best_filt_val = cm->filter_level;
+ YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
- // Make a copy of the unfiltered / processed recon buffer
- vp8_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+ /* Replace unfiltered frame buffer with a new one */
+ cm->frame_to_show = &cpi->pick_lf_lvl_frame;
if (cm->frame_type == KEY_FRAME)
cm->sharpness_level = 0;
@@ -177,26 +178,26 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
best_filt_val = filt_val;
// Get the err using the previous frame's filter value.
- vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
- best_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
+ /* Copy the unfiltered / processed recon buffer to the new buffer */
+ vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show);
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
- // Re-instate the unfiltered frame
- vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ best_err = calc_partial_ssl_err(sd, cm->frame_to_show,
+ IF_RTCD(&cpi->rtcd.variance));
- filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+ filt_val -= 1 + (filt_val > 10);
// Search lower filter levels
while (filt_val >= min_filter_level)
{
// Apply the loop filter
+ vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
// Get the err for filtered frame
- filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
-
- // Re-instate the unfiltered frame
- vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ filt_err = calc_partial_ssl_err(sd, cm->frame_to_show,
+ IF_RTCD(&cpi->rtcd.variance));
// Update the best case record or exit loop.
if (filt_err < best_err)
@@ -208,11 +209,11 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
break;
// Adjust filter level
- filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+ filt_val -= 1 + (filt_val > 10);
}
// Search up (note that we have already done filt_val = cm->filter_level)
- filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
+ filt_val = cm->filter_level + 1 + (filt_val > 10);
if (best_filt_val == cm->filter_level)
{
@@ -222,13 +223,13 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
while (filt_val < max_filter_level)
{
// Apply the loop filter
+ vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show);
+
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
// Get the err for filtered frame
- filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
-
- // Re-instate the unfiltered frame
- vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ filt_err = calc_partial_ssl_err(sd, cm->frame_to_show,
+ IF_RTCD(&cpi->rtcd.variance));
// Update the best case record or exit loop.
if (filt_err < best_err)
@@ -242,7 +243,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
break;
// Adjust filter level
- filt_val += (1 + ((filt_val > 10) ? 1 : 0));
+ filt_val += 1 + (filt_val > 10);
}
}
@@ -253,6 +254,9 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
if (cm->filter_level > max_filter_level)
cm->filter_level = max_filter_level;
+
+ /* restore unfiltered frame pointer */
+ cm->frame_to_show = saved_frame;
}
// Stub function for now Alt LF not used
@@ -283,10 +287,16 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
int filt_best;
int filt_direction = 0;
- int Bias = 0; // Bias against raising loop filter and in favour of lowering it
+ int Bias = 0; // Bias against raising loop filter and in favor of lowering it
- // Make a copy of the unfiltered / processed recon buffer
- vp8_yv12_copy_y_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+ int ss_err[MAX_LOOP_FILTER + 1];
+
+ YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
+
+ vpx_memset(ss_err, 0, sizeof(ss_err));
+
+ /* Replace unfiltered frame buffer with a new one */
+ cm->frame_to_show = &cpi->pick_lf_lvl_frame;
if (cm->frame_type == KEY_FRAME)
cm->sharpness_level = 0;
@@ -305,14 +315,19 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
// Get baseline error score
+
+ /* Copy the unfiltered / processed recon buffer to the new buffer */
+ vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
+
vp8cx_set_alt_lf_level(cpi, filt_mid);
vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
- best_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
- filt_best = filt_mid;
+ best_err = vp8_calc_ss_err(sd, cm->frame_to_show,
+ IF_RTCD(&cpi->rtcd.variance));
- // Re-instate the unfiltered frame
- vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ ss_err[filt_mid] = best_err;
+
+ filt_best = filt_mid;
while (filter_step > 0)
{
@@ -327,14 +342,19 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
if ((filt_direction <= 0) && (filt_low != filt_mid))
{
- // Get Low filter error score
- vp8cx_set_alt_lf_level(cpi, filt_low);
- vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
-
- filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
-
- // Re-instate the unfiltered frame
- vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ if(ss_err[filt_low] == 0)
+ {
+ // Get Low filter error score
+ vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
+ vp8cx_set_alt_lf_level(cpi, filt_low);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
+
+ filt_err = vp8_calc_ss_err(sd, cm->frame_to_show,
+ IF_RTCD(&cpi->rtcd.variance));
+ ss_err[filt_low] = filt_err;
+ }
+ else
+ filt_err = ss_err[filt_low];
// If value is close to the best so far then bias towards a lower loop filter value.
if ((filt_err - Bias) < best_err)
@@ -350,13 +370,18 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
// Now look at filt_high
if ((filt_direction >= 0) && (filt_high != filt_mid))
{
- vp8cx_set_alt_lf_level(cpi, filt_high);
- vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
-
- filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
+ if(ss_err[filt_high] == 0)
+ {
+ vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
+ vp8cx_set_alt_lf_level(cpi, filt_high);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
- // Re-instate the unfiltered frame
- vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ filt_err = vp8_calc_ss_err(sd, cm->frame_to_show,
+ IF_RTCD(&cpi->rtcd.variance));
+ ss_err[filt_high] = filt_err;
+ }
+ else
+ filt_err = ss_err[filt_high];
// Was it better than the previous best?
if (filt_err < (best_err - Bias))
@@ -380,4 +405,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
}
cm->filter_level = filt_best;
+
+ /* restore unfiltered frame pointer */
+ cm->frame_to_show = saved_frame;
}
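
Both filter-level searches above now redirect cm->frame_to_show to a scratch buffer (cpi->pick_lf_lvl_frame) and copy the saved unfiltered frame in before each trial, instead of filtering in place and copying back afterwards; vp8cx_pick_filter_level additionally memoizes per-level errors in ss_err[] so no level is filtered twice. A minimal sketch of one memoized trial, assuming the hypothetical helper name trial_ss_err (the calls inside are the ones the patch uses):

static int trial_ss_err(VP8_COMP *cpi, VP8_COMMON *cm,
                        YV12_BUFFER_CONFIG *sd,
                        YV12_BUFFER_CONFIG *saved_frame,
                        int filt_level, int ss_err[])
{
    if (ss_err[filt_level] == 0)  /* level not measured yet */
    {
        /* Refresh the scratch buffer with the unfiltered frame, filter
         * it at the trial level, and score it against the source. */
        vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
        vp8cx_set_alt_lf_level(cpi, filt_level);
        vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_level);
        ss_err[filt_level] = vp8_calc_ss_err(sd, cm->frame_to_show,
                                             IF_RTCD(&cpi->rtcd.variance));
    }
    return ss_err[filt_level];
}

Note the zero entry doubles as the "not yet measured" sentinel, which (like the patch itself) assumes a measured level never scores exactly zero.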
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index e57a26430..ce04212e6 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -436,7 +436,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
int quant_val;
int Q;
- int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
+ int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44,
+ 44, 44};
for (Q = 0; Q < QINDEX_RANGE; Q++)
{
@@ -469,36 +470,61 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
// all the ac values
- for (i = 1; i < 16; i++)
+ quant_val = vp8_ac_yquant(Q);
+ cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1,
+ cpi->Y1quant_shift[Q] + 1, quant_val);
+ cpi->Y1zbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][1] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+ quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+ cpi->Y2quant_fast[Q][1] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 1,
+ cpi->Y2quant_shift[Q] + 1, quant_val);
+ cpi->Y2zbin[Q][1] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][1] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][1] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+ quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ cpi->UVquant_fast[Q][1] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 1,
+ cpi->UVquant_shift[Q] + 1, quant_val);
+ cpi->UVzbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][1] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+ for (i = 2; i < 16; i++)
{
- int rc = vp8_default_zig_zag1d[i];
-
- quant_val = vp8_ac_yquant(Q);
- cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val;
- invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,
- cpi->Y1quant_shift[Q] + rc, quant_val);
- cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
- cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.Y1dequant[Q][rc] = quant_val;
- cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
- quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
- cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
- invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
- cpi->Y2quant_shift[Q] + rc, quant_val);
- cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
- cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
- cpi->common.Y2dequant[Q][rc] = quant_val;
- cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
- quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
- cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
- invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
- cpi->UVquant_shift[Q] + rc, quant_val);
- cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
- cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.UVdequant[Q][rc] = quant_val;
- cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+ cpi->Y1quant_fast[Q][i] = cpi->Y1quant_fast[Q][1];
+ cpi->Y1quant[Q][i] = cpi->Y1quant[Q][1];
+ cpi->Y1quant_shift[Q][i] = cpi->Y1quant_shift[Q][1];
+ cpi->Y1zbin[Q][i] = cpi->Y1zbin[Q][1];
+ cpi->Y1round[Q][i] = cpi->Y1round[Q][1];
+ cpi->common.Y1dequant[Q][i] = cpi->common.Y1dequant[Q][1];
+ cpi->zrun_zbin_boost_y1[Q][i] = (cpi->common.Y1dequant[Q][1] *
+ zbin_boost[i]) >> 7;
+
+ cpi->Y2quant_fast[Q][i] = cpi->Y2quant_fast[Q][1];
+ cpi->Y2quant[Q][i] = cpi->Y2quant[Q][1];
+ cpi->Y2quant_shift[Q][i] = cpi->Y2quant_shift[Q][1];
+ cpi->Y2zbin[Q][i] = cpi->Y2zbin[Q][1];
+ cpi->Y2round[Q][i] = cpi->Y2round[Q][1];
+ cpi->common.Y2dequant[Q][i] = cpi->common.Y2dequant[Q][1];
+ cpi->zrun_zbin_boost_y2[Q][i] = (cpi->common.Y2dequant[Q][1] *
+ zbin_boost[i]) >> 7;
+
+ cpi->UVquant_fast[Q][i] = cpi->UVquant_fast[Q][1];
+ cpi->UVquant[Q][i] = cpi->UVquant[Q][1];
+ cpi->UVquant_shift[Q][i] = cpi->UVquant_shift[Q][1];
+ cpi->UVzbin[Q][i] = cpi->UVzbin[Q][1];
+ cpi->UVround[Q][i] = cpi->UVround[Q][1];
+ cpi->common.UVdequant[Q][i] = cpi->common.UVdequant[Q][1];
+ cpi->zrun_zbin_boost_uv[Q][i] = (cpi->common.UVdequant[Q][1] *
+ zbin_boost[i]) >> 7;
}
}
}
@@ -609,6 +635,9 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
/* This initialization should be called at least once. Use ok_to_skip to
* decide if it is ok to skip.
+ * Before encoding a frame, this function is always called with
+ * ok_to_skip=0, which means no skipping of calculations. The "last"
+ * values are initialized at that time.
*/
if (!ok_to_skip || QIndex != x->q_index)
{
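
The rewritten loop above exploits the fact that every AC position (1..15) of a VP8 block shares one quantizer value: position 1 is computed once and copied to positions 2..15, which also makes the vp8_default_zig_zag1d[] indirection unnecessary. Only the zero-run zbin boost stays position-dependent. A condensed sketch for the Y1 tables (the Y2 and UV tables follow identically, per the hunks above):

/* Position 1 is filled the usual way... */
quant_val = vp8_ac_yquant(Q);
cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val;
cpi->Y1zbin[Q][1]  = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
cpi->common.Y1dequant[Q][1] = quant_val;

/* ...then propagated: the copied value is identical at every AC
 * position, so scan order no longer matters. */
for (i = 2; i < 16; i++)
{
    cpi->Y1quant_fast[Q][i]     = cpi->Y1quant_fast[Q][1];
    cpi->Y1zbin[Q][i]           = cpi->Y1zbin[Q][1];
    cpi->Y1round[Q][i]          = cpi->Y1round[Q][1];
    cpi->common.Y1dequant[Q][i] = cpi->common.Y1dequant[Q][1];
    /* Only the zero-run boost varies with position i. */
    cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
}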
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index dc6feb980..1c43c1171 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -28,7 +28,6 @@
#define MAX_BPB_FACTOR 50
extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
-extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];
@@ -305,6 +304,8 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
// Setup for Key frame:
vp8_default_coef_probs(& cpi->common);
+
+
vp8_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
@@ -315,6 +316,12 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc)); //initialize pre_mvc to all zero.
+ // Make sure we initialize separate contexts for altref, gold, and normal.
+ // TODO shouldn't need 3 different copies of the structure to do this!
+ vpx_memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+ vpx_memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
+ vpx_memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
+
//cpi->common.filter_level = 0; // Reset every key frame.
cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ;
@@ -325,8 +332,8 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
else
cpi->frames_till_gf_update_due = cpi->goldfreq;
- cpi->common.refresh_golden_frame = TRUE;
- cpi->common.refresh_alt_ref_frame = TRUE;
+ cpi->common.refresh_golden_frame = 1;
+ cpi->common.refresh_alt_ref_frame = 1;
}
@@ -464,7 +471,7 @@ static void calc_gf_params(VP8_COMP *cpi)
if (cpi->pass != 2)
{
// Single Pass lagged mode: TBD
- if (FALSE)
+ if (0)
{
}
@@ -591,14 +598,14 @@ static void calc_gf_params(VP8_COMP *cpi)
if (cpi->pass != 2)
{
// For now Alt ref is not allowed except in 2 pass modes.
- cpi->source_alt_ref_pending = FALSE;
+ cpi->source_alt_ref_pending = 0;
/*if ( cpi->oxcf.fixed_q == -1)
{
if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) )
- cpi->source_alt_ref_pending = TRUE;
+ cpi->source_alt_ref_pending = 1;
else
- cpi->source_alt_ref_pending = FALSE;
+ cpi->source_alt_ref_pending = 0;
}*/
}
}
@@ -933,6 +940,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
if (cpi->active_worst_quality <= cpi->active_best_quality)
cpi->active_worst_quality = cpi->active_best_quality + 1;
+ if(cpi->active_worst_quality > 127)
+ cpi->active_worst_quality = 127;
}
// Unbuffered mode (eg. video conferencing)
else
@@ -973,7 +982,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
#endif
//vpx_log("Decoder: Drop frame due to bandwidth: %d \n",cpi->buffer_level, cpi->av_per_frame_bandwidth);
- cpi->drop_frame = TRUE;
+ cpi->drop_frame = 1;
}
#if 0
@@ -981,7 +990,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
else if ((cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) &&
(cpi->drop_count < cpi->max_drop_count) && (cpi->pass == 0))
{
- cpi->drop_frame = TRUE;
+ cpi->drop_frame = 1;
}
#endif
@@ -1027,11 +1036,11 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
{
// For one pass, throw a GF if recent frame intra usage is low or the GF usage is high
if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
- cpi->common.refresh_golden_frame = TRUE;
+ cpi->common.refresh_golden_frame = 1;
// Two pass GF decision
else if (cpi->pass == 2)
- cpi->common.refresh_golden_frame = TRUE;
+ cpi->common.refresh_golden_frame = 1;
}
#if 0
@@ -1049,7 +1058,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
#endif
- if (cpi->common.refresh_golden_frame == TRUE)
+ if (cpi->common.refresh_golden_frame == 1)
{
#if 0
@@ -1534,7 +1543,7 @@ int vp8_pick_frame_size(VP8_COMP *cpi)
// Check if we're dropping the frame:
if (cpi->drop_frame)
{
- cpi->drop_frame = FALSE;
+ cpi->drop_frame = 0;
cpi->drop_count++;
return 0;
}
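
The pickinter.c hunks above and the rdopt.c hunks below replace the per-reference near-MV tables with a single vp8_find_near_mvs() call for the first available reference, flipping MV signs on the fly whenever a later reference uses the opposite sign bias. A sketch of that flip, using the fields from this patch:

/* near MVs were computed once for ref_frame_map[1]; when the current
 * reference disagrees on sign bias, negate them in place and remember
 * the new bias so the next disagreement flips them back. */
if (sign_bias !=
    cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame])
{
    mode_mv[NEARESTMV].as_mv.row *= -1;
    mode_mv[NEARESTMV].as_mv.col *= -1;
    mode_mv[NEARMV].as_mv.row *= -1;
    mode_mv[NEARMV].as_mv.col *= -1;
    best_ref_mv.as_mv.row *= -1;
    best_ref_mv.as_mv.col *= -1;
    sign_bias =
        cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame];
}

pickinter.c applies the same flip once more before update_mvcount(), so MV statistics are accumulated under the sign convention of the reference frame actually chosen.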
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index e8abf848c..ce979619a 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -28,7 +28,6 @@
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/idct.h"
-#include "vp8/common/g_common.h"
#include "variance.h"
#include "mcomp.h"
#include "rdopt.h"
@@ -100,36 +99,39 @@ const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] =
B_PRED,
};
-const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES] =
+/* This table determines the reference frame search order, expressed as
+ * priority slots, which may not necessarily match INTRA, LAST, GOLDEN, ARF.
+ */
+const int vp8_ref_frame_order[MAX_MODES] =
{
- LAST_FRAME,
- INTRA_FRAME,
+ 1,
+ 0,
- LAST_FRAME,
- LAST_FRAME,
+ 1,
+ 1,
- GOLDEN_FRAME,
- GOLDEN_FRAME,
+ 2,
+ 2,
- ALTREF_FRAME,
- ALTREF_FRAME,
+ 3,
+ 3,
- GOLDEN_FRAME,
- ALTREF_FRAME,
+ 2,
+ 3,
- INTRA_FRAME,
- INTRA_FRAME,
- INTRA_FRAME,
+ 0,
+ 0,
+ 0,
- LAST_FRAME,
- GOLDEN_FRAME,
- ALTREF_FRAME,
+ 1,
+ 2,
+ 3,
- LAST_FRAME,
- GOLDEN_FRAME,
- ALTREF_FRAME,
+ 1,
+ 2,
+ 3,
- INTRA_FRAME,
+ 0,
};
static void fill_token_costs(
@@ -285,18 +287,39 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
}
}
- fill_token_costs(
- cpi->mb.token_costs,
- (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs
- );
+ {
+ // build token cost array for the type of frame we have now
+ FRAME_CONTEXT *l = &cpi->lfc_n;
+
+ if(cpi->common.refresh_alt_ref_frame)
+ l = &cpi->lfc_a;
+ else if(cpi->common.refresh_golden_frame)
+ l = &cpi->lfc_g;
+
+ fill_token_costs(
+ cpi->mb.token_costs,
+ (const vp8_prob( *)[8][3][11]) l->coef_probs
+ );
+ /*
+ fill_token_costs(
+ cpi->mb.token_costs,
+ (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs);
+ */
+
- vp8_init_mode_costs(cpi);
+ // TODO make these mode costs depend on last, alt or gold too. (jbb)
+ vp8_init_mode_costs(cpi);
+
+ // TODO figure out why making mv cost frame-type dependent didn't help (jbb)
+ //vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) l->mvc, flags);
+
+ }
}
void vp8_auto_select_speed(VP8_COMP *cpi)
{
- int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
+ int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);
milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
@@ -552,7 +575,7 @@ static void macro_block_yrd( MACROBLOCK *mb,
int d;
ENCODEMB_INVOKE(rtcd, submby)( mb->src_diff, *(mb->block[0].base_src),
- mb->e_mbd.predictor, mb->block[0].src_stride );
+ mb->block[0].src_stride, mb->e_mbd.predictor, 16);
// Fdct and building the 2nd order block
for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
@@ -800,7 +823,8 @@ static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
{
vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
- x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+ x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+ &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
vp8_transform_mbuv(x);
vp8_quantize_mbuv(x);
@@ -816,7 +840,8 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
{
vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
- x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+ x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+ &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
vp8_transform_mbuv(x);
vp8_quantize_mbuv(x);
@@ -845,8 +870,8 @@ static void rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int
RECON_INVOKE(&cpi->rtcd.common->recon, build_intra_predictors_mbuv)
(&x->e_mbd);
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
- x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor,
- x->src.uv_stride);
+ x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+ &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
vp8_transform_mbuv(x);
vp8_quantize_mbuv(x);
@@ -1359,8 +1384,8 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
if (bsi.segment_rd < best_rd)
{
- int col_min = (best_ref_mv->as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv->as_mv.col & 7)?1:0);
- int row_min = (best_ref_mv->as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv->as_mv.row & 7)?1:0);
+ int col_min = ((best_ref_mv->as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+ int row_min = ((best_ref_mv->as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
int col_max = (best_ref_mv->as_mv.col>>3) + MAX_FULL_PEL_VAL;
int row_max = (best_ref_mv->as_mv.row>>3) + MAX_FULL_PEL_VAL;
@@ -1458,57 +1483,6 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
return bsi.segment_rd;
}
-static void insertsortmv(int arr[], int len)
-{
- int i, j, k;
-
- for ( i = 1 ; i <= len-1 ; i++ )
- {
- for ( j = 0 ; j < i ; j++ )
- {
- if ( arr[j] > arr[i] )
- {
- int temp;
-
- temp = arr[i];
-
- for ( k = i; k >j; k--)
- arr[k] = arr[k - 1] ;
-
- arr[j] = temp ;
- }
- }
- }
-}
-
-static void insertsortsad(int arr[],int idx[], int len)
-{
- int i, j, k;
-
- for ( i = 1 ; i <= len-1 ; i++ )
- {
- for ( j = 0 ; j < i ; j++ )
- {
- if ( arr[j] > arr[i] )
- {
- int temp, tempi;
-
- temp = arr[i];
- tempi = idx[i];
-
- for ( k = i; k >j; k--)
- {
- arr[k] = arr[k - 1] ;
- idx[k] = idx[k - 1];
- }
-
- arr[j] = temp ;
- idx[j] = tempi;
- }
- }
- }
-}
-
//The improved MV prediction
void vp8_mv_pred
(
@@ -1741,7 +1715,9 @@ static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
}
}
-void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
+void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+ int recon_uvoffset, int *returnrate,
+ int *returndistortion, int *returnintra)
{
BLOCK *b = &x->block[0];
BLOCKD *d = &x->e_mbd.block[0];
@@ -1768,40 +1744,58 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
int distortion_uv;
int best_yrd = INT_MAX;
- //int all_rds[MAX_MODES]; // Experimental debug code.
- //int all_rates[MAX_MODES];
- //int all_dist[MAX_MODES];
- //int intermodecost[MAX_MODES];
-
MB_PREDICTION_MODE uv_intra_mode;
int_mv mvp;
int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
int saddone=0;
int sr=0; //search range obtained from mv_pred(). It uses step_param levels (0-7).
- int_mv frame_nearest_mv[4];
- int_mv frame_near_mv[4];
- int_mv frame_best_ref_mv[4];
- int frame_mdcounts[4][4];
int frame_lf_or_gf[4];
unsigned char *y_buffer[4];
unsigned char *u_buffer[4];
unsigned char *v_buffer[4];
+ int ref_frame_map[4];
+ int sign_bias = 0;
+ vpx_memset(mode_mv, 0, sizeof(mode_mv));
vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
+ /* Setup search priorities */
+ i=0;
+ ref_frame_map[i++] = INTRA_FRAME;
if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ ref_frame_map[i++] = LAST_FRAME;
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ ref_frame_map[i++] = GOLDEN_FRAME;
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ ref_frame_map[i++] = ALTREF_FRAME;
+ for(; i<4; i++)
+ ref_frame_map[i] = -1;
+
+ /* Check to see if there is at least one valid reference frame for which
+ * we need to calculate near_mvs.
+ */
+ if (ref_frame_map[1] > 0)
{
- YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+ vp8_find_near_mvs(&x->e_mbd,
+ x->e_mbd.mode_info_context,
+ &mode_mv[NEARESTMV], &mode_mv[NEARMV],
+ &best_ref_mv,
+ mdcounts,
+ ref_frame_map[1],
+ cpi->common.ref_frame_sign_bias);
+
+ sign_bias = cpi->common.ref_frame_sign_bias[ref_frame_map[1]];
+ }
- vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[LAST_FRAME], &frame_near_mv[LAST_FRAME],
- &frame_best_ref_mv[LAST_FRAME], frame_mdcounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ {
+ YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset;
-
frame_lf_or_gf[LAST_FRAME] = 0;
}
@@ -1809,13 +1803,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
{
YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx];
- vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[GOLDEN_FRAME], &frame_near_mv[GOLDEN_FRAME],
- &frame_best_ref_mv[GOLDEN_FRAME], frame_mdcounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
-
y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset;
-
frame_lf_or_gf[GOLDEN_FRAME] = 1;
}
@@ -1823,13 +1813,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
{
YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];
- vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[ALTREF_FRAME], &frame_near_mv[ALTREF_FRAME],
- &frame_best_ref_mv[ALTREF_FRAME], frame_mdcounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
-
y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset;
-
frame_lf_or_gf[ALTREF_FRAME] = 1;
}
@@ -1838,8 +1824,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
x->skip = 0;
- vpx_memset(mode_mv, 0, sizeof(mode_mv));
-
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);
uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
@@ -1850,18 +1834,15 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
int lf_or_gf = 0; // Last frame (0) or gf/arf (1)
int disable_skip = 0;
int other_cost = 0;
-
- // Experimental debug code.
- // Record of rd values recorded for this MB. -1 indicates not measured
- //all_rds[mode_index] = -1;
- //all_rates[mode_index] = -1;
- //all_dist[mode_index] = -1;
- //intermodecost[mode_index] = -1;
+ int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
// Test best rd so far against threshold for trying this mode.
if (best_rd <= cpi->rd_threshes[mode_index])
continue;
+ if (this_ref_frame < 0)
+ continue;
+
// These variables hold our rolling total cost and distortion for this mode
rate2 = 0;
distortion2 = 0;
@@ -1870,7 +1851,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
x->e_mbd.mode_info_context->mbmi.mode = this_mode;
x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
- x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index];
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
// Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
// unless ARNR filtering is enabled in which case we want
@@ -1887,10 +1868,20 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
- mode_mv[NEARESTMV] = frame_nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
- mode_mv[NEARMV] = frame_near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
- best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
- vpx_memcpy(mdcounts, frame_mdcounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
+
+ if (sign_bias !=
+ cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame])
+ {
+ mode_mv[NEARESTMV].as_mv.row *= -1;
+ mode_mv[NEARESTMV].as_mv.col *= -1;
+ mode_mv[NEARMV].as_mv.row *= -1;
+ mode_mv[NEARMV].as_mv.col *= -1;
+ best_ref_mv.as_mv.row *= -1;
+ best_ref_mv.as_mv.col *= -1;
+ sign_bias
+ = cpi->common.ref_frame_sign_bias[x->e_mbd.mode_info_context->mbmi.ref_frame];
+ }
+
lf_or_gf = frame_lf_or_gf[x->e_mbd.mode_info_context->mbmi.ref_frame];
}
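/* Illustration of the sign-bias handling above: each reference frame has a
 * sign bias, and when the bias of the current reference differs from the one
 * the cached MVs were found against, the vectors are negated in place rather
 * than recomputed. For example,
 *     mode_mv[NEARESTMV].as_mv = { row:  4, col: -8 }
 * becomes
 *     mode_mv[NEARESTMV].as_mv = { row: -4, col:  8 }
 * and sign_bias is updated so the flip happens at most once per change.
 */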
@@ -1918,13 +1909,13 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
// Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to suppress noise
if (cpi->zbin_mode_boost_enabled)
{
- if ( vp8_ref_frame_order[mode_index] == INTRA_FRAME )
+ if ( this_ref_frame == INTRA_FRAME )
cpi->zbin_mode_boost = 0;
else
{
if (vp8_mode_order[mode_index] == ZEROMV)
{
- if (vp8_ref_frame_order[mode_index] != LAST_FRAME)
+ if (this_ref_frame != LAST_FRAME)
cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
else
cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
@@ -1969,8 +1960,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
int tmp_rd;
int this_rd_thresh;
- this_rd_thresh = (x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME) ? cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
- this_rd_thresh = (x->e_mbd.mode_info_context->mbmi.ref_frame == GOLDEN_FRAME) ? cpi->rd_threshes[THR_NEWG]: this_rd_thresh;
+ this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ? cpi->rd_threshes[THR_NEW1] : cpi->rd_threshes[THR_NEW3];
+ this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ? cpi->rd_threshes[THR_NEW2] : this_rd_thresh;
tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
best_yrd, mdcounts,
@@ -2024,8 +2015,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
int sadpb = x->sadperbit16;
int_mv mvp_full;
- int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7)?1:0);
- int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7)?1:0);
+ int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+ int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL;
int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL;
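/* Worked example for the rewritten bounds (illustrative values): since
 * ((x + 7) >> 3) is the ceiling of x/8, the new col_min/row_min match the
 * old "add 1 if any fractional bits are set" form. With
 * best_ref_mv.as_mv.col == 13 (1/8-pel units):
 *     old: (13 >> 3) - MAX_FULL_PEL_VAL + ((13 & 7) ? 1 : 0)
 *        =  1 - MAX_FULL_PEL_VAL + 1 = 2 - MAX_FULL_PEL_VAL
 *     new: ((13 + 7) >> 3) - MAX_FULL_PEL_VAL = 2 - MAX_FULL_PEL_VAL
 */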
@@ -2174,7 +2165,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
continue;
vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
- vp8_build_inter16x16_predictors_mby(&x->e_mbd);
+ vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);
if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
x->skip = 1;
@@ -2294,11 +2285,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
}
- // Experimental debug code.
- //all_rds[mode_index] = this_rd;
- //all_rates[mode_index] = rate2;
- //all_dist[mode_index] = distortion2;
-
// Keep record of best intra distortion
if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
(this_rd < best_intra_rd) )
@@ -2399,7 +2385,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
- (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+ (cpi->common.mb_no_coeff_skip);
x->e_mbd.mode_info_context->mbmi.partitioning = 0;
return;
@@ -2426,10 +2412,14 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
x->partition_info->bmi[15].mv.as_int;
}
- rd_update_mvcount(cpi, x, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);
-
-
+ if (sign_bias
+ != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+ {
+ best_ref_mv.as_mv.row *= -1;
+ best_ref_mv.as_mv.col *= -1;
+ }
+ rd_update_mvcount(cpi, x, &best_ref_mv);
}
void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index 95134cb81..5ee869903 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -14,6 +14,57 @@
#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
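/* Worked example for the macro above, with illustrative numbers only. With
 * RM (rdmult) = 300, DM (rddiv) = 100, R (rate) = 100 and D (distortion) = 50:
 *     rate term = (128 + 100 * 300) >> 8 = 30128 >> 8 = 117
 *     RDCOST    = 117 + 100 * 50        = 5117
 * The +128 rounds the fixed-point (>>8) rate scaling to nearest.
 */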
+static void insertsortmv(int arr[], int len)
+{
+ int i, j, k;
+
+ for ( i = 1 ; i <= len-1 ; i++ )
+ {
+ for ( j = 0 ; j < i ; j++ )
+ {
+ if ( arr[j] > arr[i] )
+ {
+ int temp;
+
+ temp = arr[i];
+
+ for ( k = i; k >j; k--)
+ arr[k] = arr[k - 1] ;
+
+ arr[j] = temp ;
+ }
+ }
+ }
+}
+
+static void insertsortsad(int arr[],int idx[], int len)
+{
+ int i, j, k;
+
+ for ( i = 1 ; i <= len-1 ; i++ )
+ {
+ for ( j = 0 ; j < i ; j++ )
+ {
+ if ( arr[j] > arr[i] )
+ {
+ int temp, tempi;
+
+ temp = arr[i];
+ tempi = idx[i];
+
+ for ( k = i; k >j; k--)
+ {
+ arr[k] = arr[k - 1] ;
+ idx[k] = idx[k - 1];
+ }
+
+ arr[j] = temp ;
+ idx[j] = tempi;
+ }
+ }
+ }
+}
+
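/* A minimal usage sketch for the helpers above (illustrative only):
 * insertsortsad() keeps a parallel index array in step with the SAD values
 * it sorts, so callers can recover which candidate produced each cost.
 */
#if 0 /* example */
int sad[4] = { 90, 20, 50, 70 };
int idx[4] = {  0,  1,  2,  3 };
insertsortsad(sad, idx, 4);
/* sad is now { 20, 50, 70, 90 }, idx is { 1, 2, 3, 0 }. */
#endif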
extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index b9ade1c6c..545e4f205 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -22,7 +22,6 @@
#include "ratectrl.h"
#include "vp8/common/quant_common.h"
#include "segmentation.h"
-#include "vp8/common/g_common.h"
#include "vpx_scale/yv12extend.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/swapyv12buffer.h"
@@ -98,7 +97,7 @@ void vp8_temporal_filter_apply_c
unsigned short *count
)
{
- int i, j, k;
+ unsigned int i, j, k;
int modifier;
int byte = 0;
@@ -186,7 +185,7 @@ static int vp8_temporal_filter_find_matching_mb_c
if (cpi->Speed < 8)
{
step_param = cpi->sf.first_step +
- ((cpi->Speed > 5) ? 1 : 0);
+ (cpi->Speed > 5);
further_steps =
(cpi->sf.max_step_search_steps - 1)-step_param;
}
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index e81948567..8bfc47f8f 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -514,17 +514,19 @@ static __inline void stuff1st_order_b
TOKENEXTRA **tp,
ENTROPY_CONTEXT *a,
ENTROPY_CONTEXT *l,
+ int type,
VP8_COMP *cpi
)
{
int pt; /* near block/prev token context index */
+ int band;
TOKENEXTRA *t = *tp; /* store tokens starting here */
VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-
+ band = type ? 0 : 1;
t->Token = DCT_EOB_TOKEN;
- t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt];
+ t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
t->skip_eob_node = 0;
- ++cpi->coef_counts [0] [1] [pt] [DCT_EOB_TOKEN];
+ ++cpi->coef_counts [type] [band] [pt] [DCT_EOB_TOKEN];
++t;
*tp = t;
pt = 0; /* 0 <-> all coeff data is zero */
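/* Note on the band selection above (per the VP8 token layout): when
 * type != 0 the block carries its own DC coefficient, so the stuffed EOB
 * sits at coefficient position 0 (band 0); when type == 0 the DC lives in
 * the second-order (Y2) block and the first first-order coefficient is
 * position 1 (band 1).
 */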
@@ -561,15 +563,19 @@ void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
int plane_type;
int b;
-
- stuff2nd_order_b(t,
+ plane_type = 3;
+ if((x->mode_info_context->mbmi.mode != B_PRED
+ && x->mode_info_context->mbmi.mode != SPLITMV))
+ {
+ stuff2nd_order_b(t,
A + vp8_block2above[24], L + vp8_block2left[24], cpi);
- plane_type = 0;
+ plane_type = 0;
+ }
for (b = 0; b < 16; b++)
stuff1st_order_b(t,
A + vp8_block2above[b],
- L + vp8_block2left[b], cpi);
+ L + vp8_block2left[b], plane_type, cpi);
for (b = 16; b < 24; b++)
stuff1st_order_buv(t,
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index 4ce16ce90..75e8aa3c2 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -73,74 +73,71 @@ sym(vp8_subtract_b_mmx_impl):
pop rbp
ret
-;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
+;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
+;unsigned char *pred, int pred_stride)
global sym(vp8_subtract_mby_mmx)
sym(vp8_subtract_mby_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;src
+ movsxd rdx, dword ptr arg(2);src_stride
+ mov rax, arg(3) ;pred
+ push rbx
+ movsxd rbx, dword ptr arg(4);pred_stride
- mov rsi, arg(1) ;src
- mov rdi, arg(0) ;diff
-
- mov rax, arg(2) ;pred
- movsxd rdx, dword ptr arg(3) ;stride
+ pxor mm0, mm0
+ mov rcx, 16
- mov rcx, 16
- pxor mm0, mm0
.submby_loop:
+ movq mm1, [rsi]
+ movq mm3, [rax]
- movq mm1, [rsi]
- movq mm3, [rax]
-
- movq mm2, mm1
- movq mm4, mm3
-
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
-
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm3
- psubw mm2, mm4
+ movq mm2, mm1
+ movq mm4, mm3
- movq [rdi], mm1
- movq [rdi+8], mm2
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
- movq mm1, [rsi+8]
- movq mm3, [rax+8]
+ psubw mm1, mm3
+ psubw mm2, mm4
- movq mm2, mm1
- movq mm4, mm3
+ movq [rdi], mm1
+ movq [rdi+8], mm2
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
+ movq mm1, [rsi+8]
+ movq mm3, [rax+8]
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
+ movq mm2, mm1
+ movq mm4, mm3
- psubw mm1, mm3
- psubw mm2, mm4
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
- movq [rdi+16], mm1
- movq [rdi+24], mm2
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+ psubw mm1, mm3
+ psubw mm2, mm4
- add rdi, 32
- add rax, 16
-
- lea rsi, [rsi+rdx]
-
- sub rcx, 1
- jnz .submby_loop
+ movq [rdi+16], mm1
+ movq [rdi+24], mm2
+ add rdi, 32
+ lea rax, [rax+rbx]
+ lea rsi, [rsi+rdx]
+ dec rcx
+ jnz .submby_loop
+ pop rbx
pop rdi
pop rsi
; begin epilog
@@ -149,281 +146,75 @@ sym(vp8_subtract_mby_mmx):
ret
-;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
+; int src_stride, unsigned char *upred,
+; unsigned char *vpred, int pred_stride)
+
global sym(vp8_subtract_mbuv_mmx)
sym(vp8_subtract_mbuv_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
+ SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
; end prolog
- ;short *udiff = diff + 256;
- ;short *vdiff = diff + 320;
- ;unsigned char *upred = pred + 256;
- ;unsigned char *vpred = pred + 320;
-
- ;unsigned char *z = usrc;
- ;unsigned short *diff = udiff;
- ;unsigned char *Predictor= upred;
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(1) ;z = usrc
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- add rax, 256 ;Predictor = pred + 256
- movsxd rdx, dword ptr arg(4) ;stride;
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
- ;unsigned char *z = vsrc;
- ;unsigned short *diff = vdiff;
- ;unsigned char *Predictor= vpred;
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(2) ;z = usrc
- add rdi, 320*2 ;diff = diff + 320 (shorts)
- add rax, 320 ;Predictor = pred + 320
- movsxd rdx, dword ptr arg(4) ;stride;
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;usrc
+ movsxd rdx, dword ptr arg(3);src_stride;
+ mov rax, arg(4) ;upred
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ mov rcx, 8
+ push rbx
+ movsxd rbx, dword ptr arg(6);pred_stride
+
+ pxor mm7, mm7
+
+.submbu_loop:
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+ add rdi, 16
+ add rsi, rdx
+ add rax, rbx
+
+ dec rcx
+ jnz .submbu_loop
+
+ mov rsi, arg(2) ;vsrc
+ mov rax, arg(5) ;vpred
+ mov rcx, 8
+
+.submbv_loop:
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+ add rdi, 16
+ add rsi, rdx
+ add rax, rbx
+
+ dec rcx
+ jnz .submbv_loop
+
+ pop rbx
; begin epilog
pop rdi
pop rsi
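For reference, a plain-C sketch of what the rewritten vp8_subtract_mbuv_mmx
loop computes, using the new prototype from the comment above (a sketch for
illustration, not part of the change):

    /* diff layout: 256 shorts of Y, then 64 of U at +256, then 64 of V at +320 */
    static void subtract_mbuv_c(short *diff,
                                unsigned char *usrc, unsigned char *vsrc,
                                int src_stride,
                                unsigned char *upred, unsigned char *vpred,
                                int pred_stride)
    {
        short *udiff = diff + 256;
        short *vdiff = diff + 320;
        int r, c;

        for (r = 0; r < 8; r++)         /* 8x8 U block */
            for (c = 0; c < 8; c++)
                udiff[r * 8 + c] = usrc[r * src_stride + c] -
                                   upred[r * pred_stride + c];

        for (r = 0; r < 8; r++)         /* 8x8 V block */
            for (c = 0; c < 8; c++)
                vdiff[r * 8 + c] = vsrc[r * src_stride + c] -
                                   vpred[r * pred_stride + c];
    }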
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
index 3bd1ff678..008e9c7d1 100644
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -71,277 +71,166 @@ sym(vp8_subtract_b_sse2_impl):
ret
-;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
+;unsigned char *pred, int pred_stride)
global sym(vp8_subtract_mby_sse2)
sym(vp8_subtract_mby_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 7
+ SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
- mov rsi, arg(1) ;src
- mov rdi, arg(0) ;diff
-
- mov rax, arg(2) ;pred
- movsxd rdx, dword ptr arg(3) ;stride
-
- mov rcx, 8 ; do two lines at one time
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;src
+ movsxd rdx, dword ptr arg(2);src_stride
+ mov rax, arg(3) ;pred
+ movdqa xmm4, [GLOBAL(t80)]
+ push rbx
+ mov rcx, 8 ; do two lines at one time
+ movsxd rbx, dword ptr arg(4);pred_stride
.submby_loop:
- movdqa xmm0, XMMWORD PTR [rsi] ; src
- movdqa xmm1, XMMWORD PTR [rax] ; pred
+ movdqa xmm0, [rsi] ; src
+ movdqa xmm1, [rax] ; pred
- movdqa xmm2, xmm0
- psubb xmm0, xmm1
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
+ pxor xmm1, xmm4 ;convert to signed values
+ pxor xmm2, xmm4
+ pcmpgtb xmm1, xmm2 ; obtain sign information
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
+ movdqa xmm2, xmm0
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm1 ; put sign back to subtraction
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
+ movdqa xmm3, [rsi + rdx]
+ movdqa xmm5, [rax + rbx]
- movdqa xmm4, XMMWORD PTR [rsi + rdx]
- movdqa xmm5, XMMWORD PTR [rax + 16]
+ lea rsi, [rsi+rdx*2]
+ lea rax, [rax+rbx*2]
- movdqa xmm6, xmm4
- psubb xmm4, xmm5
+ movdqa [rdi], xmm0
+ movdqa [rdi +16], xmm2
- pxor xmm5, [GLOBAL(t80)] ;convert to signed values
- pxor xmm6, [GLOBAL(t80)]
- pcmpgtb xmm5, xmm6 ; obtain sign information
+ movdqa xmm1, xmm3
+ psubb xmm3, xmm5
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- punpcklbw xmm4, xmm5 ; put sign back to subtraction
- punpckhbw xmm6, xmm7 ; put sign back to subtraction
+ pxor xmm5, xmm4 ;convert to signed values
+ pxor xmm1, xmm4
+ pcmpgtb xmm5, xmm1 ; obtain sign information
- movdqa XMMWORD PTR [rdi +32], xmm4
- movdqa XMMWORD PTR [rdi +48], xmm6
+ movdqa xmm1, xmm3
+ punpcklbw xmm3, xmm5 ; put sign back to subtraction
+ punpckhbw xmm1, xmm5 ; put sign back to subtraction
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
+ movdqa [rdi +32], xmm3
+ movdqa [rdi +48], xmm1
- sub rcx, 1
- jnz .submby_loop
+ add rdi, 64
+ dec rcx
+ jnz .submby_loop
+ pop rbx
pop rdi
pop rsi
; begin epilog
RESTORE_GOT
- RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
-
-;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
+; int src_stride, unsigned char *upred,
+; unsigned char *vpred, int pred_stride)
global sym(vp8_subtract_mbuv_sse2)
sym(vp8_subtract_mbuv_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
+ SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
; end prolog
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(1) ;z = usrc
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- add rax, 256 ;Predictor = pred + 256
- movsxd rdx, dword ptr arg(4) ;stride;
- lea rcx, [rdx + rdx*2]
-
- ;u
- ;line 0 1
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- ;line 2 3
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+16] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 32], xmm0
- movdqa XMMWORD PTR [rdi + 48], xmm2
-
- ;line 4 5
- lea rsi, [rsi + rdx*4]
-
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 64], xmm0
- movdqa XMMWORD PTR [rdi + 80], xmm2
-
- ;line 6 7
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 96], xmm0
- movdqa XMMWORD PTR [rdi + 112], xmm2
-
- ;v
- mov rsi, arg(2) ;z = vsrc
- add rdi, 64*2 ;diff = diff + 320 (shorts)
- add rax, 64 ;Predictor = pred + 320
-
- ;line 0 1
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- ;line 2 3
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+16] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 32], xmm0
- movdqa XMMWORD PTR [rdi + 48], xmm2
-
- ;line 4 5
- lea rsi, [rsi + rdx*4]
-
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 64], xmm0
- movdqa XMMWORD PTR [rdi + 80], xmm2
-
- ;line 6 7
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 96], xmm0
- movdqa XMMWORD PTR [rdi + 112], xmm2
-
+ movdqa xmm4, [GLOBAL(t80)]
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;usrc
+ movsxd rdx, dword ptr arg(3);src_stride;
+ mov rax, arg(4) ;upred
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ mov rcx, 4
+ push rbx
+ movsxd rbx, dword ptr arg(6);pred_stride
+
+ ;u
+.submbu_loop:
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx] ; src -- next line
+ movq xmm1, [rax] ; pred
+ movq xmm3, [rax+rbx] ; pred -- next line
+ lea rsi, [rsi + rdx*2]
+ lea rax, [rax + rbx*2]
+
+ punpcklqdq xmm0, xmm2
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, xmm4 ;convert to signed values
+ pxor xmm2, xmm4
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi], xmm0 ; store difference
+ movdqa [rdi +16], xmm2 ; store difference
+ add rdi, 32
+ sub rcx, 1
+ jnz .submbu_loop
+
+ mov rsi, arg(2) ;vsrc
+ mov rax, arg(5) ;vpred
+ mov rcx, 4
+
+ ;v
+.submbv_loop:
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx] ; src -- next line
+ movq xmm1, [rax] ; pred
+ movq xmm3, [rax+rbx] ; pred -- next line
+ lea rsi, [rsi + rdx*2]
+ lea rax, [rax + rbx*2]
+
+ punpcklqdq xmm0, xmm2
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, xmm4 ;convert to signed values
+ pxor xmm2, xmm4
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi], xmm0 ; store difference
+ movdqa [rdi +16], xmm2 ; store difference
+ add rdi, 32
+ sub rcx, 1
+ jnz .submbv_loop
+
+ pop rbx
; begin epilog
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 92b695f17..e2524b46a 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -12,6 +12,7 @@
#include "vp8/encoder/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
extern void filter_block1d_h6_mmx
(
@@ -21,7 +22,7 @@ extern void filter_block1d_h6_mmx
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
- short *vp7_filter
+ short *filter
);
extern void filter_block1d_v6_mmx
(
@@ -31,7 +32,7 @@ extern void filter_block1d_v6_mmx
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
- short *vp7_filter
+ short *filter
);
extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
@@ -198,24 +199,6 @@ unsigned int vp8_variance8x16_mmx(
}
-
-
-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// int one pass //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
-{
- { 128, 128, 128, 128, 0, 0, 0, 0 },
- { 112, 112, 112, 112, 16, 16, 16, 16 },
- { 96, 96, 96, 96, 32, 32, 32, 32 },
- { 80, 80, 80, 80, 48, 48, 48, 48 },
- { 64, 64, 64, 64, 64, 64, 64, 64 },
- { 48, 48, 48, 48, 80, 80, 80, 80 },
- { 32, 32, 32, 32, 96, 96, 96, 96 },
- { 16, 16, 16, 16, 112, 112, 112, 112 }
-};
-
unsigned int vp8_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,
@@ -232,7 +215,7 @@ unsigned int vp8_sub_pixel_variance4x4_mmx
vp8_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
@@ -257,7 +240,7 @@ unsigned int vp8_sub_pixel_variance8x8_mmx
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
@@ -283,7 +266,7 @@ unsigned int vp8_sub_pixel_variance16x16_mmx
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum0, &xxsum0
);
@@ -291,7 +274,7 @@ unsigned int vp8_sub_pixel_variance16x16_mmx
vp8_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum1, &xxsum1
);
@@ -336,7 +319,7 @@ unsigned int vp8_sub_pixel_variance16x8_mmx
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum0, &xxsum0
);
@@ -344,7 +327,7 @@ unsigned int vp8_sub_pixel_variance16x8_mmx
vp8_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum1, &xxsum1
);
@@ -371,7 +354,7 @@ unsigned int vp8_sub_pixel_variance8x16_mmx
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index 24062eb9b..39213b03d 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -12,11 +12,12 @@
#include "vp8/encoder/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
@@ -135,8 +136,6 @@ void vp8_half_vert_variance16x_h_sse2
unsigned int *sumsquared
);
-DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
-
unsigned int vp8_variance4x4_wmt(
const unsigned char *src_ptr,
int source_stride,
@@ -262,7 +261,7 @@ unsigned int vp8_sub_pixel_variance4x4_wmt
vp8_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 5c15a3e4f..0e564320f 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -9,7 +9,6 @@
##
VP8_COMMON_SRCS-yes += vp8_common.mk
-VP8_COMMON_SRCS-yes += common/type_aliases.h
VP8_COMMON_SRCS-yes += common/pragmas.h
VP8_COMMON_SRCS-yes += common/ppflags.h
VP8_COMMON_SRCS-yes += common/onyx.h
@@ -20,6 +19,8 @@ VP8_COMMON_SRCS-yes += common/blockd.c
VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
VP8_COMMON_SRCS-yes += common/debugmodes.c
VP8_COMMON_SRCS-yes += common/default_coef_probs.h
+VP8_COMMON_SRCS-yes += common/dequantize.c
+VP8_COMMON_SRCS-yes += common/dequantize.h
VP8_COMMON_SRCS-yes += common/entropy.c
VP8_COMMON_SRCS-yes += common/entropymode.c
VP8_COMMON_SRCS-yes += common/entropymv.c
@@ -28,17 +29,16 @@ VP8_COMMON_SRCS-yes += common/filter.c
VP8_COMMON_SRCS-yes += common/filter.h
VP8_COMMON_SRCS-yes += common/findnearmv.c
VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
+VP8_COMMON_SRCS-yes += common/idct_blk.c
VP8_COMMON_SRCS-yes += common/idctllm.c
VP8_COMMON_SRCS-yes += common/alloccommon.h
VP8_COMMON_SRCS-yes += common/blockd.h
VP8_COMMON_SRCS-yes += common/common.h
-VP8_COMMON_SRCS-yes += common/common_types.h
VP8_COMMON_SRCS-yes += common/entropy.h
VP8_COMMON_SRCS-yes += common/entropymode.h
VP8_COMMON_SRCS-yes += common/entropymv.h
VP8_COMMON_SRCS-yes += common/extend.h
VP8_COMMON_SRCS-yes += common/findnearmv.h
-VP8_COMMON_SRCS-yes += common/g_common.h
VP8_COMMON_SRCS-yes += common/header.h
VP8_COMMON_SRCS-yes += common/idct.h
VP8_COMMON_SRCS-yes += common/invtrans.h
@@ -57,7 +57,6 @@ VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
VP8_COMMON_SRCS-yes += common/systemdependent.h
VP8_COMMON_SRCS-yes += common/threading.h
VP8_COMMON_SRCS-yes += common/treecoder.h
-VP8_COMMON_SRCS-yes += common/invtrans.c
VP8_COMMON_SRCS-yes += common/loopfilter.c
VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
VP8_COMMON_SRCS-yes += common/mbpitch.c
@@ -69,9 +68,15 @@ VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
+
+
+
VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
VP8_COMMON_SRCS-yes += common/treecoder.c
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/dequantize_x86.h
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/recon_x86.h
@@ -82,11 +87,14 @@ VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
@@ -104,8 +112,6 @@ endif
# common (c)
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/idct_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
@@ -113,8 +119,12 @@ VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/recon_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/subpixel_arm.h
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.h
# common (armv6)
+VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.h
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM)
@@ -127,6 +137,9 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/intra4x4_predict_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dequant_idct_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dequantize_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_blk_v6.c
# common (neon)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict4x4_neon$(ASM)
@@ -149,3 +162,8 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dequant_idct_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dequantizeb_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/idct_blk_neon.c
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 7260e942b..6181ee8ee 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -83,7 +83,7 @@ struct vpx_codec_alg_priv
vpx_codec_enc_cfg_t cfg;
struct vp8_extracfg vp8_cfg;
VP8_CONFIG oxcf;
- VP8_PTR cpi;
+ struct VP8_COMP *cpi;
unsigned char *cx_data;
unsigned int cx_data_sz;
vpx_image_t preview_img;
@@ -137,7 +137,8 @@ update_error_state(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
const vpx_codec_enc_cfg_t *cfg,
- const struct vp8_extracfg *vp8_cfg)
+ const struct vp8_extracfg *vp8_cfg,
+ int finalize)
{
RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */
RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */
@@ -193,6 +194,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+ if(finalize && cfg->rc_end_usage == VPX_CQ)
+ RANGE_CHECK(vp8_cfg, cq_level,
+ cfg->rc_min_quantizer, cfg->rc_max_quantizer);
#if !(CONFIG_REALTIME_ONLY)
if (cfg->g_pass == VPX_RC_LAST_PASS)
@@ -264,21 +268,15 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
vpx_codec_enc_cfg_t cfg,
- struct vp8_extracfg vp8_cfg)
+ struct vp8_extracfg vp8_cfg,
+ vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
{
oxcf->multi_threaded = cfg.g_threads;
oxcf->Version = cfg.g_profile;
oxcf->Width = cfg.g_w;
oxcf->Height = cfg.g_h;
- /* guess a frame rate if out of whack, use 30 */
- oxcf->frame_rate = (double)(cfg.g_timebase.den) /
- (double)(cfg.g_timebase.num);
-
- if (oxcf->frame_rate > 180)
- {
- oxcf->frame_rate = 30;
- }
+ oxcf->timebase = cfg.g_timebase;
oxcf->error_resilient_mode = cfg.g_error_resilient;
@@ -362,6 +360,21 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id));
}
+#if CONFIG_MULTI_RES_ENCODING
+ /* When mr_cfg is NULL, oxcf->mr_total_resolutions and oxcf->mr_encoder_id
+ * are both zero-initialized, which gives the correct behaviour in that
+ * case.
+ */
+ if(mr_cfg)
+ {
+ oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions;
+ oxcf->mr_encoder_id = mr_cfg->mr_encoder_id;
+ oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num;
+ oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den;
+ oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info;
+ }
+#endif
+
//oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile;
//strcpy(oxcf->first_pass_file, cfg.g_firstpass_file);
@@ -434,12 +447,12 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
ERROR("Cannot increase lag_in_frames");
- res = validate_config(ctx, cfg, &ctx->vp8_cfg);
+ res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0);
if (!res)
{
ctx->cfg = *cfg;
- set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+ set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
vp8_change_config(ctx->cpi, &ctx->oxcf);
}
@@ -500,26 +513,50 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
}
- res = validate_config(ctx, &ctx->cfg, &xcfg);
+ res = validate_config(ctx, &ctx->cfg, &xcfg, 0);
if (!res)
{
ctx->vp8_cfg = xcfg;
- set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+ set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
vp8_change_config(ctx->cpi, &ctx->oxcf);
}
return res;
#undef MAP
}
-static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx)
+
+static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
+ void **mem_loc)
+{
+ vpx_codec_err_t res = 0;
+
+#if CONFIG_MULTI_RES_ENCODING
+ int mb_rows = ((cfg->g_w + 15) >>4);
+ int mb_cols = ((cfg->g_h + 15) >>4);
+
+ *mem_loc = calloc(mb_rows*mb_cols, sizeof(LOWER_RES_INFO));
+ if(!(*mem_loc))
+ {
+ free(*mem_loc);
+ res = VPX_CODEC_MEM_ERROR;
+ }
+ else
+ res = VPX_CODEC_OK;
+#endif
+
+ return res;
+}
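/* Observation (not a functional change): mb_rows is derived from g_w and
 * mb_cols from g_h, so the names look swapped, but only the product
 * mb_rows * mb_cols feeds the allocation size, so the result is unaffected.
 * Likewise, the free(*mem_loc) in the failure path frees a NULL pointer,
 * which is a harmless no-op.
 */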
+
+static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
+ vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
{
vpx_codec_err_t res = VPX_DEC_OK;
struct vpx_codec_alg_priv *priv;
vpx_codec_enc_cfg_t *cfg;
unsigned int i;
- VP8_PTR optr;
+ struct VP8_COMP *optr;
if (!ctx->priv)
{
@@ -573,13 +610,20 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx)
vp8_initialize();
- res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
+ res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);
if (!res)
{
+ if(mr_cfg)
+ ctx->priv->enc.total_encoders = mr_cfg->mr_total_resolutions;
+ else
+ ctx->priv->enc.total_encoders = 1;
+
set_vp8e_config(&ctx->priv->alg_priv->oxcf,
ctx->priv->alg_priv->cfg,
- ctx->priv->alg_priv->vp8_cfg);
+ ctx->priv->alg_priv->vp8_cfg,
+ mr_cfg);
+
optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf);
if (!optr)
@@ -594,6 +638,11 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx)
static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx)
{
+#if CONFIG_MULTI_RES_ENCODING
+ /* Free multi-encoder shared memory */
+ if (ctx->oxcf.mr_total_resolutions > 0 && (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions-1))
+ free(ctx->oxcf.mr_low_res_mode_info);
+#endif
free(ctx->cx_data);
vp8_remove_compressor(&ctx->cpi);
@@ -691,6 +740,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
if (img)
res = validate_img(ctx, img);
+ if (!res)
+ res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
+
pick_quickcompress_mode(ctx, duration, deadline);
vpx_codec_pkt_list_init(&ctx->pkt_list);
@@ -1230,6 +1282,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) =
vp8e_set_config,
NOT_IMPLEMENTED,
vp8e_get_preview,
+ vp8e_mr_alloc_mem,
} /* encoder functions */
};
@@ -1314,5 +1367,6 @@ vpx_codec_iface_t vpx_enc_vp8_algo =
vp8e_set_config,
NOT_IMPLEMENTED,
vp8e_get_preview,
+ vp8e_mr_alloc_mem,
} /* encoder functions */
};
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index cdfcd2142..fbe58171c 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -57,7 +57,7 @@ struct vpx_codec_alg_priv
vp8_stream_info_t si;
int defer_alloc;
int decoder_init;
- VP8D_PTR pbi;
+ struct VP8D_COMP *pbi;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
#if CONFIG_POSTPROC_VISUALIZER
@@ -181,9 +181,11 @@ static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
/* nothing to clean up */
}
-static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx)
+static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
+ vpx_codec_priv_enc_mr_cfg_t *data)
{
vpx_codec_err_t res = VPX_CODEC_OK;
+ (void) data;
/* This function only allocates space for the vpx_codec_alg_priv_t
* structure. More memory may be required at the time the stream
@@ -387,7 +389,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
if (!res)
{
VP8D_CONFIG oxcf;
- VP8D_PTR optr;
+ struct VP8D_COMP* optr;
vp8dx_initialize();
@@ -564,7 +566,7 @@ static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx,
if (done && !res)
{
vp8_finalize_mmaps(ctx->priv->alg_priv);
- res = ctx->iface->init(ctx);
+ res = ctx->iface->init(ctx, NULL);
}
return res;
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index b71a54aea..2d99981f5 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -86,6 +86,8 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
VP8_CX_SRCS-yes += encoder/temporal_filter.c
VP8_CX_SRCS-yes += encoder/temporal_filter.h
+VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
+VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk
index d88b595fb..d6dc15305 100644
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -52,7 +52,6 @@ VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
VP8_DX_SRCS-yes += decoder/dboolhuff.c
VP8_DX_SRCS-yes += decoder/decodemv.c
VP8_DX_SRCS-yes += decoder/decodframe.c
-VP8_DX_SRCS-yes += decoder/dequantize.c
VP8_DX_SRCS-yes += decoder/detokenize.c
VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h
VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h
@@ -61,20 +60,14 @@ VP8_DX_SRCS-yes += decoder/generic/dsystemdependent.c
VP8_DX_SRCS-yes += decoder/dboolhuff.h
VP8_DX_SRCS-yes += decoder/decodemv.h
VP8_DX_SRCS-yes += decoder/decoderthreading.h
-VP8_DX_SRCS-yes += decoder/dequantize.h
VP8_DX_SRCS-yes += decoder/detokenize.h
VP8_DX_SRCS-yes += decoder/onyxd_int.h
VP8_DX_SRCS-yes += decoder/treereader.h
VP8_DX_SRCS-yes += decoder/onyxd_if.c
VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
-VP8_DX_SRCS-yes += decoder/idct_blk.c
VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h
VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c
VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
-VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/dequantize_x86.h
VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
-VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk
index 6bde42f4c..fa1aaea0b 100644
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -12,20 +12,3 @@
#VP8_DX_SRCS list is modified according to different platforms.
VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/arm_dsystemdependent.c
-VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c
-VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.h
-
-#File list for armv6
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
-
-#File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c
diff --git a/vp8_multi_resolution_encoder.c b/vp8_multi_resolution_encoder.c
new file mode 100644
index 000000000..11c33d618
--- /dev/null
+++ b/vp8_multi_resolution_encoder.c
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This is an example demonstrating multi-resolution encoding in VP8.
+ * High-resolution input video is down-sampled to lower-resolutions. The
+ * encoder then encodes the video and outputs multiple bitstreams with
+ * different resolutions.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include "math.h"
+#define VPX_CODEC_DISABLE_COMPAT 1
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+#include "vpx_ports/mem_ops.h"
+#define interface (vpx_codec_vp8_cx())
+#define fourcc 0x30385056
+
+#define IVF_FILE_HDR_SZ (32)
+#define IVF_FRAME_HDR_SZ (12)
+
+/*
+ * The input video frame is downsampled several times to generate a multi-level
+ * hierarchical structure. NUM_ENCODERS is defined as the number of encoding
+ * levels required. For example, if the input video is 1280x720,
+ * NUM_ENCODERS is 3, and the down-sampling factor is 2, the encoder
+ * outputs 3 bitstreams with resolutions of 1280x720 (level 0),
+ * 640x360 (level 1), and 320x180 (level 2), respectively.
+ */
+#define NUM_ENCODERS 3
+
+/* This example uses the scaler function in libyuv. */
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "third_party/libyuv/include/libyuv/scale.h"
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+
+static double vp8_mse2psnr(double Samples, double Peak, double Mse)
+{
+ double psnr;
+
+ if ((double)Mse > 0.0)
+ psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+ else
+ psnr = 60; // Limit to prevent division by zero
+
+ if (psnr > 60)
+ psnr = 60;
+
+ return psnr;
+}
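/* Worked example (illustrative numbers): for 8-bit video Peak is 255; with
 * Samples = 100 and accumulated Mse (SSE) = 100,
 *     psnr = 10 * log10(255 * 255 * 100 / 100)
 *          = 10 * log10(65025) ~= 48.13 dB,
 * which the cap above would leave unchanged since it is below 60.
 */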
+
+static void die(const char *fmt, ...) {
+ va_list ap;
+
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ if(fmt[strlen(fmt)-1] != '\n')
+ printf("\n");
+ exit(EXIT_FAILURE);
+}
+
+static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
+ const char *detail = vpx_codec_error_detail(ctx);
+
+ printf("%s: %s\n", s, vpx_codec_error(ctx));
+ if(detail)
+ printf(" %s\n",detail);
+ exit(EXIT_FAILURE);
+}
+
+int (*read_frame_p)(FILE *f, vpx_image_t *img);
+
+static int read_frame(FILE *f, vpx_image_t *img) {
+ size_t nbytes, to_read;
+ int res = 1;
+
+ to_read = img->w*img->h*3/2;
+ nbytes = fread(img->planes[0], 1, to_read, f);
+ if(nbytes != to_read) {
+ res = 0;
+ if(nbytes > 0)
+ printf("Warning: Read partial frame. Check your width & height!\n");
+ }
+ return res;
+}
+
+static int read_frame_by_row(FILE *f, vpx_image_t *img) {
+ size_t nbytes, to_read;
+ int res = 1;
+ int plane;
+
+ for (plane = 0; plane < 3; plane++)
+ {
+ unsigned char *ptr;
+ int w = (plane ? (1 + img->d_w) / 2 : img->d_w);
+ int h = (plane ? (1 + img->d_h) / 2 : img->d_h);
+ int r;
+
+ /* Determine the correct plane based on the image format. The for-loop
+ * always counts in Y,U,V order, but this may not match the order of
+ * the data on disk.
+ */
+ switch (plane)
+ {
+ case 1:
+ ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U];
+ break;
+ case 2:
+ ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V];
+ break;
+ default:
+ ptr = img->planes[plane];
+ }
+
+ for (r = 0; r < h; r++)
+ {
+ to_read = w;
+
+ nbytes = fread(ptr, 1, to_read, f);
+ if(nbytes != to_read) {
+ res = 0;
+ if(nbytes > 0)
+ printf("Warning: Read partial frame. Check your width & height!\n");
+ break;
+ }
+
+ ptr += img->stride[plane];
+ }
+ if (!res)
+ break;
+ }
+
+ return res;
+}
+
+static void write_ivf_file_header(FILE *outfile,
+ const vpx_codec_enc_cfg_t *cfg,
+ int frame_cnt) {
+ char header[32];
+
+ if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
+ return;
+ header[0] = 'D';
+ header[1] = 'K';
+ header[2] = 'I';
+ header[3] = 'F';
+ mem_put_le16(header+4, 0); /* version */
+ mem_put_le16(header+6, 32); /* headersize */
+ mem_put_le32(header+8, fourcc); /* fourcc */
+ mem_put_le16(header+12, cfg->g_w); /* width */
+ mem_put_le16(header+14, cfg->g_h); /* height */
+ mem_put_le32(header+16, cfg->g_timebase.den); /* rate */
+ mem_put_le32(header+20, cfg->g_timebase.num); /* scale */
+ mem_put_le32(header+24, frame_cnt); /* length */
+ mem_put_le32(header+28, 0); /* unused */
+
+ if(fwrite(header, 1, 32, outfile)); /* result deliberately ignored */
+}
+
+static void write_ivf_frame_header(FILE *outfile,
+ const vpx_codec_cx_pkt_t *pkt)
+{
+ char header[12];
+ vpx_codec_pts_t pts;
+
+ if(pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+ return;
+
+ pts = pkt->data.frame.pts;
+ mem_put_le32(header, pkt->data.frame.sz);
+ mem_put_le32(header+4, pts&0xFFFFFFFF);
+ mem_put_le32(header+8, pts >> 32);
+
+ if(fwrite(header, 1, 12, outfile)); /* result deliberately ignored */
+}
+
+int main(int argc, char **argv)
+{
+ FILE *infile, *outfile[NUM_ENCODERS];
+ vpx_codec_ctx_t codec[NUM_ENCODERS];
+ vpx_codec_enc_cfg_t cfg[NUM_ENCODERS];
+ vpx_codec_pts_t frame_cnt = 0;
+ vpx_image_t raw[NUM_ENCODERS];
+ vpx_codec_err_t res[NUM_ENCODERS];
+
+ int i;
+ long width;
+ long height;
+ int frame_avail;
+ int got_data;
+ int flags = 0;
+
+ /* Currently, only realtime mode is supported in multi-resolution encoding. */
+ int arg_deadline = VPX_DL_REALTIME;
+
+ /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
+ don't need to know PSNR, which will skip PSNR calculation and save
+ encoding time. */
+ int show_psnr = 0;
+ uint64_t psnr_sse_total[NUM_ENCODERS] = {0};
+ uint64_t psnr_samples_total[NUM_ENCODERS] = {0};
+ double psnr_totals[NUM_ENCODERS][4] = {{0,0}};
+ int psnr_count[NUM_ENCODERS] = {0};
+
+ /* Set the required target bitrates for each resolution level. */
+ unsigned int target_bitrate[NUM_ENCODERS]={1400, 500, 100};
+ /* Enter the frame rate of the input video */
+ int framerate = 30;
+ /* Set down-sampling factor for each resolution level.
+ dsf[0] controls down sampling from level 0 to level 1;
+ dsf[1] controls down sampling from level 1 to level 2;
+ dsf[2] is not used. */
+ vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}};
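+ /* Illustrative example (not part of the program): with a 1280x720 input
+ and dsf = {2/1, 2/1, unused}, the three levels are encoded at 1280x720,
+ 640x360 and 320x180 respectively. */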
+
+ if(argc != (5+NUM_ENCODERS))
+ die("Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
+ argv[0]);
+
+ printf("Using %s\n",vpx_codec_iface_name(interface));
+
+ width = strtol(argv[1], NULL, 0);
+ height = strtol(argv[2], NULL, 0);
+
+ if(width < 16 || width%2 || height < 16 || height%2)
+ die("Invalid resolution: %ldx%ld", width, height);
+
+ /* Open input video file for encoding */
+ if(!(infile = fopen(argv[3], "rb")))
+ die("Failed to open %s for reading", argv[3]);
+
+ /* Open output file for each encoder to output bitstreams */
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ if(!(outfile[i] = fopen(argv[i+4], "wb")))
+ die("Failed to open %s for writing", argv[i+4]);
+ }
+
+ show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0);
+
+ /* Populate default encoder configuration */
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0);
+ if(res[i]) {
+ printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i]));
+ return EXIT_FAILURE;
+ }
+ }
+
+ /*
+ * Update the default configuration according to needs of the application.
+ */
+ /* Highest-resolution encoder settings */
+ cfg[0].g_w = width;
+ cfg[0].g_h = height;
+ cfg[0].g_threads = 1; /* number of threads used */
+ cfg[0].rc_dropframe_thresh = 0;
+ cfg[0].rc_end_usage = VPX_CBR;
+ cfg[0].rc_resize_allowed = 0;
+ cfg[0].rc_min_quantizer = 4;
+ cfg[0].rc_max_quantizer = 56;
+ cfg[0].rc_undershoot_pct = 98;
+ cfg[0].rc_overshoot_pct = 100;
+ cfg[0].rc_buf_initial_sz = 500;
+ cfg[0].rc_buf_optimal_sz = 600;
+ cfg[0].rc_buf_sz = 1000;
+ cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
+ cfg[0].g_lag_in_frames = 0;
+
+ /* Force a keyframe every 1000 frames. To disable automatic keyframe
+ placement entirely, set kf_mode to VPX_KF_DISABLED instead. */
+ //cfg[0].kf_mode = VPX_KF_DISABLED;
+ cfg[0].kf_min_dist = cfg[0].kf_max_dist = 1000;
+
+ cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */
+ cfg[0].g_timebase.num = 1; /* Set fps */
+ cfg[0].g_timebase.den = framerate;
+
+ /* Other-resolution encoder settings */
+ for (i=1; i< NUM_ENCODERS; i++)
+ {
+ memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t));
+
+ cfg[i].g_threads = 1; /* number of threads used */
+ cfg[i].rc_target_bitrate = target_bitrate[i];
+
+ /* Note: Width & height of other-resolution encoders are calculated
+ * from the highest-resolution encoder's size and the corresponding
+ * down_sampling_factor.
+ */
+ {
+ unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1;
+ unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1;
+ cfg[i].g_w = iw/dsf[i-1].num;
+ cfg[i].g_h = ih/dsf[i-1].num;
+ }
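+ /* Worked example (illustrative): g_w = 1280 with dsf = 2/1 gives
+ iw = 1280*1 + 2 - 1 = 1281 and g_w = 1281/2 = 640, i.e. a ceiling
+ division of g_w*den by num. */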
+
+ /* Round width & height up to the nearest even numbers. */
+ /* TODO: should odd sizes be supported? */
+ if((cfg[i].g_w)%2)cfg[i].g_w++;
+ if((cfg[i].g_h)%2)cfg[i].g_h++;
+ }
+
+ /* Allocate image for each encoder */
+ for (i=0; i< NUM_ENCODERS; i++)
+ if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
+ die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
+
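+ /* If the Y stride equals the display width, the planes are packed with
+ no row padding and the whole frame can be read with a single fread();
+ otherwise fall back to reading row by row. */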
+ if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
+ read_frame_p = read_frame;
+ else
+ read_frame_p = read_frame_by_row;
+
+ for (i=0; i< NUM_ENCODERS; i++)
+ write_ivf_file_header(outfile[i], &cfg[i], 0);
+
+ /* Initialize multi-encoder */
+ if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS,
+ (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0]))
+ die_codec(&codec[0], "Failed to initialize encoder");
+
+ /* The extra encoding configuration parameters can be set as follows. */
+ /* Set encoding speed */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ int speed = -6;
+ if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
+ die_codec(&codec[i], "Failed to set cpu_used");
+ }
+ /* Set static threshold for the highest-resolution encoder. A value of
+ * 1000 trades some quality for faster encoding. */
+ {
+ unsigned int static_thresh = 1000;
+ if(vpx_codec_control(&codec[0], VP8E_SET_STATIC_THRESHOLD, static_thresh))
+ die_codec(&codec[0], "Failed to set static threshold");
+ }
+ /* Set static thresh = 0 for other encoders for better quality */
+ for ( i=1; i<NUM_ENCODERS; i++)
+ {
+ unsigned int static_thresh = 0;
+ if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh))
+ die_codec(&codec[i], "Failed to set static threshold");
+ }
+
+ frame_avail = 1;
+ got_data = 0;
+
+ while(frame_avail || got_data)
+ {
+ vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
+ const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
+
+ flags = 0;
+ frame_avail = read_frame_p(infile, &raw[0]);
+
+ if(frame_avail)
+ {
+ for ( i=1; i<NUM_ENCODERS; i++)
+ {
+ /* Scale the previous level's image down by the downsampling factor. */
+ /* FilterMode 1 or 2 give better psnr than FilterMode 0. */
+ I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y],
+ raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U],
+ raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V],
+ raw[i-1].d_w, raw[i-1].d_h,
+ raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
+ raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
+ raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
+ raw[i].d_w, raw[i].d_h, 1);
+ }
+ }
+
+ /* Encode the frame at all resolution levels. */
+ if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
+ frame_cnt, 1, flags, arg_deadline))
+ die_codec(&codec[0], "Failed to encode frame");
+
+ for (i=NUM_ENCODERS-1; i>=0 ; i--)
+ {
+ got_data = 0;
+
+ while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) )
+ {
+ got_data = 1;
+ switch(pkt[i]->kind) {
+ case VPX_CODEC_CX_FRAME_PKT:
+ write_ivf_frame_header(outfile[i], pkt[i]);
+ (void) fwrite(pkt[i]->data.frame.buf, 1, pkt[i]->data.frame.sz,
+ outfile[i]);
+ break;
+ case VPX_CODEC_PSNR_PKT:
+ if (show_psnr)
+ {
+ int j;
+
+ psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
+ psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
+ for (j = 0; j < 4; j++)
+ {
+ //fprintf(stderr, "%.3lf ", pkt[i]->data.psnr.psnr[j]);
+ psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
+ }
+ psnr_count[i]++;
+ }
+
+ break;
+ default:
+ break;
+ }
+ printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT
+ && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
+ fflush(stdout);
+ }
+ }
+ frame_cnt++;
+ }
+ printf("\n");
+
+ fclose(infile);
+
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ printf("Processed %ld frames.\n",(long int)frame_cnt-1);
+
+ /* Calculate PSNR and print it out */
+ if ( (show_psnr) && (psnr_count[i]>0) )
+ {
+ int j;
+ double ovpsnr = vp8_mse2psnr(psnr_samples_total[i], 255.0,
+ psnr_sse_total[i]);
+
+ fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
+
+ fprintf(stderr, " %.3lf", ovpsnr);
+ for (j = 0; j < 4; j++)
+ {
+ fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
+ }
+ }
+
+ if(vpx_codec_destroy(&codec[i]))
+ die_codec(&codec[i], "Failed to destroy codec");
+
+ /* Try to rewrite the file header with the actual frame count */
+ if(!fseek(outfile[i], 0, SEEK_SET))
+ write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1);
+ fclose(outfile[i]);
+
+ vpx_img_free(&raw[i]);
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index a1ff1921e..0703d6a4f 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -56,9 +56,10 @@
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_CODEC_INTERNAL_ABI_VERSION (3) /**<\hideinitializer*/
+#define VPX_CODEC_INTERNAL_ABI_VERSION (4) /**<\hideinitializer*/
typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t;
+typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t;
/*!\brief init function pointer prototype
*
@@ -73,7 +74,8 @@ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t;
* \retval #VPX_CODEC_MEM_ERROR
* Memory operation failed.
*/
-typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx);
+typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx,
+ vpx_codec_priv_enc_mr_cfg_t *data);
/*!\brief destroy function pointer prototype
*
@@ -264,6 +266,10 @@ typedef vpx_fixed_buf_t *
typedef vpx_image_t *
(*vpx_codec_get_preview_frame_fn_t)(vpx_codec_alg_priv_t *ctx);
+typedef vpx_codec_err_t
+(*vpx_codec_enc_mr_get_mem_loc_fn_t)(const vpx_codec_enc_cfg_t *cfg,
+ void **mem_loc);
+
/*!\brief usage configuration mapping
*
* This structure stores the mapping between usage identifiers and
@@ -309,8 +315,9 @@ struct vpx_codec_iface
vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */
vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */
vpx_codec_enc_config_set_fn_t cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */
- vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */
+ vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */
vpx_codec_get_preview_frame_fn_t get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */
+ vpx_codec_enc_mr_get_mem_loc_fn_t mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */
} enc;
};
@@ -353,9 +360,21 @@ struct vpx_codec_priv
unsigned int cx_data_pad_before;
unsigned int cx_data_pad_after;
vpx_codec_cx_pkt_t cx_data_pkt;
+ unsigned int total_encoders;
} enc;
};
+/*
+ * Multi-resolution encoding internal configuration
+ */
+struct vpx_codec_priv_enc_mr_cfg
+{
+ unsigned int mr_total_resolutions;
+ unsigned int mr_encoder_id;
+ struct vpx_rational mr_down_sampling_factor;
+ void* mr_low_res_mode_info;
+};
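+/* Illustrative example: with mr_total_resolutions == 3, the encoder contexts
+ * receive mr_encoder_id 2 (highest resolution) down to 0 (lowest), and all
+ * share the same mr_low_res_mode_info block so mode and motion information
+ * from the low-resolution encode can be reused at the higher levels. */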
+
#undef VPX_CTRL_USE_TYPE
#define VPX_CTRL_USE_TYPE(id, typ) \
static typ id##__value(va_list args) {return va_arg(args, typ);} \
diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index 5d31c2c49..59a783dd9 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c
@@ -56,7 +56,7 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
if (!(flags & VPX_CODEC_USE_XMA))
{
- res = ctx->iface->init(ctx);
+ res = ctx->iface->init(ctx, NULL);
if (res)
{
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index 5e86835ea..03ddc62b2 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -51,7 +51,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
ctx->priv = NULL;
ctx->init_flags = flags;
ctx->config.enc = cfg;
- res = ctx->iface->init(ctx);
+ res = ctx->iface->init(ctx, NULL);
if (res)
{
@@ -66,6 +66,85 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
return SAVE_STATUS(ctx, res);
}
+vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx,
+ vpx_codec_iface_t *iface,
+ vpx_codec_enc_cfg_t *cfg,
+ int num_enc,
+ vpx_codec_flags_t flags,
+ vpx_rational_t *dsf,
+ int ver)
+{
+ vpx_codec_err_t res = 0;
+
+ if (ver != VPX_ENCODER_ABI_VERSION)
+ res = VPX_CODEC_ABI_MISMATCH;
+ else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1))
+ res = VPX_CODEC_INVALID_PARAM;
+ else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION)
+ res = VPX_CODEC_ABI_MISMATCH;
+ else if (!(iface->caps & VPX_CODEC_CAP_ENCODER))
+ res = VPX_CODEC_INCAPABLE;
+ else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA))
+ res = VPX_CODEC_INCAPABLE;
+ else if ((flags & VPX_CODEC_USE_PSNR)
+ && !(iface->caps & VPX_CODEC_CAP_PSNR))
+ res = VPX_CODEC_INCAPABLE;
+ else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION)
+ && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION))
+ res = VPX_CODEC_INCAPABLE;
+ else
+ {
+ int i;
+ void *mem_loc = NULL;
+
+ if(!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc)))
+ {
+ for (i = 0; i < num_enc; i++)
+ {
+ vpx_codec_priv_enc_mr_cfg_t mr_cfg;
+
+ /* Validate down-sampling factor. */
+ if(dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
+ dsf->den > dsf->num)
+ {
+ res = VPX_CODEC_INVALID_PARAM;
+ break;
+ }
+
+ mr_cfg.mr_low_res_mode_info = mem_loc;
+ mr_cfg.mr_total_resolutions = num_enc;
+ mr_cfg.mr_encoder_id = num_enc-1-i;
+ mr_cfg.mr_down_sampling_factor.num = dsf->num;
+ mr_cfg.mr_down_sampling_factor.den = dsf->den;
+
+ ctx->iface = iface;
+ ctx->name = iface->name;
+ ctx->priv = NULL;
+ ctx->init_flags = flags;
+ ctx->config.enc = cfg;
+ res = ctx->iface->init(ctx, &mr_cfg);
+
+ if (res)
+ {
+ ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+ vpx_codec_destroy(ctx);
+ }
+
+ if (ctx->priv)
+ ctx->priv->iface = ctx->iface;
+
+ if (res)
+ break;
+
+ ctx++;
+ cfg++;
+ dsf++;
+ }
+ }
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
@@ -123,7 +202,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx,
vpx_enc_frame_flags_t flags,
unsigned long deadline)
{
- vpx_codec_err_t res;
+ vpx_codec_err_t res = 0;
if (!ctx || (img && !duration))
res = VPX_CODEC_INVALID_PARAM;
@@ -136,9 +215,37 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx,
/* Execute in a normalized floating point environment, if the platform
* requires it.
*/
+ unsigned int num_enc = ctx->priv->enc.total_encoders;
+
FLOATING_POINT_INIT();
- res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
- duration, flags, deadline);
+
+ if (num_enc == 1)
+ res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+ duration, flags, deadline);
+ else
+ {
+ /* Multi-resolution encoding:
+ * Encode multi-levels in reverse order. For example,
+ * if mr_total_resolutions = 3, first encode level 2,
+ * then encode level 1, and finally encode level 0.
+ */
+ int i;
+
+ ctx += num_enc - 1;
+ if (img) img += num_enc - 1;
+
+ for (i = num_enc-1; i >= 0; i--)
+ {
+ if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+ duration, flags, deadline)))
+ break;
+
+ ctx--;
+ if (img) img--;
+ }
+ ctx++;
+ }
+
FLOATING_POINT_RESTORE();
}
diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c
index 7a4e27062..336b6e29d 100644
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c
@@ -13,10 +13,42 @@
#include <string.h>
#include "vpx/vpx_image.h"
+#define ADDRESS_STORAGE_SIZE sizeof(size_t)
+/*returns an addr aligned to the byte boundary specified by align*/
+#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
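+/* e.g. align_addr(0x1003, 16): 0x1003 + 15 = 0x1012, masked with ~0xF
+ * gives 0x1010, the next 16-byte boundary. */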
+
+/* Memalign code is copied from vpx_mem.c */
+static void *img_buf_memalign(size_t align, size_t size)
+{
+ void *addr,
+ * x = NULL;
+
+ addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE);
+
+ if (addr)
+ {
+ x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
+ /* save the actual malloc address */
+ ((size_t *)x)[-1] = (size_t)addr;
+ }
+
+ return x;
+}
+
+static void img_buf_free(void *memblk)
+{
+ if (memblk)
+ {
+ void *addr = (void *)(((size_t *)memblk)[-1]);
+ free(addr);
+ }
+}
+
static vpx_image_t *img_alloc_helper(vpx_image_t *img,
vpx_img_fmt_t fmt,
unsigned int d_w,
unsigned int d_h,
+ unsigned int buf_align,
unsigned int stride_align,
unsigned char *img_data)
{
@@ -25,6 +57,14 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img,
int align;
/* Treat align==0 like align==1 */
+ if (!buf_align)
+ buf_align = 1;
+
+ /* Validate alignment (must be power of 2) */
+ if (buf_align & (buf_align - 1))
+ goto fail;
+
+ /* Treat align==0 like align==1 */
if (!stride_align)
stride_align = 1;
@@ -119,7 +159,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img,
if (!img_data)
{
- img->img_data = malloc((fmt & VPX_IMG_FMT_PLANAR) ? h * w * bps / 8 : h * s);
+ img->img_data = img_buf_memalign(buf_align, ((fmt & VPX_IMG_FMT_PLANAR)?
+ h * s * bps / 8 : h * s));
img->img_data_owner = 1;
}
@@ -150,9 +191,9 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img,
vpx_img_fmt_t fmt,
unsigned int d_w,
unsigned int d_h,
- unsigned int stride_align)
+ unsigned int align)
{
- return img_alloc_helper(img, fmt, d_w, d_h, stride_align, NULL);
+ return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL);
}
vpx_image_t *vpx_img_wrap(vpx_image_t *img,
@@ -162,7 +203,9 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img,
unsigned int stride_align,
unsigned char *img_data)
{
- return img_alloc_helper(img, fmt, d_w, d_h, stride_align, img_data);
+ /* By setting buf_align = 1, we don't change buffer alignment in this
+ * function. */
+ return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data);
}
int vpx_img_set_rect(vpx_image_t *img,
@@ -254,7 +297,7 @@ void vpx_img_free(vpx_image_t *img)
if (img)
{
if (img->img_data && img->img_data_owner)
- free(img->img_data);
+ img_buf_free(img->img_data);
if (img->self_allocd)
free(img);
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 87ab20c75..885ca229f 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -634,7 +634,6 @@ extern "C" {
* then ts_layer_id = (0,1,0,1,0,1,0,1).
*/
unsigned int ts_layer_id[MAX_PERIODICITY];
-
} vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
@@ -675,6 +674,48 @@ extern "C" {
vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION)
+ /*!\brief Initialize multi-encoder instance
+ *
+ * Initializes multi-encoder context using the given interface.
+ * Applications should call the vpx_codec_enc_init_multi convenience macro
+ * instead of this function directly, to ensure that the ABI version number
+ * parameter is properly initialized.
+ *
+ * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags
+ * parameter), the storage pointed to by the cfg parameter must be
+ * kept readable and stable until all memory maps have been set.
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[in] cfg Configuration to use, if known. May be NULL.
+ * \param[in] num_enc Total number of encoders.
+ * \param[in] flags Bitfield of VPX_CODEC_USE_* flags
+ * \param[in] dsf Pointer to down-sampling factors.
+ * \param[in] ver ABI version number. Must be set to
+ * VPX_ENCODER_ABI_VERSION
+ * \retval #VPX_CODEC_OK
+ * The encoder algorithm initialized.
+ * \retval #VPX_CODEC_MEM_ERROR
+ * Memory allocation failed.
+ */
+ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx,
+ vpx_codec_iface_t *iface,
+ vpx_codec_enc_cfg_t *cfg,
+ int num_enc,
+ vpx_codec_flags_t flags,
+ vpx_rational_t *dsf,
+ int ver);
+
+
+ /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver()
+ *
+ * Ensures the ABI version parameter is properly set.
+ */
+#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
+ vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \
+ VPX_ENCODER_ABI_VERSION)
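+/* Illustrative usage sketch (caller-chosen names, assuming the VP8 encoder):
+ *
+ *   vpx_codec_ctx_t ctx[3];
+ *   vpx_codec_enc_cfg_t cfg[3];   // cfg[0] holds the highest resolution
+ *   vpx_rational_t dsf[3] = {{2, 1}, {2, 1}, {1, 1}};
+ *   ...
+ *   if (vpx_codec_enc_init_multi(&ctx[0], vpx_codec_vp8_cx(), &cfg[0],
+ *                                3, 0, &dsf[0]))
+ *       die("init failed");   // die() is the caller's error handler
+ */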
+
+
/*!\brief Get a default configuration
*
* Initializes a encoder configuration structure with default values. Supports
@@ -780,7 +821,6 @@ extern "C" {
vpx_enc_frame_flags_t flags,
unsigned long deadline);
-
/*!\brief Set compressed data output buffer
*
* Sets the buffer that the codec should output the compressed data
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 8e08b3642..3e424470f 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -160,7 +160,8 @@ extern "C" {
* \param[in] fmt Format for the image
* \param[in] d_w Width of the image
* \param[in] d_h Height of the image
- * \param[in] align Alignment, in bytes, of each row in the image.
+ * \param[in] align Alignment, in bytes, of the image buffer and
+ * each row in the image (stride).
*
* \return Returns a pointer to the initialized image descriptor. If the img
* parameter is non-null, the value of the img parameter will be
diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h
index 608760f8b..218bca773 100644
--- a/vpx/vpx_integer.h
+++ b/vpx/vpx_integer.h
@@ -29,16 +29,8 @@ typedef signed __int64 int64_t;
typedef unsigned __int64 uint64_t;
#endif
-#ifdef HAVE_ARMV6
-typedef unsigned int int_fast16_t;
-#else
-typedef signed short int_fast16_t;
-#endif
-typedef signed char int_fast8_t;
-typedef unsigned char uint_fast8_t;
-
#ifndef _UINTPTR_T_DEFINED
-typedef unsigned int uintptr_t;
+typedef size_t uintptr_t;
#endif
#else
diff --git a/vpxdec.c b/vpxdec.c
index 6a1a0f523..7401101f8 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -32,7 +32,7 @@
#include "nestegg/include/nestegg/nestegg.h"
#if CONFIG_OS_SUPPORT
-#if defined(_WIN32)
+#if defined(_MSC_VER)
#include <io.h>
#define snprintf _snprintf
#define isatty _isatty
diff --git a/vpxenc.c b/vpxenc.c
index abbf093a9..3637acba0 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -47,9 +47,11 @@ typedef __int64 off_t;
#define fseeko _fseeki64
#define ftello _ftelli64
#elif defined(_WIN32)
-/* MinGW defines off_t, and uses f{seek,tell}o64 */
+/* MinGW defines off_t as long
+ and uses f{seek,tell}o64/off64_t for large files */
#define fseeko fseeko64
#define ftello ftello64
+#define off_t off64_t
#endif
#if defined(_MSC_VER)
@@ -805,7 +807,7 @@ write_webm_file_footer(EbmlGlobal *glob, long hash)
{
EbmlLoc start;
- int i;
+ unsigned int i;
glob->cue_pos = ftello(glob->stream);
Ebml_StartSubElement(glob, &start, Cues);
@@ -1440,7 +1442,8 @@ static void show_rate_histogram(struct rate_hist *hist,
show_histogram(hist->bucket, buckets, hist->total, scale);
}
-#define ARG_CTRL_CNT_MAX 10
+#define NELEMENTS(x) (sizeof(x)/sizeof(x[0]))
+#define ARG_CTRL_CNT_MAX NELEMENTS(vp8_arg_ctrl_map)
int main(int argc, const char **argv_)
{
@@ -1719,14 +1722,26 @@ int main(int argc, const char **argv_)
{
if (arg_match(&arg, ctrl_args[i], argi))
{
+ int j;
match = 1;
- if (arg_ctrl_cnt < ARG_CTRL_CNT_MAX)
+ /* Point either to the next free element or the first
+ * instance of this control.
+ */
+ for(j=0; j<arg_ctrl_cnt; j++)
+ if(arg_ctrls[j][0] == ctrl_args_map[i])
+ break;
+
+ /* Update/insert */
+ assert(j < ARG_CTRL_CNT_MAX);
+ if (j < ARG_CTRL_CNT_MAX)
{
- arg_ctrls[arg_ctrl_cnt][0] = ctrl_args_map[i];
- arg_ctrls[arg_ctrl_cnt][1] = arg_parse_enum_or_int(&arg);
- arg_ctrl_cnt++;
+ arg_ctrls[j][0] = ctrl_args_map[i];
+ arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
+ if(j == arg_ctrl_cnt)
+ arg_ctrl_cnt++;
}
+
}
}