-rw-r--r--  args.c | 4
-rwxr-xr-x  configure | 40
-rw-r--r--  examples.mk | 14
-rw-r--r--  test/dct16x16_test.cc | 1
-rw-r--r--  test/fdct8x8_test.cc | 1
-rw-r--r--  test/test_vectors.cc | 3
-rw-r--r--  test/variance_test.cc | 1011
-rw-r--r--  test/vp9_error_block_test.cc | 1
-rw-r--r--  test/vp9_quantize_test.cc | 2
-rw-r--r--  tools_common.h | 2
-rw-r--r--  vp8/common/arm/armv6/vp8_variance16x16_armv6.asm | 154
-rw-r--r--  vp8/common/arm/armv6/vp8_variance8x8_armv6.asm | 101
-rw-r--r--  vp8/common/arm/neon/variance_neon.c | 320
-rw-r--r--  vp8/common/arm/variance_arm.c | 19
-rw-r--r--  vp8/common/mfqe.c | 20
-rw-r--r--  vp8/common/rtcd_defs.pl | 39
-rw-r--r--  vp8/common/variance.h | 34
-rw-r--r--  vp8/common/variance_c.c | 147
-rw-r--r--  vp8/common/x86/variance_impl_mmx.asm | 498
-rw-r--r--  vp8/common/x86/variance_impl_sse2.asm | 387
-rw-r--r--  vp8/common/x86/variance_mmx.c | 140
-rw-r--r--  vp8/common/x86/variance_sse2.c | 141
-rw-r--r--  vp8/common/x86/variance_ssse3.c | 9
-rw-r--r--  vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm | 138
-rw-r--r--  vp8/encoder/arm/neon/vp8_mse16x16_neon.c | 131
-rw-r--r--  vp8/encoder/encodeframe.c | 3
-rw-r--r--  vp8/encoder/encodeintra.c | 3
-rw-r--r--  vp8/encoder/firstpass.c | 11
-rw-r--r--  vp8/encoder/onyx_if.c | 21
-rw-r--r--  vp8/encoder/pickinter.c | 36
-rw-r--r--  vp8/encoder/picklpf.c | 3
-rw-r--r--  vp8/encoder/ratectrl.c | 4
-rw-r--r--  vp8/encoder/rdopt.c | 7
-rw-r--r--  vp8/vp8_common.mk | 3
-rw-r--r--  vp8/vp8cx_arm.mk | 2
-rw-r--r--  vp9/common/arm/neon/vp9_reconintra_neon.c | 766
-rw-r--r--  vp9/common/vp9_alloccommon.c | 42
-rw-r--r--  vp9/common/vp9_blockd.h | 73
-rw-r--r--  vp9/common/vp9_entropy.h | 33
-rw-r--r--  vp9/common/vp9_entropymode.h | 15
-rw-r--r--  vp9/common/vp9_enums.h | 38
-rw-r--r--  vp9/common/vp9_mfqe.c | 6
-rw-r--r--  vp9/common/vp9_mvref_common.c | 2
-rw-r--r--  vp9/common/vp9_onyxc_int.h | 12
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl | 231
-rw-r--r--  vp9/common/vp9_scan.h | 12
-rw-r--r--  vp9/common/vp9_systemdependent.h | 9
-rw-r--r--  vp9/common/x86/convolve.h | 296
-rw-r--r--  vp9/common/x86/vp9_asm_stubs.c | 416
-rw-r--r--  vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c | 122
-rw-r--r--  vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c | 201
-rw-r--r--  vp9/decoder/vp9_decodeframe.c | 3
-rw-r--r--  vp9/decoder/vp9_decoder.c | 7
-rw-r--r--  vp9/decoder/vp9_detokenize.c | 1
-rw-r--r--  vp9/encoder/arm/neon/vp9_variance_neon.c | 155
-rw-r--r--  vp9/encoder/vp9_aq_variance.c | 6
-rw-r--r--  vp9/encoder/vp9_encodeframe.c | 97
-rw-r--r--  vp9/encoder/vp9_encodeframe.h | 2
-rw-r--r--  vp9/encoder/vp9_encodemb.c | 1
-rw-r--r--  vp9/encoder/vp9_encoder.c | 180
-rw-r--r--  vp9/encoder/vp9_firstpass.c | 80
-rw-r--r--  vp9/encoder/vp9_mcomp.c | 49
-rw-r--r--  vp9/encoder/vp9_mcomp.h | 3
-rw-r--r--  vp9/encoder/vp9_pickmode.c | 6
-rw-r--r--  vp9/encoder/vp9_ratectrl.c | 8
-rw-r--r--  vp9/encoder/vp9_rdopt.c | 1
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.c | 23
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.h | 3
-rw-r--r--  vp9/encoder/vp9_tokenize.c | 1
-rw-r--r--  vp9/encoder/vp9_variance.c | 308
-rw-r--r--  vp9/encoder/vp9_variance.h | 78
-rw-r--r--  vp9/encoder/x86/vp9_highbd_variance_sse2.c | 231
-rw-r--r--  vp9/encoder/x86/vp9_variance_avx2.c | 87
-rw-r--r--  vp9/encoder/x86/vp9_variance_sse2.c | 293
-rw-r--r--  vp9/vp9_common.mk | 1
-rw-r--r--  vp9/vp9cx.mk | 2
-rw-r--r--  vpx_dsp/arm/variance_media.asm | 358
-rw-r--r--  vpx_dsp/arm/variance_neon.c | 417
-rw-r--r--  vpx_dsp/sad.c | 1
-rw-r--r--  vpx_dsp/variance.c | 306
-rw-r--r--  vpx_dsp/vpx_dsp.mk | 20
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl | 208
-rw-r--r--  vpx_dsp/x86/highbd_variance_impl_sse2.asm (renamed from vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm) | 12
-rw-r--r--  vpx_dsp/x86/highbd_variance_sse2.c | 245
-rw-r--r--  vpx_dsp/x86/variance_avx2.c | 93
-rw-r--r--  vpx_dsp/x86/variance_impl_avx2.c (renamed from vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c) | 6
-rw-r--r--  vpx_dsp/x86/variance_impl_mmx.asm | 424
-rw-r--r--  vpx_dsp/x86/variance_mmx.c | 107
-rw-r--r--  vpx_dsp/x86/variance_sse2.c | 309
-rw-r--r--  vpx_ports/msvc.h | 22
-rw-r--r--  vpx_ports/vpx_ports.mk | 1
91 files changed, 4480 insertions(+), 5399 deletions(-)
diff --git a/args.c b/args.c
index 9dabc9bdd..14b031040 100644
--- a/args.c
+++ b/args.c
@@ -14,9 +14,7 @@
#include <limits.h>
#include "args.h"
-#ifdef _MSC_VER
-#define snprintf _snprintf
-#endif
+#include "vpx_ports/msvc.h"
#if defined(__GNUC__) && __GNUC__
extern void die(const char *fmt, ...) __attribute__((noreturn));
diff --git a/configure b/configure
index 310c42a50..d3d6e6732 100755
--- a/configure
+++ b/configure
@@ -184,6 +184,10 @@ if [ ${doxy_major:-0} -ge 1 ]; then
[ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
fi
+# disable codecs when their source directory does not exist
+[ -d "${source_path}/vp8" ] || disable_feature vp8
+[ -d "${source_path}/vp9" ] || disable_feature vp9
+
# install everything except the sources, by default. sources will have
# to be enabled when doing dist builds, since that's no longer a common
# case.
@@ -199,31 +203,16 @@ enable_feature multithread
enable_feature os_support
enable_feature temporal_denoising
-[ -d "${source_path}/../include" ] && enable_feature alt_tree_layout
-for d in vp8 vp9; do
- [ -d "${source_path}/${d}" ] && disable_feature alt_tree_layout;
-done
-
-if ! enabled alt_tree_layout; then
-# development environment
-[ -d "${source_path}/vp8" ] && CODECS="${CODECS} vp8_encoder vp8_decoder"
-[ -d "${source_path}/vp9" ] && CODECS="${CODECS} vp9_encoder vp9_decoder"
-else
-# customer environment
-[ -f "${source_path}/../include/vpx/vp8cx.h" ] && CODECS="${CODECS} vp8_encoder"
-[ -f "${source_path}/../include/vpx/vp8dx.h" ] && CODECS="${CODECS} vp8_decoder"
-[ -f "${source_path}/../include/vpx/vp9cx.h" ] && CODECS="${CODECS} vp9_encoder"
-[ -f "${source_path}/../include/vpx/vp9dx.h" ] && CODECS="${CODECS} vp9_decoder"
-[ -f "${source_path}/../include/vpx/vp8cx.h" ] || disable_feature vp8_encoder
-[ -f "${source_path}/../include/vpx/vp8dx.h" ] || disable_feature vp8_decoder
-[ -f "${source_path}/../include/vpx/vp9cx.h" ] || disable_feature vp9_encoder
-[ -f "${source_path}/../include/vpx/vp9dx.h" ] || disable_feature vp9_decoder
-
-[ -f "${source_path}/../lib/*/*mt.lib" ] && soft_enable static_msvcrt
-fi
-
-CODECS="$(echo ${CODECS} | tr ' ' '\n')"
-CODEC_FAMILIES="$(for c in ${CODECS}; do echo ${c%_*}; done | sort | uniq)"
+CODECS="
+ vp8_encoder
+ vp8_decoder
+ vp9_encoder
+ vp9_decoder
+"
+CODEC_FAMILIES="
+ vp8
+ vp9
+"
ARCH_LIST="
arm
@@ -255,7 +244,6 @@ HAVE_LIST="
${ARCH_EXT_LIST}
vpx_ports
stdint_h
- alt_tree_layout
pthread_h
sys_mman_h
unistd_h
diff --git a/examples.mk b/examples.mk
index b92507a6f..174c71d10 100644
--- a/examples.mk
+++ b/examples.mk
@@ -56,6 +56,7 @@ UTILS-$(CONFIG_DECODERS) += vpxdec.c
vpxdec.SRCS += md5_utils.c md5_utils.h
vpxdec.SRCS += vpx_ports/mem_ops.h
vpxdec.SRCS += vpx_ports/mem_ops_aligned.h
+vpxdec.SRCS += vpx_ports/msvc.h
vpxdec.SRCS += vpx_ports/vpx_timer.h
vpxdec.SRCS += vpx/vpx_integer.h
vpxdec.SRCS += args.c args.h
@@ -80,6 +81,7 @@ vpxenc.SRCS += tools_common.c tools_common.h
vpxenc.SRCS += warnings.c warnings.h
vpxenc.SRCS += vpx_ports/mem_ops.h
vpxenc.SRCS += vpx_ports/mem_ops_aligned.h
+vpxenc.SRCS += vpx_ports/msvc.h
vpxenc.SRCS += vpx_ports/vpx_timer.h
vpxenc.SRCS += vpxstats.c vpxstats.h
ifeq ($(CONFIG_LIBYUV),yes)
@@ -98,6 +100,7 @@ ifeq ($(CONFIG_SPATIAL_SVC),yes)
vp9_spatial_svc_encoder.SRCS += tools_common.c tools_common.h
vp9_spatial_svc_encoder.SRCS += video_common.h
vp9_spatial_svc_encoder.SRCS += video_writer.h video_writer.c
+ vp9_spatial_svc_encoder.SRCS += vpx_ports/msvc.h
vp9_spatial_svc_encoder.SRCS += vpxstats.c vpxstats.h
vp9_spatial_svc_encoder.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D
vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
@@ -112,6 +115,7 @@ vpx_temporal_svc_encoder.SRCS += ivfenc.c ivfenc.h
vpx_temporal_svc_encoder.SRCS += tools_common.c tools_common.h
vpx_temporal_svc_encoder.SRCS += video_common.h
vpx_temporal_svc_encoder.SRCS += video_writer.h video_writer.c
+vpx_temporal_svc_encoder.SRCS += vpx_ports/msvc.h
vpx_temporal_svc_encoder.GUID = B18C08F2-A439-4502-A78E-849BE3D60947
vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
EXAMPLES-$(CONFIG_DECODERS) += simple_decoder.c
@@ -122,6 +126,7 @@ simple_decoder.SRCS += video_common.h
simple_decoder.SRCS += video_reader.h video_reader.c
simple_decoder.SRCS += vpx_ports/mem_ops.h
simple_decoder.SRCS += vpx_ports/mem_ops_aligned.h
+simple_decoder.SRCS += vpx_ports/msvc.h
simple_decoder.DESCRIPTION = Simplified decoder loop
EXAMPLES-$(CONFIG_DECODERS) += postproc.c
postproc.SRCS += ivfdec.h ivfdec.c
@@ -130,6 +135,7 @@ postproc.SRCS += video_common.h
postproc.SRCS += video_reader.h video_reader.c
postproc.SRCS += vpx_ports/mem_ops.h
postproc.SRCS += vpx_ports/mem_ops_aligned.h
+postproc.SRCS += vpx_ports/msvc.h
postproc.GUID = 65E33355-F35E-4088-884D-3FD4905881D7
postproc.DESCRIPTION = Decoder postprocessor control
EXAMPLES-$(CONFIG_DECODERS) += decode_to_md5.c
@@ -140,6 +146,7 @@ decode_to_md5.SRCS += video_common.h
decode_to_md5.SRCS += video_reader.h video_reader.c
decode_to_md5.SRCS += vpx_ports/mem_ops.h
decode_to_md5.SRCS += vpx_ports/mem_ops_aligned.h
+decode_to_md5.SRCS += vpx_ports/msvc.h
decode_to_md5.GUID = 59120B9B-2735-4BFE-B022-146CA340FE42
decode_to_md5.DESCRIPTION = Frame by frame MD5 checksum
EXAMPLES-$(CONFIG_ENCODERS) += simple_encoder.c
@@ -147,6 +154,7 @@ simple_encoder.SRCS += ivfenc.h ivfenc.c
simple_encoder.SRCS += tools_common.h tools_common.c
simple_encoder.SRCS += video_common.h
simple_encoder.SRCS += video_writer.h video_writer.c
+simple_encoder.SRCS += vpx_ports/msvc.h
simple_encoder.GUID = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
simple_encoder.DESCRIPTION = Simplified encoder loop
EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9_lossless_encoder.c
@@ -154,6 +162,7 @@ vp9_lossless_encoder.SRCS += ivfenc.h ivfenc.c
vp9_lossless_encoder.SRCS += tools_common.h tools_common.c
vp9_lossless_encoder.SRCS += video_common.h
vp9_lossless_encoder.SRCS += video_writer.h video_writer.c
+vp9_lossless_encoder.SRCS += vpx_ports/msvc.h
vp9_lossless_encoder.GUID = B63C7C88-5348-46DC-A5A6-CC151EF93366
vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder
EXAMPLES-$(CONFIG_ENCODERS) += twopass_encoder.c
@@ -161,6 +170,7 @@ twopass_encoder.SRCS += ivfenc.h ivfenc.c
twopass_encoder.SRCS += tools_common.h tools_common.c
twopass_encoder.SRCS += video_common.h
twopass_encoder.SRCS += video_writer.h video_writer.c
+twopass_encoder.SRCS += vpx_ports/msvc.h
twopass_encoder.GUID = 73494FA6-4AF9-4763-8FBB-265C92402FD8
twopass_encoder.DESCRIPTION = Two-pass encoder loop
EXAMPLES-$(CONFIG_DECODERS) += decode_with_drops.c
@@ -170,6 +180,7 @@ decode_with_drops.SRCS += video_common.h
decode_with_drops.SRCS += video_reader.h video_reader.c
decode_with_drops.SRCS += vpx_ports/mem_ops.h
decode_with_drops.SRCS += vpx_ports/mem_ops_aligned.h
+decode_with_drops.SRCS += vpx_ports/msvc.h
decode_with_drops.GUID = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
decode_with_drops.DESCRIPTION = Drops frames while decoding
EXAMPLES-$(CONFIG_ENCODERS) += set_maps.c
@@ -177,6 +188,7 @@ set_maps.SRCS += ivfenc.h ivfenc.c
set_maps.SRCS += tools_common.h tools_common.c
set_maps.SRCS += video_common.h
set_maps.SRCS += video_writer.h video_writer.c
+set_maps.SRCS += vpx_ports/msvc.h
set_maps.GUID = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
set_maps.DESCRIPTION = Set active and ROI maps
EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c
@@ -184,6 +196,7 @@ vp8cx_set_ref.SRCS += ivfenc.h ivfenc.c
vp8cx_set_ref.SRCS += tools_common.h tools_common.c
vp8cx_set_ref.SRCS += video_common.h
vp8cx_set_ref.SRCS += video_writer.h video_writer.c
+vp8cx_set_ref.SRCS += vpx_ports/msvc.h
vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame
@@ -194,6 +207,7 @@ EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_multi_resolution_encoder.c
vp8_multi_resolution_encoder.SRCS += ivfenc.h ivfenc.c
vp8_multi_resolution_encoder.SRCS += tools_common.h tools_common.c
vp8_multi_resolution_encoder.SRCS += video_writer.h video_writer.c
+vp8_multi_resolution_encoder.SRCS += vpx_ports/msvc.h
vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS)
vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de
vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 48a8006af..b37d8e353 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -20,6 +20,7 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 5c0b09bb3..4c12bb49b 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -20,6 +20,7 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 07d306ff4..434a38251 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -165,7 +165,10 @@ const char *const kVP9TestVectors[] = {
"vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
"vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
"vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
+#if !CONFIG_SIZE_LIMIT || \
+ (DECODE_WIDTH_LIMIT >= 20400 && DECODE_HEIGHT_LIMIT >= 120)
"vp90-2-13-largescaling.webm",
+#endif
"vp90-2-14-resize-fp-tiles-1-16.webm",
"vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
"vp90-2-14-resize-fp-tiles-1-2.webm", "vp90-2-14-resize-fp-tiles-1-4.webm",
diff --git a/test/variance_test.cc b/test/variance_test.cc
index e4e27af7c..e45d90fae 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -21,22 +21,45 @@
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
-#if CONFIG_VP8_ENCODER
-# include "./vp8_rtcd.h"
-# include "vp8/common/variance.h"
-#endif
#if CONFIG_VP9_ENCODER
# include "./vp9_rtcd.h"
# include "vp9/encoder/vp9_variance.h"
-#endif
+#endif // CONFIG_VP9_ENCODER
+#include "./vpx_dsp_rtcd.h"
namespace {
+typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride);
+
+
using ::std::tr1::get;
using ::std::tr1::make_tuple;
using ::std::tr1::tuple;
using libvpx_test::ACMRandom;
+// Truncate high bit depth results by downshifting (with rounding) by:
+// 2 * (bit_depth - 8) for sse
+// (bit_depth - 8) for se
+static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) {
+ switch (bit_depth) {
+ case VPX_BITS_12:
+ *sse = (*sse + 128) >> 8;
+ *se = (*se + 8) >> 4;
+ break;
+ case VPX_BITS_10:
+ *sse = (*sse + 8) >> 4;
+ *se = (*se + 2) >> 2;
+ break;
+ case VPX_BITS_8:
+ default:
+ break;
+ }
+}
+
static unsigned int mb_ss_ref(const int16_t *src) {
unsigned int res = 0;
for (int i = 0; i < 256; ++i) {
@@ -50,7 +73,6 @@ static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref,
int ref_stride_coeff, uint32_t *sse_ptr,
bool use_high_bit_depth_,
vpx_bit_depth_t bit_depth) {
-#if CONFIG_VP9_HIGHBITDEPTH
int64_t se = 0;
uint64_t sse = 0;
const int w = 1 << l2w;
@@ -63,32 +85,17 @@ static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref,
src[w * y * src_stride_coeff + x];
se += diff;
sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
se += diff;
sse += diff * diff;
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
}
- if (bit_depth > VPX_BITS_8) {
- sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
- se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
- }
-#else
- int se = 0;
- unsigned int sse = 0;
- const int w = 1 << l2w;
- const int h = 1 << l2h;
- for (int y = 0; y < h; y++) {
- for (int x = 0; x < w; x++) {
- int diff = ref[w * y * ref_stride_coeff + x] -
- src[w * y * src_stride_coeff + x];
- se += diff;
- sse += diff * diff;
- }
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
+ RoundHighBitDepth(bit_depth, &se, &sse);
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
@@ -98,7 +105,6 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
unsigned int *sse_ptr,
bool use_high_bit_depth_,
vpx_bit_depth_t bit_depth) {
-#if CONFIG_VP9_HIGHBITDEPTH
int64_t se = 0;
uint64_t sse = 0;
const int w = 1 << l2w;
@@ -117,6 +123,7 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
const int diff = r - src[w * y + x];
se += diff;
sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
@@ -130,34 +137,11 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
const int diff = r - src16[w * y + x];
se += diff;
sse += diff * diff;
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
}
- if (bit_depth > VPX_BITS_8) {
- sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
- se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
- }
-#else
- int se = 0;
- unsigned int sse = 0;
- const int w = 1 << l2w;
- const int h = 1 << l2h;
- for (int y = 0; y < h; y++) {
- for (int x = 0; x < w; x++) {
- // Bilinear interpolation at a 16th pel step.
- const int a1 = ref[(w + 1) * (y + 0) + x + 0];
- const int a2 = ref[(w + 1) * (y + 0) + x + 1];
- const int b1 = ref[(w + 1) * (y + 1) + x + 0];
- const int b2 = ref[(w + 1) * (y + 1) + x + 1];
- const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
- const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
- const int r = a + (((b - a) * yoff + 8) >> 4);
- const int diff = r - src[w * y + x];
- se += diff;
- sse += diff * diff;
- }
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
+ RoundHighBitDepth(bit_depth, &se, &sse);
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
@@ -229,36 +213,30 @@ class VarianceTest
rnd_.Reset(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
ref_ = new uint8_t[block_size_ * 2];
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t))));
ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
-#else
- src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
- ref_ = new uint8_t[block_size_ * 2];
-#endif
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(ref_ != NULL);
}
virtual void TearDown() {
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
vpx_free(src_);
delete[] ref_;
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
vpx_free(CONVERT_TO_SHORTPTR(src_));
delete[] CONVERT_TO_SHORTPTR(ref_);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
-#else
- vpx_free(src_);
- delete[] ref_;
-#endif
libvpx_test::ClearSystemState();
}
@@ -283,27 +261,23 @@ class VarianceTest
template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::ZeroTest() {
for (int i = 0; i <= 255; ++i) {
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
memset(src_, i, block_size_);
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8),
block_size_);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
-#else
- memset(src_, i, block_size_);
-#endif
for (int j = 0; j <= 255; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
memset(ref_, j, block_size_);
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j << (bit_depth_ - 8),
block_size_);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
-#else
- memset(ref_, j, block_size_);
-#endif
unsigned int sse;
unsigned int var;
ASM_REGISTER_STATE_CHECK(
@@ -317,18 +291,15 @@ template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::RefTest() {
for (int i = 0; i < 10; ++i) {
for (int j = 0; j < block_size_; j++) {
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
src_[j] = rnd_.Rand8();
ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() && mask_;
CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() && mask_;
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
-#else
- src_[j] = rnd_.Rand8();
- ref_[j] = rnd_.Rand8();
-#endif
}
unsigned int sse1, sse2;
unsigned int var1;
@@ -352,18 +323,15 @@ void VarianceTest<VarianceFunctionType>::RefStrideTest() {
for (int j = 0; j < block_size_; j++) {
int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_;
int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_;
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
src_[src_ind] = rnd_.Rand8();
ref_[ref_ind] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() && mask_;
CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() && mask_;
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
-#else
- src_[src_ind] = rnd_.Rand8();
- ref_[ref_ind] = rnd_.Rand8();
-#endif
}
unsigned int sse1, sse2;
unsigned int var1;
@@ -383,22 +351,18 @@ void VarianceTest<VarianceFunctionType>::RefStrideTest() {
template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
const int half = block_size_ / 2;
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
memset(src_, 255, block_size_);
memset(ref_, 255, half);
memset(ref_ + half, 0, half);
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8),
block_size_);
vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half);
vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
-#else
- memset(src_, 255, block_size_);
- memset(ref_, 255, half);
- memset(ref_ + half, 0, half);
-#endif
unsigned int sse;
unsigned int var;
ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
@@ -406,7 +370,6 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
EXPECT_EQ(expected, var);
}
-#if CONFIG_VP8_ENCODER
template<typename MseFunctionType>
class MseTest
: public ::testing::TestWithParam<tuple<int, int, MseFunctionType> > {
@@ -500,9 +463,7 @@ void MseTest<MseFunctionType>::MaxTest_sse() {
const unsigned int expected = block_size_ * 255 * 255;
EXPECT_EQ(expected, var);
}
-#endif
-#if CONFIG_VP9_ENCODER
unsigned int subpel_avg_variance_ref(const uint8_t *ref,
const uint8_t *src,
const uint8_t *second_pred,
@@ -511,7 +472,6 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref,
unsigned int *sse_ptr,
bool use_high_bit_depth,
vpx_bit_depth_t bit_depth) {
-#if CONFIG_VP9_HIGHBITDEPTH
int64_t se = 0;
uint64_t sse = 0;
const int w = 1 << l2w;
@@ -530,6 +490,7 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref,
const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
se += diff;
sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
@@ -544,34 +505,11 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref,
const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
se += diff;
sse += diff * diff;
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
}
- if (bit_depth > 8) {
- sse = ROUND_POWER_OF_TWO(sse, 2*(bit_depth-8));
- se = ROUND_POWER_OF_TWO(se, bit_depth-8);
- }
-#else
- int se = 0;
- unsigned int sse = 0;
- const int w = 1 << l2w;
- const int h = 1 << l2h;
- for (int y = 0; y < h; y++) {
- for (int x = 0; x < w; x++) {
- // bilinear interpolation at a 16th pel step
- const int a1 = ref[(w + 1) * (y + 0) + x + 0];
- const int a2 = ref[(w + 1) * (y + 0) + x + 1];
- const int b1 = ref[(w + 1) * (y + 1) + x + 0];
- const int b2 = ref[(w + 1) * (y + 1) + x + 1];
- const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
- const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
- const int r = a + (((b - a) * yoff + 8) >> 4);
- const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
- se += diff;
- sse += diff * diff;
- }
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
+ RoundHighBitDepth(bit_depth, &se, &sse);
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
@@ -600,11 +538,11 @@ class SubpelVarianceTest
rnd_.Reset(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
src_ = CONVERT_TO_BYTEPTR(
reinterpret_cast<uint16_t *>(
@@ -614,33 +552,25 @@ class SubpelVarianceTest
vpx_memalign(16, block_size_*sizeof(uint16_t))));
ref_ = CONVERT_TO_BYTEPTR(
new uint16_t[block_size_ + width_ + height_ + 1]);
- }
-#else
- src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
- sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
- ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(sec_ != NULL);
ASSERT_TRUE(ref_ != NULL);
}
virtual void TearDown() {
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
vpx_free(src_);
delete[] ref_;
vpx_free(sec_);
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
vpx_free(CONVERT_TO_SHORTPTR(src_));
delete[] CONVERT_TO_SHORTPTR(ref_);
vpx_free(CONVERT_TO_SHORTPTR(sec_));
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
-#else
- vpx_free(src_);
- delete[] ref_;
- vpx_free(sec_);
-#endif
libvpx_test::ClearSystemState();
}
@@ -664,7 +594,6 @@ template<typename SubpelVarianceFunctionType>
void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd_.Rand8();
@@ -672,6 +601,7 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
ref_[j] = rnd_.Rand8();
}
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
for (int j = 0; j < block_size_; j++) {
CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
@@ -679,15 +609,8 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
}
- }
-#else
- for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd_.Rand8();
- }
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- ref_[j] = rnd_.Rand8();
- }
#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
unsigned int sse1, sse2;
unsigned int var1;
ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
@@ -710,25 +633,20 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
const int half = block_size_ / 2;
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
memset(src_, 0, half);
memset(src_ + half, 255, half);
memset(ref_, 255, half);
memset(ref_ + half, 0, half + width_ + height_ + 1);
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
half + width_ + height_ + 1);
- }
-#else
- memset(src_, 0, half);
- memset(src_ + half, 255, half);
- memset(ref_, 255, half);
- memset(ref_ + half, 0, half + width_ + height_ + 1);
#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
unsigned int sse1, sse2;
unsigned int var1;
ASM_REGISTER_STATE_CHECK(
@@ -742,11 +660,11 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
}
}
+#if CONFIG_VP9_ENCODER
template<>
void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
-#if CONFIG_VP9_HIGHBITDEPTH
if (!use_high_bit_depth_) {
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd_.Rand8();
@@ -755,6 +673,7 @@ void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
ref_[j] = rnd_.Rand8();
}
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
for (int j = 0; j < block_size_; j++) {
CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
@@ -763,16 +682,8 @@ void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
-#else
- for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd_.Rand8();
- sec_[j] = rnd_.Rand8();
- }
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- ref_[j] = rnd_.Rand8();
- }
-#endif
unsigned int sse1, sse2;
unsigned int var1;
ASM_REGISTER_STATE_CHECK(
@@ -788,272 +699,407 @@ void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
}
}
}
-
#endif // CONFIG_VP9_ENCODER
-// -----------------------------------------------------------------------------
-// VP8 test cases.
-
-namespace vp8 {
-
-#if CONFIG_VP8_ENCODER
-typedef unsigned int (*vp8_sse_fn_t)(const unsigned char *src_ptr,
- int source_stride, const unsigned char *ref_ptr, int ref_stride);
-
-typedef MseTest<vp8_sse_fn_t> VP8SseTest;
-typedef MseTest<vp8_variance_fn_t> VP8MseTest;
-typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest;
-
-TEST_P(VP8SseTest, Ref_sse) { RefTest_sse(); }
-TEST_P(VP8SseTest, Max_sse) { MaxTest_sse(); }
-TEST_P(VP8MseTest, Ref_mse) { RefTest_mse(); }
-TEST_P(VP8MseTest, Max_mse) { MaxTest_mse(); }
-TEST_P(VP8VarianceTest, Zero) { ZeroTest(); }
-TEST_P(VP8VarianceTest, Ref) { RefTest(); }
-TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); }
+typedef MseTest<Get4x4SseFunc> VpxSseTest;
+typedef MseTest<VarianceMxNFunc> VpxMseTest;
+typedef VarianceTest<VarianceMxNFunc> VpxVarianceTest;
+
+TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); }
+TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); }
+TEST_P(VpxMseTest, Ref_mse) { RefTest_mse(); }
+TEST_P(VpxMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
+TEST_P(VpxVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
-const vp8_sse_fn_t get4x4sse_cs_c = vp8_get4x4sse_cs_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP8SseTest,
- ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c)));
+INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
+ ::testing::Values(vpx_get_mb_ss_c));
+
+const Get4x4SseFunc get4x4sse_cs_c = vpx_get4x4sse_cs_c;
+INSTANTIATE_TEST_CASE_P(C, VpxSseTest,
+ ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c)));
+
+const VarianceMxNFunc mse16x16_c = vpx_mse16x16_c;
+const VarianceMxNFunc mse16x8_c = vpx_mse16x8_c;
+const VarianceMxNFunc mse8x16_c = vpx_mse8x16_c;
+const VarianceMxNFunc mse8x8_c = vpx_mse8x8_c;
+INSTANTIATE_TEST_CASE_P(C, VpxMseTest,
+ ::testing::Values(make_tuple(4, 4, mse16x16_c),
+ make_tuple(4, 3, mse16x8_c),
+ make_tuple(3, 4, mse8x16_c),
+ make_tuple(3, 3, mse8x8_c)));
+
+const VarianceMxNFunc variance64x64_c = vpx_variance64x64_c;
+const VarianceMxNFunc variance64x32_c = vpx_variance64x32_c;
+const VarianceMxNFunc variance32x64_c = vpx_variance32x64_c;
+const VarianceMxNFunc variance32x32_c = vpx_variance32x32_c;
+const VarianceMxNFunc variance32x16_c = vpx_variance32x16_c;
+const VarianceMxNFunc variance16x32_c = vpx_variance16x32_c;
+const VarianceMxNFunc variance16x16_c = vpx_variance16x16_c;
+const VarianceMxNFunc variance16x8_c = vpx_variance16x8_c;
+const VarianceMxNFunc variance8x16_c = vpx_variance8x16_c;
+const VarianceMxNFunc variance8x8_c = vpx_variance8x8_c;
+const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c;
+const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c;
+const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c;
-const vp8_variance_fn_t mse16x16_c = vp8_mse16x16_c;
INSTANTIATE_TEST_CASE_P(
- C, VP8MseTest,
- ::testing::Values(make_tuple(4, 4, mse16x16_c)));
-
-const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c;
-const vp8_variance_fn_t variance8x8_c = vp8_variance8x8_c;
-const vp8_variance_fn_t variance8x16_c = vp8_variance8x16_c;
-const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c;
-const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP8VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
- make_tuple(3, 3, variance8x8_c, 0),
- make_tuple(3, 4, variance8x16_c, 0),
+ C, VpxVarianceTest,
+ ::testing::Values(make_tuple(6, 6, variance64x64_c, 0),
+ make_tuple(6, 5, variance64x32_c, 0),
+ make_tuple(5, 6, variance32x64_c, 0),
+ make_tuple(5, 5, variance32x32_c, 0),
+ make_tuple(5, 4, variance32x16_c, 0),
+ make_tuple(4, 5, variance16x32_c, 0),
+ make_tuple(4, 4, variance16x16_c, 0),
make_tuple(4, 3, variance16x8_c, 0),
- make_tuple(4, 4, variance16x16_c, 0)));
+ make_tuple(3, 4, variance8x16_c, 0),
+ make_tuple(3, 3, variance8x8_c, 0),
+ make_tuple(3, 2, variance8x4_c, 0),
+ make_tuple(2, 3, variance4x8_c, 0),
+ make_tuple(2, 2, variance4x4_c, 0)));
-#if HAVE_NEON
-const vp8_sse_fn_t get4x4sse_cs_neon = vp8_get4x4sse_cs_neon;
-INSTANTIATE_TEST_CASE_P(
- NEON, VP8SseTest,
- ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon)));
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;
+typedef VarianceTest<VarianceMxNFunc> VpxHBDVarianceTest;
+
+TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); }
+TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); }
+TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+
+/* TODO(debargha): This test does not support the highbd version
+const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c;
+const VarianceMxNFunc highbd_12_mse16x8_c = vpx_highbd_12_mse16x8_c;
+const VarianceMxNFunc highbd_12_mse8x16_c = vpx_highbd_12_mse8x16_c;
+const VarianceMxNFunc highbd_12_mse8x8_c = vpx_highbd_12_mse8x8_c;
+
+const VarianceMxNFunc highbd_10_mse16x16_c = vpx_highbd_10_mse16x16_c;
+const VarianceMxNFunc highbd_10_mse16x8_c = vpx_highbd_10_mse16x8_c;
+const VarianceMxNFunc highbd_10_mse8x16_c = vpx_highbd_10_mse8x16_c;
+const VarianceMxNFunc highbd_10_mse8x8_c = vpx_highbd_10_mse8x8_c;
+
+const VarianceMxNFunc highbd_8_mse16x16_c = vpx_highbd_8_mse16x16_c;
+const VarianceMxNFunc highbd_8_mse16x8_c = vpx_highbd_8_mse16x8_c;
+const VarianceMxNFunc highbd_8_mse8x16_c = vpx_highbd_8_mse8x16_c;
+const VarianceMxNFunc highbd_8_mse8x8_c = vpx_highbd_8_mse8x8_c;
-const vp8_variance_fn_t mse16x16_neon = vp8_mse16x16_neon;
INSTANTIATE_TEST_CASE_P(
- NEON, VP8MseTest,
- ::testing::Values(make_tuple(4, 4, mse16x16_neon)));
-
-const vp8_variance_fn_t variance8x8_neon = vp8_variance8x8_neon;
-const vp8_variance_fn_t variance8x16_neon = vp8_variance8x16_neon;
-const vp8_variance_fn_t variance16x8_neon = vp8_variance16x8_neon;
-const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon;
+ C, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_c),
+ make_tuple(4, 4, highbd_12_mse16x8_c),
+ make_tuple(4, 4, highbd_12_mse8x16_c),
+ make_tuple(4, 4, highbd_12_mse8x8_c),
+ make_tuple(4, 4, highbd_10_mse16x16_c),
+ make_tuple(4, 4, highbd_10_mse16x8_c),
+ make_tuple(4, 4, highbd_10_mse8x16_c),
+ make_tuple(4, 4, highbd_10_mse8x8_c),
+ make_tuple(4, 4, highbd_8_mse16x16_c),
+ make_tuple(4, 4, highbd_8_mse16x8_c),
+ make_tuple(4, 4, highbd_8_mse8x16_c),
+ make_tuple(4, 4, highbd_8_mse8x8_c)));
+*/
+
+
+const VarianceMxNFunc highbd_12_variance64x64_c = vpx_highbd_12_variance64x64_c;
+const VarianceMxNFunc highbd_12_variance64x32_c = vpx_highbd_12_variance64x32_c;
+const VarianceMxNFunc highbd_12_variance32x64_c = vpx_highbd_12_variance32x64_c;
+const VarianceMxNFunc highbd_12_variance32x32_c = vpx_highbd_12_variance32x32_c;
+const VarianceMxNFunc highbd_12_variance32x16_c = vpx_highbd_12_variance32x16_c;
+const VarianceMxNFunc highbd_12_variance16x32_c = vpx_highbd_12_variance16x32_c;
+const VarianceMxNFunc highbd_12_variance16x16_c = vpx_highbd_12_variance16x16_c;
+const VarianceMxNFunc highbd_12_variance16x8_c = vpx_highbd_12_variance16x8_c;
+const VarianceMxNFunc highbd_12_variance8x16_c = vpx_highbd_12_variance8x16_c;
+const VarianceMxNFunc highbd_12_variance8x8_c = vpx_highbd_12_variance8x8_c;
+const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c;
+const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c;
+const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c;
+
+const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c;
+const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c;
+const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c;
+const VarianceMxNFunc highbd_10_variance32x32_c = vpx_highbd_10_variance32x32_c;
+const VarianceMxNFunc highbd_10_variance32x16_c = vpx_highbd_10_variance32x16_c;
+const VarianceMxNFunc highbd_10_variance16x32_c = vpx_highbd_10_variance16x32_c;
+const VarianceMxNFunc highbd_10_variance16x16_c = vpx_highbd_10_variance16x16_c;
+const VarianceMxNFunc highbd_10_variance16x8_c = vpx_highbd_10_variance16x8_c;
+const VarianceMxNFunc highbd_10_variance8x16_c = vpx_highbd_10_variance8x16_c;
+const VarianceMxNFunc highbd_10_variance8x8_c = vpx_highbd_10_variance8x8_c;
+const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c;
+const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c;
+const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c;
+
+const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c;
+const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c;
+const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c;
+const VarianceMxNFunc highbd_8_variance32x32_c = vpx_highbd_8_variance32x32_c;
+const VarianceMxNFunc highbd_8_variance32x16_c = vpx_highbd_8_variance32x16_c;
+const VarianceMxNFunc highbd_8_variance16x32_c = vpx_highbd_8_variance16x32_c;
+const VarianceMxNFunc highbd_8_variance16x16_c = vpx_highbd_8_variance16x16_c;
+const VarianceMxNFunc highbd_8_variance16x8_c = vpx_highbd_8_variance16x8_c;
+const VarianceMxNFunc highbd_8_variance8x16_c = vpx_highbd_8_variance8x16_c;
+const VarianceMxNFunc highbd_8_variance8x8_c = vpx_highbd_8_variance8x8_c;
+const VarianceMxNFunc highbd_8_variance8x4_c = vpx_highbd_8_variance8x4_c;
+const VarianceMxNFunc highbd_8_variance4x8_c = vpx_highbd_8_variance4x8_c;
+const VarianceMxNFunc highbd_8_variance4x4_c = vpx_highbd_8_variance4x4_c;
INSTANTIATE_TEST_CASE_P(
- NEON, VP8VarianceTest,
- ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
- make_tuple(3, 4, variance8x16_neon, 0),
- make_tuple(4, 3, variance16x8_neon, 0),
- make_tuple(4, 4, variance16x16_neon, 0)));
-#endif
+ C, VpxHBDVarianceTest,
+ ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_c, 12),
+ make_tuple(6, 5, highbd_12_variance64x32_c, 12),
+ make_tuple(5, 6, highbd_12_variance32x64_c, 12),
+ make_tuple(5, 5, highbd_12_variance32x32_c, 12),
+ make_tuple(5, 4, highbd_12_variance32x16_c, 12),
+ make_tuple(4, 5, highbd_12_variance16x32_c, 12),
+ make_tuple(4, 4, highbd_12_variance16x16_c, 12),
+ make_tuple(4, 3, highbd_12_variance16x8_c, 12),
+ make_tuple(3, 4, highbd_12_variance8x16_c, 12),
+ make_tuple(3, 3, highbd_12_variance8x8_c, 12),
+ make_tuple(3, 2, highbd_12_variance8x4_c, 12),
+ make_tuple(2, 3, highbd_12_variance4x8_c, 12),
+ make_tuple(2, 2, highbd_12_variance4x4_c, 12),
+ make_tuple(6, 6, highbd_10_variance64x64_c, 10),
+ make_tuple(6, 5, highbd_10_variance64x32_c, 10),
+ make_tuple(5, 6, highbd_10_variance32x64_c, 10),
+ make_tuple(5, 5, highbd_10_variance32x32_c, 10),
+ make_tuple(5, 4, highbd_10_variance32x16_c, 10),
+ make_tuple(4, 5, highbd_10_variance16x32_c, 10),
+ make_tuple(4, 4, highbd_10_variance16x16_c, 10),
+ make_tuple(4, 3, highbd_10_variance16x8_c, 10),
+ make_tuple(3, 4, highbd_10_variance8x16_c, 10),
+ make_tuple(3, 3, highbd_10_variance8x8_c, 10),
+ make_tuple(3, 2, highbd_10_variance8x4_c, 10),
+ make_tuple(2, 3, highbd_10_variance4x8_c, 10),
+ make_tuple(2, 2, highbd_10_variance4x4_c, 10),
+ make_tuple(6, 6, highbd_8_variance64x64_c, 8),
+ make_tuple(6, 5, highbd_8_variance64x32_c, 8),
+ make_tuple(5, 6, highbd_8_variance32x64_c, 8),
+ make_tuple(5, 5, highbd_8_variance32x32_c, 8),
+ make_tuple(5, 4, highbd_8_variance32x16_c, 8),
+ make_tuple(4, 5, highbd_8_variance16x32_c, 8),
+ make_tuple(4, 4, highbd_8_variance16x16_c, 8),
+ make_tuple(4, 3, highbd_8_variance16x8_c, 8),
+ make_tuple(3, 4, highbd_8_variance8x16_c, 8),
+ make_tuple(3, 3, highbd_8_variance8x8_c, 8),
+ make_tuple(3, 2, highbd_8_variance8x4_c, 8),
+ make_tuple(2, 3, highbd_8_variance4x8_c, 8),
+ make_tuple(2, 2, highbd_8_variance4x4_c, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
#if HAVE_MMX
-const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
-const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
-const vp8_variance_fn_t variance8x16_mmx = vp8_variance8x16_mmx;
-const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx;
-const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
+const VarianceMxNFunc mse16x16_mmx = vpx_mse16x16_mmx;
+INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
+ ::testing::Values(make_tuple(4, 4, mse16x16_mmx)));
+
+INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
+ ::testing::Values(vpx_get_mb_ss_mmx));
+
+const VarianceMxNFunc variance16x16_mmx = vpx_variance16x16_mmx;
+const VarianceMxNFunc variance16x8_mmx = vpx_variance16x8_mmx;
+const VarianceMxNFunc variance8x16_mmx = vpx_variance8x16_mmx;
+const VarianceMxNFunc variance8x8_mmx = vpx_variance8x8_mmx;
+const VarianceMxNFunc variance4x4_mmx = vpx_variance4x4_mmx;
INSTANTIATE_TEST_CASE_P(
- MMX, VP8VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_mmx, 0),
- make_tuple(3, 3, variance8x8_mmx, 0),
- make_tuple(3, 4, variance8x16_mmx, 0),
+ MMX, VpxVarianceTest,
+ ::testing::Values(make_tuple(4, 4, variance16x16_mmx, 0),
make_tuple(4, 3, variance16x8_mmx, 0),
- make_tuple(4, 4, variance16x16_mmx, 0)));
-#endif
+ make_tuple(3, 4, variance8x16_mmx, 0),
+ make_tuple(3, 3, variance8x8_mmx, 0),
+ make_tuple(2, 2, variance4x4_mmx, 0)));
+#endif // HAVE_MMX
#if HAVE_SSE2
-const vp8_variance_fn_t variance4x4_wmt = vp8_variance4x4_wmt;
-const vp8_variance_fn_t variance8x8_wmt = vp8_variance8x8_wmt;
-const vp8_variance_fn_t variance8x16_wmt = vp8_variance8x16_wmt;
-const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt;
-const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
+INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
+ ::testing::Values(vpx_get_mb_ss_sse2));
+
+const VarianceMxNFunc mse16x16_sse2 = vpx_mse16x16_sse2;
+const VarianceMxNFunc mse16x8_sse2 = vpx_mse16x8_sse2;
+const VarianceMxNFunc mse8x16_sse2 = vpx_mse8x16_sse2;
+const VarianceMxNFunc mse8x8_sse2 = vpx_mse8x8_sse2;
+INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest,
+ ::testing::Values(make_tuple(4, 4, mse16x16_sse2),
+ make_tuple(4, 3, mse16x8_sse2),
+ make_tuple(3, 4, mse8x16_sse2),
+ make_tuple(3, 3, mse8x8_sse2)));
+
+const VarianceMxNFunc variance64x64_sse2 = vpx_variance64x64_sse2;
+const VarianceMxNFunc variance64x32_sse2 = vpx_variance64x32_sse2;
+const VarianceMxNFunc variance32x64_sse2 = vpx_variance32x64_sse2;
+const VarianceMxNFunc variance32x32_sse2 = vpx_variance32x32_sse2;
+const VarianceMxNFunc variance32x16_sse2 = vpx_variance32x16_sse2;
+const VarianceMxNFunc variance16x32_sse2 = vpx_variance16x32_sse2;
+const VarianceMxNFunc variance16x16_sse2 = vpx_variance16x16_sse2;
+const VarianceMxNFunc variance16x8_sse2 = vpx_variance16x8_sse2;
+const VarianceMxNFunc variance8x16_sse2 = vpx_variance8x16_sse2;
+const VarianceMxNFunc variance8x8_sse2 = vpx_variance8x8_sse2;
+const VarianceMxNFunc variance8x4_sse2 = vpx_variance8x4_sse2;
+const VarianceMxNFunc variance4x8_sse2 = vpx_variance4x8_sse2;
+const VarianceMxNFunc variance4x4_sse2 = vpx_variance4x4_sse2;
INSTANTIATE_TEST_CASE_P(
- SSE2, VP8VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_wmt, 0),
- make_tuple(3, 3, variance8x8_wmt, 0),
- make_tuple(3, 4, variance8x16_wmt, 0),
- make_tuple(4, 3, variance16x8_wmt, 0),
- make_tuple(4, 4, variance16x16_wmt, 0)));
-#endif
-#endif // CONFIG_VP8_ENCODER
-
-} // namespace vp8
+ SSE2, VpxVarianceTest,
+ ::testing::Values(make_tuple(6, 6, variance64x64_sse2, 0),
+ make_tuple(6, 5, variance64x32_sse2, 0),
+ make_tuple(5, 6, variance32x64_sse2, 0),
+ make_tuple(5, 5, variance32x32_sse2, 0),
+ make_tuple(5, 4, variance32x16_sse2, 0),
+ make_tuple(4, 5, variance16x32_sse2, 0),
+ make_tuple(4, 4, variance16x16_sse2, 0),
+ make_tuple(4, 3, variance16x8_sse2, 0),
+ make_tuple(3, 4, variance8x16_sse2, 0),
+ make_tuple(3, 3, variance8x8_sse2, 0),
+ make_tuple(3, 2, variance8x4_sse2, 0),
+ make_tuple(2, 3, variance4x8_sse2, 0),
+ make_tuple(2, 2, variance4x4_sse2, 0)));
+#if CONFIG_VP9_HIGHBITDEPTH
+/* TODO(debargha): This test does not support the highbd version
+const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2;
+const VarianceMxNFunc highbd_12_mse16x8_sse2 = vpx_highbd_12_mse16x8_sse2;
+const VarianceMxNFunc highbd_12_mse8x16_sse2 = vpx_highbd_12_mse8x16_sse2;
+const VarianceMxNFunc highbd_12_mse8x8_sse2 = vpx_highbd_12_mse8x8_sse2;
+
+const VarianceMxNFunc highbd_10_mse16x16_sse2 = vpx_highbd_10_mse16x16_sse2;
+const VarianceMxNFunc highbd_10_mse16x8_sse2 = vpx_highbd_10_mse16x8_sse2;
+const VarianceMxNFunc highbd_10_mse8x16_sse2 = vpx_highbd_10_mse8x16_sse2;
+const VarianceMxNFunc highbd_10_mse8x8_sse2 = vpx_highbd_10_mse8x8_sse2;
+
+const VarianceMxNFunc highbd_8_mse16x16_sse2 = vpx_highbd_8_mse16x16_sse2;
+const VarianceMxNFunc highbd_8_mse16x8_sse2 = vpx_highbd_8_mse16x8_sse2;
+const VarianceMxNFunc highbd_8_mse8x16_sse2 = vpx_highbd_8_mse8x16_sse2;
+const VarianceMxNFunc highbd_8_mse8x8_sse2 = vpx_highbd_8_mse8x8_sse2;
-// -----------------------------------------------------------------------------
-// VP9 test cases.
+INSTANTIATE_TEST_CASE_P(
+ SSE2, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_sse2),
+ make_tuple(4, 3, highbd_12_mse16x8_sse2),
+ make_tuple(3, 4, highbd_12_mse8x16_sse2),
+ make_tuple(3, 3, highbd_12_mse8x8_sse2),
+ make_tuple(4, 4, highbd_10_mse16x16_sse2),
+ make_tuple(4, 3, highbd_10_mse16x8_sse2),
+ make_tuple(3, 4, highbd_10_mse8x16_sse2),
+ make_tuple(3, 3, highbd_10_mse8x8_sse2),
+ make_tuple(4, 4, highbd_8_mse16x16_sse2),
+ make_tuple(4, 3, highbd_8_mse16x8_sse2),
+ make_tuple(3, 4, highbd_8_mse8x16_sse2),
+ make_tuple(3, 3, highbd_8_mse8x8_sse2)));
+*/
+
+const VarianceMxNFunc highbd_12_variance64x64_sse2 =
+ vpx_highbd_12_variance64x64_sse2;
+const VarianceMxNFunc highbd_12_variance64x32_sse2 =
+ vpx_highbd_12_variance64x32_sse2;
+const VarianceMxNFunc highbd_12_variance32x64_sse2 =
+ vpx_highbd_12_variance32x64_sse2;
+const VarianceMxNFunc highbd_12_variance32x32_sse2 =
+ vpx_highbd_12_variance32x32_sse2;
+const VarianceMxNFunc highbd_12_variance32x16_sse2 =
+ vpx_highbd_12_variance32x16_sse2;
+const VarianceMxNFunc highbd_12_variance16x32_sse2 =
+ vpx_highbd_12_variance16x32_sse2;
+const VarianceMxNFunc highbd_12_variance16x16_sse2 =
+ vpx_highbd_12_variance16x16_sse2;
+const VarianceMxNFunc highbd_12_variance16x8_sse2 =
+ vpx_highbd_12_variance16x8_sse2;
+const VarianceMxNFunc highbd_12_variance8x16_sse2 =
+ vpx_highbd_12_variance8x16_sse2;
+const VarianceMxNFunc highbd_12_variance8x8_sse2 =
+ vpx_highbd_12_variance8x8_sse2;
+const VarianceMxNFunc highbd_10_variance64x64_sse2 =
+ vpx_highbd_10_variance64x64_sse2;
+const VarianceMxNFunc highbd_10_variance64x32_sse2 =
+ vpx_highbd_10_variance64x32_sse2;
+const VarianceMxNFunc highbd_10_variance32x64_sse2 =
+ vpx_highbd_10_variance32x64_sse2;
+const VarianceMxNFunc highbd_10_variance32x32_sse2 =
+ vpx_highbd_10_variance32x32_sse2;
+const VarianceMxNFunc highbd_10_variance32x16_sse2 =
+ vpx_highbd_10_variance32x16_sse2;
+const VarianceMxNFunc highbd_10_variance16x32_sse2 =
+ vpx_highbd_10_variance16x32_sse2;
+const VarianceMxNFunc highbd_10_variance16x16_sse2 =
+ vpx_highbd_10_variance16x16_sse2;
+const VarianceMxNFunc highbd_10_variance16x8_sse2 =
+ vpx_highbd_10_variance16x8_sse2;
+const VarianceMxNFunc highbd_10_variance8x16_sse2 =
+ vpx_highbd_10_variance8x16_sse2;
+const VarianceMxNFunc highbd_10_variance8x8_sse2 =
+ vpx_highbd_10_variance8x8_sse2;
+const VarianceMxNFunc highbd_8_variance64x64_sse2 =
+ vpx_highbd_8_variance64x64_sse2;
+const VarianceMxNFunc highbd_8_variance64x32_sse2 =
+ vpx_highbd_8_variance64x32_sse2;
+const VarianceMxNFunc highbd_8_variance32x64_sse2 =
+ vpx_highbd_8_variance32x64_sse2;
+const VarianceMxNFunc highbd_8_variance32x32_sse2 =
+ vpx_highbd_8_variance32x32_sse2;
+const VarianceMxNFunc highbd_8_variance32x16_sse2 =
+ vpx_highbd_8_variance32x16_sse2;
+const VarianceMxNFunc highbd_8_variance16x32_sse2 =
+ vpx_highbd_8_variance16x32_sse2;
+const VarianceMxNFunc highbd_8_variance16x16_sse2 =
+ vpx_highbd_8_variance16x16_sse2;
+const VarianceMxNFunc highbd_8_variance16x8_sse2 =
+ vpx_highbd_8_variance16x8_sse2;
+const VarianceMxNFunc highbd_8_variance8x16_sse2 =
+ vpx_highbd_8_variance8x16_sse2;
+const VarianceMxNFunc highbd_8_variance8x8_sse2 =
+ vpx_highbd_8_variance8x8_sse2;
-namespace vp9 {
+INSTANTIATE_TEST_CASE_P(
+ SSE2, VpxHBDVarianceTest,
+ ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_sse2, 12),
+ make_tuple(6, 5, highbd_12_variance64x32_sse2, 12),
+ make_tuple(5, 6, highbd_12_variance32x64_sse2, 12),
+ make_tuple(5, 5, highbd_12_variance32x32_sse2, 12),
+ make_tuple(5, 4, highbd_12_variance32x16_sse2, 12),
+ make_tuple(4, 5, highbd_12_variance16x32_sse2, 12),
+ make_tuple(4, 4, highbd_12_variance16x16_sse2, 12),
+ make_tuple(4, 3, highbd_12_variance16x8_sse2, 12),
+ make_tuple(3, 4, highbd_12_variance8x16_sse2, 12),
+ make_tuple(3, 3, highbd_12_variance8x8_sse2, 12),
+ make_tuple(6, 6, highbd_10_variance64x64_sse2, 10),
+ make_tuple(6, 5, highbd_10_variance64x32_sse2, 10),
+ make_tuple(5, 6, highbd_10_variance32x64_sse2, 10),
+ make_tuple(5, 5, highbd_10_variance32x32_sse2, 10),
+ make_tuple(5, 4, highbd_10_variance32x16_sse2, 10),
+ make_tuple(4, 5, highbd_10_variance16x32_sse2, 10),
+ make_tuple(4, 4, highbd_10_variance16x16_sse2, 10),
+ make_tuple(4, 3, highbd_10_variance16x8_sse2, 10),
+ make_tuple(3, 4, highbd_10_variance8x16_sse2, 10),
+ make_tuple(3, 3, highbd_10_variance8x8_sse2, 10),
+ make_tuple(6, 6, highbd_8_variance64x64_sse2, 8),
+ make_tuple(6, 5, highbd_8_variance64x32_sse2, 8),
+ make_tuple(5, 6, highbd_8_variance32x64_sse2, 8),
+ make_tuple(5, 5, highbd_8_variance32x32_sse2, 8),
+ make_tuple(5, 4, highbd_8_variance32x16_sse2, 8),
+ make_tuple(4, 5, highbd_8_variance16x32_sse2, 8),
+ make_tuple(4, 4, highbd_8_variance16x16_sse2, 8),
+ make_tuple(4, 3, highbd_8_variance16x8_sse2, 8),
+ make_tuple(3, 4, highbd_8_variance8x16_sse2, 8),
+ make_tuple(3, 3, highbd_8_variance8x8_sse2, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SSE2
#if CONFIG_VP9_ENCODER
-TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
-TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
-
-INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
- ::testing::Values(vp9_get_mb_ss_c));
-
-typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
-TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
-TEST_P(VP9VarianceTest, Ref) { RefTest(); }
-TEST_P(VP9VarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
-TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
#if CONFIG_VP9_HIGHBITDEPTH
-typedef VarianceTest<vp9_variance_fn_t> VP9VarianceHighTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceHighTest;
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t>
VP9SubpelAvgVarianceHighTest;
-TEST_P(VP9VarianceHighTest, Zero) { ZeroTest(); }
-TEST_P(VP9VarianceHighTest, Ref) { RefTest(); }
-TEST_P(VP9VarianceHighTest, RefStride) { RefStrideTest(); }
TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); }
TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); }
-TEST_P(VP9VarianceHighTest, OneQuarter) { OneQuarterTest(); }
#endif // CONFIG_VP9_HIGHBITDEPTH
-const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
-const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
-const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
-const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c;
-const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c;
-const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c;
-const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c;
-const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c;
-const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c;
-const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c;
-const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c;
-const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c;
-const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
- make_tuple(2, 3, variance4x8_c, 0),
- make_tuple(3, 2, variance8x4_c, 0),
- make_tuple(3, 3, variance8x8_c, 0),
- make_tuple(3, 4, variance8x16_c, 0),
- make_tuple(4, 3, variance16x8_c, 0),
- make_tuple(4, 4, variance16x16_c, 0),
- make_tuple(4, 5, variance16x32_c, 0),
- make_tuple(5, 4, variance32x16_c, 0),
- make_tuple(5, 5, variance32x32_c, 0),
- make_tuple(5, 6, variance32x64_c, 0),
- make_tuple(6, 5, variance64x32_c, 0),
- make_tuple(6, 6, variance64x64_c, 0)));
-#if CONFIG_VP9_HIGHBITDEPTH
-const vp9_variance_fn_t highbd_10_variance4x4_c = vp9_highbd_10_variance4x4_c;
-const vp9_variance_fn_t highbd_10_variance4x8_c = vp9_highbd_10_variance4x8_c;
-const vp9_variance_fn_t highbd_10_variance8x4_c = vp9_highbd_10_variance8x4_c;
-const vp9_variance_fn_t highbd_10_variance8x8_c = vp9_highbd_10_variance8x8_c;
-const vp9_variance_fn_t highbd_10_variance8x16_c = vp9_highbd_10_variance8x16_c;
-const vp9_variance_fn_t highbd_10_variance16x8_c = vp9_highbd_10_variance16x8_c;
-const vp9_variance_fn_t highbd_10_variance16x16_c =
- vp9_highbd_10_variance16x16_c;
-const vp9_variance_fn_t highbd_10_variance16x32_c =
- vp9_highbd_10_variance16x32_c;
-const vp9_variance_fn_t highbd_10_variance32x16_c =
- vp9_highbd_10_variance32x16_c;
-const vp9_variance_fn_t highbd_10_variance32x32_c =
- vp9_highbd_10_variance32x32_c;
-const vp9_variance_fn_t highbd_10_variance32x64_c =
- vp9_highbd_10_variance32x64_c;
-const vp9_variance_fn_t highbd_10_variance64x32_c =
- vp9_highbd_10_variance64x32_c;
-const vp9_variance_fn_t highbd_10_variance64x64_c =
- vp9_highbd_10_variance64x64_c;
-const vp9_variance_fn_t highbd_12_variance4x4_c = vp9_highbd_12_variance4x4_c;
-const vp9_variance_fn_t highbd_12_variance4x8_c = vp9_highbd_12_variance4x8_c;
-const vp9_variance_fn_t highbd_12_variance8x4_c = vp9_highbd_12_variance8x4_c;
-const vp9_variance_fn_t highbd_12_variance8x8_c = vp9_highbd_12_variance8x8_c;
-const vp9_variance_fn_t highbd_12_variance8x16_c = vp9_highbd_12_variance8x16_c;
-const vp9_variance_fn_t highbd_12_variance16x8_c = vp9_highbd_12_variance16x8_c;
-const vp9_variance_fn_t highbd_12_variance16x16_c =
- vp9_highbd_12_variance16x16_c;
-const vp9_variance_fn_t highbd_12_variance16x32_c =
- vp9_highbd_12_variance16x32_c;
-const vp9_variance_fn_t highbd_12_variance32x16_c =
- vp9_highbd_12_variance32x16_c;
-const vp9_variance_fn_t highbd_12_variance32x32_c =
- vp9_highbd_12_variance32x32_c;
-const vp9_variance_fn_t highbd_12_variance32x64_c =
- vp9_highbd_12_variance32x64_c;
-const vp9_variance_fn_t highbd_12_variance64x32_c =
- vp9_highbd_12_variance64x32_c;
-const vp9_variance_fn_t highbd_12_variance64x64_c =
- vp9_highbd_12_variance64x64_c;
-const vp9_variance_fn_t highbd_variance4x4_c = vp9_highbd_variance4x4_c;
-const vp9_variance_fn_t highbd_variance4x8_c = vp9_highbd_variance4x8_c;
-const vp9_variance_fn_t highbd_variance8x4_c = vp9_highbd_variance8x4_c;
-const vp9_variance_fn_t highbd_variance8x8_c = vp9_highbd_variance8x8_c;
-const vp9_variance_fn_t highbd_variance8x16_c = vp9_highbd_variance8x16_c;
-const vp9_variance_fn_t highbd_variance16x8_c = vp9_highbd_variance16x8_c;
-const vp9_variance_fn_t highbd_variance16x16_c = vp9_highbd_variance16x16_c;
-const vp9_variance_fn_t highbd_variance16x32_c = vp9_highbd_variance16x32_c;
-const vp9_variance_fn_t highbd_variance32x16_c = vp9_highbd_variance32x16_c;
-const vp9_variance_fn_t highbd_variance32x32_c = vp9_highbd_variance32x32_c;
-const vp9_variance_fn_t highbd_variance32x64_c = vp9_highbd_variance32x64_c;
-const vp9_variance_fn_t highbd_variance64x32_c = vp9_highbd_variance64x32_c;
-const vp9_variance_fn_t highbd_variance64x64_c = vp9_highbd_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9VarianceHighTest,
- ::testing::Values(make_tuple(2, 2, highbd_10_variance4x4_c, 10),
- make_tuple(2, 3, highbd_10_variance4x8_c, 10),
- make_tuple(3, 2, highbd_10_variance8x4_c, 10),
- make_tuple(3, 3, highbd_10_variance8x8_c, 10),
- make_tuple(3, 4, highbd_10_variance8x16_c, 10),
- make_tuple(4, 3, highbd_10_variance16x8_c, 10),
- make_tuple(4, 4, highbd_10_variance16x16_c, 10),
- make_tuple(4, 5, highbd_10_variance16x32_c, 10),
- make_tuple(5, 4, highbd_10_variance32x16_c, 10),
- make_tuple(5, 5, highbd_10_variance32x32_c, 10),
- make_tuple(5, 6, highbd_10_variance32x64_c, 10),
- make_tuple(6, 5, highbd_10_variance64x32_c, 10),
- make_tuple(6, 6, highbd_10_variance64x64_c, 10),
- make_tuple(2, 2, highbd_12_variance4x4_c, 12),
- make_tuple(2, 3, highbd_12_variance4x8_c, 12),
- make_tuple(3, 2, highbd_12_variance8x4_c, 12),
- make_tuple(3, 3, highbd_12_variance8x8_c, 12),
- make_tuple(3, 4, highbd_12_variance8x16_c, 12),
- make_tuple(4, 3, highbd_12_variance16x8_c, 12),
- make_tuple(4, 4, highbd_12_variance16x16_c, 12),
- make_tuple(4, 5, highbd_12_variance16x32_c, 12),
- make_tuple(5, 4, highbd_12_variance32x16_c, 12),
- make_tuple(5, 5, highbd_12_variance32x32_c, 12),
- make_tuple(5, 6, highbd_12_variance32x64_c, 12),
- make_tuple(6, 5, highbd_12_variance64x32_c, 12),
- make_tuple(6, 6, highbd_12_variance64x64_c, 12),
- make_tuple(2, 2, highbd_variance4x4_c, 8),
- make_tuple(2, 3, highbd_variance4x8_c, 8),
- make_tuple(3, 2, highbd_variance8x4_c, 8),
- make_tuple(3, 3, highbd_variance8x8_c, 8),
- make_tuple(3, 4, highbd_variance8x16_c, 8),
- make_tuple(4, 3, highbd_variance16x8_c, 8),
- make_tuple(4, 4, highbd_variance16x16_c, 8),
- make_tuple(4, 5, highbd_variance16x32_c, 8),
- make_tuple(5, 4, highbd_variance32x16_c, 8),
- make_tuple(5, 5, highbd_variance32x32_c, 8),
- make_tuple(5, 6, highbd_variance32x64_c, 8),
- make_tuple(6, 5, highbd_variance64x32_c, 8),
- make_tuple(6, 6, highbd_variance64x64_c, 8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
const vp9_subpixvariance_fn_t subpel_variance4x4_c =
vp9_sub_pixel_variance4x4_c;
const vp9_subpixvariance_fn_t subpel_variance4x8_c =
@@ -1377,40 +1423,11 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8),
make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // CONFIG_VP9_ENCODER
+#if CONFIG_VP9_ENCODER
#if HAVE_SSE2
#if CONFIG_USE_X86INC
-INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
- ::testing::Values(vp9_get_mb_ss_sse2));
-
-const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
-const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
-const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
-const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2;
-const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2;
-const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2;
-const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2;
-const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2;
-const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2;
-const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2;
-const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2;
-const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2;
-const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
- SSE2, VP9VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_sse2, 0),
- make_tuple(2, 3, variance4x8_sse2, 0),
- make_tuple(3, 2, variance8x4_sse2, 0),
- make_tuple(3, 3, variance8x8_sse2, 0),
- make_tuple(3, 4, variance8x16_sse2, 0),
- make_tuple(4, 3, variance16x8_sse2, 0),
- make_tuple(4, 4, variance16x16_sse2, 0),
- make_tuple(4, 5, variance16x32_sse2, 0),
- make_tuple(5, 4, variance32x16_sse2, 0),
- make_tuple(5, 5, variance32x32_sse2, 0),
- make_tuple(5, 6, variance32x64_sse2, 0),
- make_tuple(6, 5, variance64x32_sse2, 0),
- make_tuple(6, 6, variance64x64_sse2, 0)));
const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
vp9_sub_pixel_variance4x4_sse;
const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
@@ -1494,96 +1511,6 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0)));
#if CONFIG_VP9_HIGHBITDEPTH
-const vp9_variance_fn_t highbd_variance8x8_sse2 = vp9_highbd_variance8x8_sse2;
-const vp9_variance_fn_t highbd_10_variance8x8_sse2 =
- vp9_highbd_10_variance8x8_sse2;
-const vp9_variance_fn_t highbd_12_variance8x8_sse2 =
- vp9_highbd_12_variance8x8_sse2;
-const vp9_variance_fn_t highbd_variance8x16_sse2 = vp9_highbd_variance8x16_sse2;
-const vp9_variance_fn_t highbd_10_variance8x16_sse2 =
- vp9_highbd_10_variance8x16_sse2;
-const vp9_variance_fn_t highbd_12_variance8x16_sse2 =
- vp9_highbd_12_variance8x16_sse2;
-const vp9_variance_fn_t highbd_variance16x8_sse2 =
- vp9_highbd_variance16x8_sse2;
-const vp9_variance_fn_t highbd_10_variance16x8_sse2 =
- vp9_highbd_10_variance16x8_sse2;
-const vp9_variance_fn_t highbd_12_variance16x8_sse2 =
- vp9_highbd_12_variance16x8_sse2;
-const vp9_variance_fn_t highbd_variance16x16_sse2 =
- vp9_highbd_variance16x16_sse2;
-const vp9_variance_fn_t highbd_10_variance16x16_sse2 =
- vp9_highbd_10_variance16x16_sse2;
-const vp9_variance_fn_t highbd_12_variance16x16_sse2 =
- vp9_highbd_12_variance16x16_sse2;
-const vp9_variance_fn_t highbd_variance16x32_sse2 =
- vp9_highbd_variance16x32_sse2;
-const vp9_variance_fn_t highbd_10_variance16x32_sse2 =
- vp9_highbd_10_variance16x32_sse2;
-const vp9_variance_fn_t highbd_12_variance16x32_sse2 =
- vp9_highbd_12_variance16x32_sse2;
-const vp9_variance_fn_t highbd_variance32x16_sse2 =
- vp9_highbd_variance32x16_sse2;
-const vp9_variance_fn_t highbd_10_variance32x16_sse2 =
- vp9_highbd_10_variance32x16_sse2;
-const vp9_variance_fn_t highbd_12_variance32x16_sse2 =
- vp9_highbd_12_variance32x16_sse2;
-const vp9_variance_fn_t highbd_variance32x32_sse2 =
- vp9_highbd_variance32x32_sse2;
-const vp9_variance_fn_t highbd_10_variance32x32_sse2 =
- vp9_highbd_10_variance32x32_sse2;
-const vp9_variance_fn_t highbd_12_variance32x32_sse2 =
- vp9_highbd_12_variance32x32_sse2;
-const vp9_variance_fn_t highbd_variance32x64_sse2 =
- vp9_highbd_variance32x64_sse2;
-const vp9_variance_fn_t highbd_10_variance32x64_sse2 =
- vp9_highbd_10_variance32x64_sse2;
-const vp9_variance_fn_t highbd_12_variance32x64_sse2 =
- vp9_highbd_12_variance32x64_sse2;
-const vp9_variance_fn_t highbd_variance64x32_sse2 =
- vp9_highbd_variance64x32_sse2;
-const vp9_variance_fn_t highbd_10_variance64x32_sse2 =
- vp9_highbd_10_variance64x32_sse2;
-const vp9_variance_fn_t highbd_12_variance64x32_sse2 =
- vp9_highbd_12_variance64x32_sse2;
-const vp9_variance_fn_t highbd_variance64x64_sse2 =
- vp9_highbd_variance64x64_sse2;
-const vp9_variance_fn_t highbd_10_variance64x64_sse2 =
- vp9_highbd_10_variance64x64_sse2;
-const vp9_variance_fn_t highbd_12_variance64x64_sse2 =
- vp9_highbd_12_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
- SSE2, VP9VarianceHighTest,
- ::testing::Values(make_tuple(3, 3, highbd_10_variance8x8_sse2, 10),
- make_tuple(3, 4, highbd_10_variance8x16_sse2, 10),
- make_tuple(4, 3, highbd_10_variance16x8_sse2, 10),
- make_tuple(4, 4, highbd_10_variance16x16_sse2, 10),
- make_tuple(4, 5, highbd_10_variance16x32_sse2, 10),
- make_tuple(5, 4, highbd_10_variance32x16_sse2, 10),
- make_tuple(5, 5, highbd_10_variance32x32_sse2, 10),
- make_tuple(5, 6, highbd_10_variance32x64_sse2, 10),
- make_tuple(6, 5, highbd_10_variance64x32_sse2, 10),
- make_tuple(6, 6, highbd_10_variance64x64_sse2, 10),
- make_tuple(3, 3, highbd_12_variance8x8_sse2, 12),
- make_tuple(3, 4, highbd_12_variance8x16_sse2, 12),
- make_tuple(4, 3, highbd_12_variance16x8_sse2, 12),
- make_tuple(4, 4, highbd_12_variance16x16_sse2, 12),
- make_tuple(4, 5, highbd_12_variance16x32_sse2, 12),
- make_tuple(5, 4, highbd_12_variance32x16_sse2, 12),
- make_tuple(5, 5, highbd_12_variance32x32_sse2, 12),
- make_tuple(5, 6, highbd_12_variance32x64_sse2, 12),
- make_tuple(6, 5, highbd_12_variance64x32_sse2, 12),
- make_tuple(6, 6, highbd_12_variance64x64_sse2, 12),
- make_tuple(3, 3, highbd_variance8x8_sse2, 8),
- make_tuple(3, 4, highbd_variance8x16_sse2, 8),
- make_tuple(4, 3, highbd_variance16x8_sse2, 8),
- make_tuple(4, 4, highbd_variance16x16_sse2, 8),
- make_tuple(4, 5, highbd_variance16x32_sse2, 8),
- make_tuple(5, 4, highbd_variance32x16_sse2, 8),
- make_tuple(5, 5, highbd_variance32x32_sse2, 8),
- make_tuple(5, 6, highbd_variance32x64_sse2, 8),
- make_tuple(6, 5, highbd_variance64x32_sse2, 8),
- make_tuple(6, 6, highbd_variance64x64_sse2, 8)));
const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_sse2 =
vp9_highbd_sub_pixel_variance8x4_sse2;
const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_sse2 =
@@ -1790,6 +1717,9 @@ INSTANTIATE_TEST_CASE_P(
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_USE_X86INC
#endif // HAVE_SSE2
+#endif // CONFIG_VP9_ENCODER
+
+#if CONFIG_VP9_ENCODER
#if HAVE_SSSE3
#if CONFIG_USE_X86INC
@@ -1877,22 +1807,27 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0)));
#endif // CONFIG_USE_X86INC
#endif // HAVE_SSSE3
+#endif // CONFIG_VP9_ENCODER
#if HAVE_AVX2
-
-const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2;
-const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2;
-const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2;
-const vp9_variance_fn_t variance64x32_avx2 = vp9_variance64x32_avx2;
-const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2;
+const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
+INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest,
+ ::testing::Values(make_tuple(4, 4, mse16x16_avx2)));
+
+const VarianceMxNFunc variance64x64_avx2 = vpx_variance64x64_avx2;
+const VarianceMxNFunc variance64x32_avx2 = vpx_variance64x32_avx2;
+const VarianceMxNFunc variance32x32_avx2 = vpx_variance32x32_avx2;
+const VarianceMxNFunc variance32x16_avx2 = vpx_variance32x16_avx2;
+const VarianceMxNFunc variance16x16_avx2 = vpx_variance16x16_avx2;
INSTANTIATE_TEST_CASE_P(
- AVX2, VP9VarianceTest,
- ::testing::Values(make_tuple(4, 4, variance16x16_avx2, 0),
- make_tuple(5, 4, variance32x16_avx2, 0),
- make_tuple(5, 5, variance32x32_avx2, 0),
+ AVX2, VpxVarianceTest,
+ ::testing::Values(make_tuple(6, 6, variance64x64_avx2, 0),
make_tuple(6, 5, variance64x32_avx2, 0),
- make_tuple(6, 6, variance64x64_avx2, 0)));
+ make_tuple(5, 5, variance32x32_avx2, 0),
+ make_tuple(5, 4, variance32x16_avx2, 0),
+ make_tuple(4, 4, variance16x16_avx2, 0)));
+#if CONFIG_VP9_ENCODER
const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 =
vp9_sub_pixel_variance32x32_avx2;
const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 =
@@ -1910,23 +1845,38 @@ INSTANTIATE_TEST_CASE_P(
AVX2, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0),
make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0)));
+#endif // CONFIG_VP9_ENCODER
#endif // HAVE_AVX2
+
#if HAVE_NEON
-const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon;
-const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
-const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon;
-const vp9_variance_fn_t variance32x64_neon = vp9_variance32x64_neon;
-const vp9_variance_fn_t variance64x32_neon = vp9_variance64x32_neon;
-const vp9_variance_fn_t variance64x64_neon = vp9_variance64x64_neon;
+const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
+INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest,
+ ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon)));
+
+const VarianceMxNFunc mse16x16_neon = vpx_mse16x16_neon;
+INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest,
+ ::testing::Values(make_tuple(4, 4, mse16x16_neon)));
+
+const VarianceMxNFunc variance64x64_neon = vpx_variance64x64_neon;
+const VarianceMxNFunc variance64x32_neon = vpx_variance64x32_neon;
+const VarianceMxNFunc variance32x64_neon = vpx_variance32x64_neon;
+const VarianceMxNFunc variance32x32_neon = vpx_variance32x32_neon;
+const VarianceMxNFunc variance16x16_neon = vpx_variance16x16_neon;
+const VarianceMxNFunc variance16x8_neon = vpx_variance16x8_neon;
+const VarianceMxNFunc variance8x16_neon = vpx_variance8x16_neon;
+const VarianceMxNFunc variance8x8_neon = vpx_variance8x8_neon;
INSTANTIATE_TEST_CASE_P(
- NEON, VP9VarianceTest,
- ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
- make_tuple(4, 4, variance16x16_neon, 0),
- make_tuple(5, 5, variance32x32_neon, 0),
- make_tuple(5, 6, variance32x64_neon, 0),
+ NEON, VpxVarianceTest,
+ ::testing::Values(make_tuple(6, 6, variance64x64_neon, 0),
make_tuple(6, 5, variance64x32_neon, 0),
- make_tuple(6, 6, variance64x64_neon, 0)));
+ make_tuple(5, 6, variance32x64_neon, 0),
+ make_tuple(5, 5, variance32x32_neon, 0),
+ make_tuple(4, 4, variance16x16_neon, 0),
+ make_tuple(4, 3, variance16x8_neon, 0),
+ make_tuple(3, 4, variance8x16_neon, 0),
+ make_tuple(3, 3, variance8x8_neon, 0)));
+#if CONFIG_VP9_ENCODER
const vp9_subpixvariance_fn_t subpel_variance8x8_neon =
vp9_sub_pixel_variance8x8_neon;
const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
@@ -1941,8 +1891,19 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(4, 4, subpel_variance16x16_neon, 0),
make_tuple(5, 5, subpel_variance32x32_neon, 0),
make_tuple(6, 6, subpel_variance64x64_neon, 0)));
-#endif // HAVE_NEON
#endif // CONFIG_VP9_ENCODER
+#endif // HAVE_NEON
-} // namespace vp9
+#if HAVE_MEDIA
+const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
+INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
+ ::testing::Values(make_tuple(4, 4, mse16x16_media)));
+
+const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
+const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
+INSTANTIATE_TEST_CASE_P(
+ MEDIA, VpxVarianceTest,
+ ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
+ make_tuple(3, 3, variance8x8_media, 0)));
+#endif // HAVE_MEDIA
} // namespace
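Throughout the test instantiations above, each make_tuple packs the block geometry as log2 values followed by the function under test and a bit depth: (2, 2, fn, 0) is a 4x4 block on the 8-bit path, (6, 6, fn, 0) is 64x64, and trailing values of 8/10/12 select the bit depth for the high-bit-depth variants. A minimal sketch of how a fixture could decode that tuple layout, with illustrative names rather than the actual test members:

/* Illustrative decoding of the (log2_width, log2_height, fn, bit_depth)
 * tuples used by the INSTANTIATE_TEST_CASE_P calls above; these are not
 * the actual fixture members. */
typedef unsigned int (*variance_fn)(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse);

struct variance_params {
  int log2_width;   /* e.g. 6 -> width  = 1 << 6 = 64 */
  int log2_height;  /* e.g. 5 -> height = 1 << 5 = 32 */
  variance_fn fn;   /* function under test, e.g. vpx_variance64x32_c */
  int bit_depth;    /* 0 or 8 for 8-bit paths, 10/12 for high bit depth */
};

static int block_width(const struct variance_params *p) {
  return 1 << p->log2_width;
}

static int block_height(const struct variance_params *p) {
  return 1 << p->log2_height;
}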
diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
index d7ba1b024..ac19c2e3d 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -21,6 +21,7 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 2d910466d..943c00b87 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -21,6 +21,8 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
diff --git a/tools_common.h b/tools_common.h
index a87e814c1..aa7f02599 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -16,6 +16,7 @@
#include "vpx/vpx_codec.h"
#include "vpx/vpx_image.h"
#include "vpx/vpx_integer.h"
+#include "vpx_ports/msvc.h"
#if CONFIG_ENCODERS
#include "./y4minput.h"
@@ -34,7 +35,6 @@
#if CONFIG_OS_SUPPORT
#if defined(_MSC_VER)
#include <io.h> /* NOLINT */
-#define snprintf _snprintf
#define isatty _isatty
#define fileno _fileno
#else
diff --git a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
deleted file mode 100644
index 39919579f..000000000
--- a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
+++ /dev/null
@@ -1,154 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance16x16_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance16x16_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
-
-loop
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- mov lr, #0 ; constant zero
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r5, [r2, #4] ; load 4 ref pixels
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r5, [r2, #8] ; load 4 ref pixels
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r5, [r2, #12] ; load 4 ref pixels
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
- END
-
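The routine deleted above accumulates the signed pixel-difference sum in r8 and the sum of squared differences in r11, then returns sse - ((sum * sum) >> 8); the shift by 8 is the division by the 256 pixels of a 16x16 block. A rough scalar C equivalent of that arithmetic, for reference only (the optimized replacement now lives under vpx_dsp):

/* Scalar sketch of the arithmetic performed by the deleted
 * vp8_variance16x16_armv6 routine; illustrative only. */
static unsigned int variance16x16_scalar(const unsigned char *src,
                                         int src_stride,
                                         const unsigned char *ref,
                                         int ref_stride,
                                         unsigned int *sse) {
  int sum = 0;
  unsigned int sq = 0;
  int i, j;
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) {
      const int diff = src[j] - ref[j];
      sum += diff;
      sq += diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sq;
  /* 16 * 16 = 256 pixels, so sum * sum / 256 is a shift by 8. */
  return sq - (((unsigned int)sum * sum) >> 8);
}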
diff --git a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
deleted file mode 100644
index 915ee4993..000000000
--- a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
+++ /dev/null
@@ -1,101 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance8x8_armv6|
-
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance8x8_armv6| PROC
-
- push {r4-r10, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r12, #8 ; set loop counter to 8 (=block height)
- mov r4, #0 ; initialize sum = 0
- mov r5, #0 ; initialize sse = 0
-
-loop
- ; 1st 4 pixels
- ldr r6, [r0, #0x0] ; load 4 src pixels
- ldr r7, [r2, #0x0] ; load 4 ref pixels
-
- mov lr, #0 ; constant zero
-
- usub8 r8, r6, r7 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r10, r8, lr ; select bytes with positive difference
- usub8 r9, r7, r6 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r6, r10, lr ; calculate sum of positive differences
- usad8 r7, r8, lr ; calculate sum of negative differences
- orr r8, r8, r10 ; differences of all 4 pixels
- ; calculate total sum
- add r4, r4, r6 ; add positive differences to sum
- sub r4, r4, r7 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r7, r8 ; byte (two pixels) to halfwords
- uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
- smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r6, [r0, #0x4] ; load 4 src pixels
- ldr r7, [r2, #0x4] ; load 4 ref pixels
- smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r6, r7 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r10, r8, lr ; select bytes with positive difference
- usub8 r9, r7, r6 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r6, r10, lr ; calculate sum of positive differences
- usad8 r7, r8, lr ; calculate sum of negative differences
- orr r8, r8, r10 ; differences of all 4 pixels
-
- ; calculate total sum
- add r4, r4, r6 ; add positive differences to sum
- sub r4, r4, r7 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r7, r8 ; byte (two pixels) to halfwords
- uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
- smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
- subs r12, r12, #1 ; next row
- smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r8, [sp, #32] ; get address of sse
- mul r1, r4, r4 ; sum * sum
- str r5, [r8] ; store sse
- sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
-
- pop {r4-r10, pc}
-
- ENDP
-
- END
diff --git a/vp8/common/arm/neon/variance_neon.c b/vp8/common/arm/neon/variance_neon.c
deleted file mode 100644
index 1b1979073..000000000
--- a/vp8/common/arm/neon/variance_neon.c
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "vpx_ports/mem.h"
-
-unsigned int vp8_variance16x16_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) {
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance16x8_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 4; i++) { // variance16x8_neon_loop
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x16_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d2u8, d4u8, d6u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint16x8_t q11u16, q12u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) { // variance8x16_neon_loop
- d0u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d2u8, d6u8);
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x8_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 2; i++) { // variance8x8_neon_loop
- d0u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d1u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d3u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
-
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d5u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d7u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d1u8, d5u8);
- q13u16 = vsubl_u8(d2u8, d6u8);
- q14u16 = vsubl_u8(d3u8, d7u8);
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c
index 467a50942..0f293f03d 100644
--- a/vp8/common/arm/variance_arm.c
+++ b/vp8/common/arm/variance_arm.c
@@ -9,10 +9,14 @@
*/
#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp8/common/variance.h"
#include "vp8/common/filter.h"
+// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
+#if CONFIG_VP8_ENCODER
+
#if HAVE_MEDIA
#include "vp8/common/arm/bilinearfilter_arm.h"
@@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
8, 8, 8, VFilter);
- return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
- dst_pixels_per_line, sse);
+ return vpx_variance8x8_media(second_pass, 8, dst_ptr,
+ dst_pixels_per_line, sse);
}
unsigned int vp8_sub_pixel_variance16x16_armv6
@@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
16, 16, 16, VFilter);
- var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
- dst_pixels_per_line, sse);
+ var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
+ dst_pixels_per_line, sse);
}
return var;
}
-#endif /* HAVE_MEDIA */
+#endif // HAVE_MEDIA
#if HAVE_NEON
@@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon
return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}
-#endif
+#endif // HAVE_NEON
+#endif // CONFIG_VP8_ENCODER
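The sub-pixel variance wrappers kept in this file run a two-pass bilinear interpolation (horizontal into an intermediate buffer, then vertical) before handing the filtered block to the whole-pixel variance, which is what lets the plain ARMv6 and NEON variance kernels move to vpx_dsp unchanged. A per-pixel sketch of that separable bilinear sample, assuming VP8's 7-bit filter taps that sum to 128; the names are illustrative, not the library's:

/* Per-pixel view of the separable bilinear interpolation done by the
 * two-pass vp8_filter_block2d_bil_* helpers above.  hf/vf hold the
 * horizontal and vertical tap pairs for the requested xoffset/yoffset.
 * Illustrative only. */
static unsigned char bilinear_sample(const unsigned char *src, int stride,
                                     const int hf[2], const int vf[2]) {
  const int top = (src[0] * hf[0] + src[1] * hf[1] + 64) >> 7;
  const int bot = (src[stride] * hf[0] + src[stride + 1] * hf[1] + 64) >> 7;
  return (unsigned char)((top * vf[0] + bot * vf[1] + 64) >> 7);
}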
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
index d12dea193..5c0680f42 100644
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block
if (blksize == 16)
{
- actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
- act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+ actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+ act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
- vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
+ vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 128)>>8;
- vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
+ vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 32)>>6;
- vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
+ vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 32)>>6;
#else
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block
}
else /* if (blksize == 8) */
{
- actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
- act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+ actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+ act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
- vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
+ vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 32)>>6;
- vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
+ vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 8)>>4;
- vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
+ vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 8)>>4;
#else
sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
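The activity and distortion terms above are per-pixel averages computed with rounding shifts: (x + 128) >> 8 divides by the 256 pixels of a 16x16 block, (x + 32) >> 6 by the 64 pixels of an 8x8 block, and (x + 8) >> 4 by the 16 pixels of a 4x4 block. A small hypothetical helper capturing the pattern:

/* Rounded division by a power-of-two pixel count, matching the
 * (x + 128) >> 8 style expressions above (illustrative helper). */
static unsigned int rounded_avg(unsigned int total, int log2_count) {
  return (total + (1u << (log2_count - 1))) >> log2_count;
}
/* rounded_avg(sse, 8) == (sse + 128) >> 8 for a 16x16 block. */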
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index c9f14d58a..4b820338e 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -237,31 +237,6 @@ specialize qw/vp8_bilinear_predict4x4 mmx media neon/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
#
-# Whole-pixel Variance
-#
-add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance4x4 mmx sse2/;
-$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x8 mmx sse2 media neon/;
-$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
-$vp8_variance8x8_media=vp8_variance8x8_armv6;
-
-add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x16 mmx sse2 neon/;
-$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x8 mmx sse2 neon/;
-$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x16 mmx sse2 media neon/;
-$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
-$vp8_variance16x16_media=vp8_variance16x16_armv6;
-
-#
# Sub-pixel Variance
#
add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
@@ -309,26 +284,12 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
#
-# Sum of squares (vector)
-#
-add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
-specialize qw/vp8_get_mb_ss mmx sse2/;
-
-#
# SSE (Sum Squared Error)
#
add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
-add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_mse16x16 mmx sse2 media neon/;
-$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
-$vp8_mse16x16_media=vp8_mse16x16_armv6;
-
-add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
-specialize qw/vp8_get4x4sse_cs mmx neon/;
-
#
# Block copy
#
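The add_proto/specialize pairs removed here declared VP8's whole-pixel variance, MSE, sum-of-squares and 4x4 SSE entry points for runtime CPU dispatch; the equivalent prototypes are now generated from vpx_dsp_rtcd_defs.pl instead. Conceptually the generated dispatch resolves each entry point to the best specialized variant at init time; a rough sketch under that assumption, with hand-written stand-ins for what the *_rtcd.h headers actually generate:

/* Rough sketch of runtime-CPU-detection dispatch; the real code is
 * generated from the *_rtcd_defs.pl scripts, not written by hand. */
typedef unsigned int (*variance_fn)(const unsigned char *, int,
                                    const unsigned char *, int,
                                    unsigned int *);

unsigned int vpx_variance16x16_c(const unsigned char *, int,
                                 const unsigned char *, int, unsigned int *);
unsigned int vpx_variance16x16_sse2(const unsigned char *, int,
                                    const unsigned char *, int,
                                    unsigned int *);

static variance_fn vpx_variance16x16_ptr = vpx_variance16x16_c;

static void setup_rtcd(int have_sse2) {
  if (have_sse2) vpx_variance16x16_ptr = vpx_variance16x16_sse2;
}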
diff --git a/vp8/common/variance.h b/vp8/common/variance.h
index b62cc6136..c6c9f41bf 100644
--- a/vp8/common/variance.h
+++ b/vp8/common/variance.h
@@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)(
const unsigned char *ref_array,
int ref_stride,
unsigned int *sad_array);
+
typedef void (*vpx_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
@@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t)
unsigned int *sad_array
);
-typedef unsigned int (*vp8_variance_fn_t)
+typedef unsigned int (*vpx_variance_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
@@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t)
unsigned int *sse
);
-typedef void (*vp8_ssimpf_fn_t)
- (
- unsigned char *s,
- int sp,
- unsigned char *r,
- int rp,
- unsigned long *sum_s,
- unsigned long *sum_r,
- unsigned long *sum_sq_s,
- unsigned long *sum_sq_r,
- unsigned long *sum_sxr
- );
-
-typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp8_get16x16prederror_fn_t)
- (
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride
- );
-
typedef struct variance_vtable
{
vpx_sad_fn_t sdf;
- vp8_variance_fn_t vf;
+ vpx_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
- vp8_variance_fn_t svf_halfpix_h;
- vp8_variance_fn_t svf_halfpix_v;
- vp8_variance_fn_t svf_halfpix_hv;
+ vpx_variance_fn_t svf_halfpix_h;
+ vpx_variance_fn_t svf_halfpix_v;
+ vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
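The vtable above lets the encoder index its motion-search helpers by block size; this hunk only renames the whole-pixel variance pointer type from vp8_variance_fn_t to the shared vpx_variance_fn_t. A minimal sketch of filling and calling such a table, assuming vp8/common/variance.h and the generated RTCD headers are available; the assignments are illustrative, not the encoder's actual setup:

/* Illustrative use of the variance_vtable above; the real encoder fills
 * its per-block-size tables during initialization. */
static void example_fill(struct variance_vtable *v) {
  v->vf = vpx_variance16x16;            /* whole-pixel, now vpx_ prefixed */
  v->svf = vp8_sub_pixel_variance16x16; /* still a vp8_ entry point */
}

static unsigned int example_use(const struct variance_vtable *v,
                                const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride) {
  unsigned int sse;
  return v->vf(src, src_stride, ref, ref_stride, &sse);
}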
diff --git a/vp8/common/variance_c.c b/vp8/common/variance_c.c
index dc95bfeb3..79d1ca00c 100644
--- a/vp8/common/variance_c.c
+++ b/vp8/common/variance_c.c
@@ -8,44 +8,34 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include "./vp8_rtcd.h"
#include "filter.h"
#include "variance.h"
-
-unsigned int vp8_get_mb_ss_c
-(
- const short *src_ptr
-)
-{
- unsigned int i = 0, sum = 0;
-
- do
- {
- sum += (src_ptr[i] * src_ptr[i]);
- i++;
- }
- while (i < 256);
-
- return sum;
+/* This is a bad idea.
+ * ctz = count trailing zeros */
+static int ctz(int a) {
+ int b = 0;
+ while (a != 1) {
+ a >>= 1;
+ b++;
+ }
+ return b;
}
-
-static void variance(
+static unsigned int variance(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
int w,
int h,
- unsigned int *sse,
- int *sum)
+ unsigned int *sse)
{
int i, j;
- int diff;
+ int diff, sum;
- *sum = 0;
+ sum = 0;
*sse = 0;
for (i = 0; i < h; i++)
@@ -53,114 +43,17 @@ static void variance(
for (j = 0; j < w; j++)
{
diff = src_ptr[j] - ref_ptr[j];
- *sum += diff;
+ sum += diff;
*sse += diff * diff;
}
src_ptr += source_stride;
ref_ptr += recon_stride;
}
-}
-
-unsigned int vp8_variance16x16_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp8_variance8x16_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 7));
+ return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
}
-unsigned int vp8_variance16x8_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-
-unsigned int vp8_variance8x8_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vp8_variance4x4_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-
-unsigned int vp8_mse16x16_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
- *sse = var;
- return var;
-}
-
-
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_first_pass
@@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c
/* Now filter Verticaly */
var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
- return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+ return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
}
@@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
- return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+ return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
}
unsigned int vp8_sub_pixel_variance16x16_c
@@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
- return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+ return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
}
@@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
- return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+ return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
}
unsigned int vp8_sub_pixel_variance8x16_c
@@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
- return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+ return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
}
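With the per-size wrappers gone, the sub-pixel functions above call the static variance() helper directly; its ctz(w) + ctz(h) shift equals log2(w * h) because the block dimensions are powers of two, so it reproduces the old per-size shifts exactly. A small self-contained check of that identity (valid only for powers of two, as the source comment warns):

#include <assert.h>

/* ctz as defined in the patch above: only valid for powers of two. */
static int ctz(int a) {
  int b = 0;
  while (a != 1) {
    a >>= 1;
    b++;
  }
  return b;
}

int main(void) {
  /* shift = log2(w * h) for the block sizes used by VP8 */
  assert(ctz(4) + ctz(4) == 4);    /* 4x4:   sum * sum >> 4 */
  assert(ctz(8) + ctz(8) == 6);    /* 8x8:   sum * sum >> 6 */
  assert(ctz(16) + ctz(8) == 7);   /* 16x8:  sum * sum >> 7 */
  assert(ctz(16) + ctz(16) == 8);  /* 16x16: sum * sum >> 8 */
  return 0;
}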
diff --git a/vp8/common/x86/variance_impl_mmx.asm b/vp8/common/x86/variance_impl_mmx.asm
index 7d5e6810b..97f25275d 100644
--- a/vp8/common/x86/variance_impl_mmx.asm
+++ b/vp8/common/x86/variance_impl_mmx.asm
@@ -11,504 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
-;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
-global sym(vp8_get_mb_ss_mmx) PRIVATE
-sym(vp8_get_mb_ss_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 8
- ; end prolog
-
- mov rax, arg(0) ;src_ptr
- mov rcx, 16
- pxor mm4, mm4
-
-.NEXTROW:
- movq mm0, [rax]
- movq mm1, [rax+8]
- movq mm2, [rax+16]
- movq mm3, [rax+24]
- pmaddwd mm0, mm0
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
- pmaddwd mm3, mm3
-
- paddd mm4, mm0
- paddd mm4, mm1
- paddd mm4, mm2
- paddd mm4, mm3
-
- add rax, 32
- dec rcx
- ja .NEXTROW
- movq QWORD PTR [rsp], mm4
-
- ;return sum[0]+sum[1];
- movsxd rax, dword ptr [rsp]
- movsxd rcx, dword ptr [rsp+4]
- add rax, rcx
-
-
- ; begin epilog
- add rsp, 8
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp8_get8x8var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vp8_get8x8var_mmx) PRIVATE
-sym(vp8_get8x8var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
-
- pxor mm5, mm5 ; Blank mmx6
- pxor mm6, mm6 ; Blank mmx7
- pxor mm7, mm7 ; Blank mmx7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm1, [rbx] ; Copy eight bytes to mm1
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
-
- ; Row 2
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 3
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 4
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 5
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- ; movq mm4, [rbx + rdx]
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 6
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 7
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 8
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;unsigned int
-;vp8_get4x4var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vp8_get4x4var_mmx) PRIVATE
-sym(vp8_get4x4var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
-
- pxor mm5, mm5 ; Blank mm5
- pxor mm6, mm6 ; Blank mm6
- pxor mm7, mm7 ; Blank mm7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movd mm0, [rax] ; Copy four bytes to mm0
- movd mm1, [rbx] ; Copy four bytes to mm1
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
-
- ; Row 2
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 3
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 4
- movd mm0, [rax] ; Copy four bytes to mm0
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
-
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- paddd mm7, mm0 ; accumulate in mm7
-
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;unsigned int
-;vp8_get4x4sse_cs_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride
-;)
-global sym(vp8_get4x4sse_cs_mmx) PRIVATE
-sym(vp8_get4x4sse_cs_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- push rbx
- ; end prolog
-
-
- pxor mm6, mm6 ; Blank mm6
- pxor mm7, mm7 ; Blank mm7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
- ; Row 1
- movd mm0, [rax] ; Copy four bytes to mm0
- movd mm1, [rbx] ; Copy four bytes to mm1
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 2
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 3
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm1, mm6
- punpcklbw mm0, mm6 ; unpack to higher precision
- psubsw mm0, mm1 ; A-B (low order) to MM0
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 4
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- paddd mm7, mm0 ; accumulate in mm7
-
- movq mm0, mm7 ;
- psrlq mm7, 32
-
- paddd mm0, mm7
- movq rax, mm0
-
-
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
%define mmx_filter_shift 7
;void vp8_filter_block2d_bil4x4_var_mmx
diff --git a/vp8/common/x86/variance_impl_sse2.asm b/vp8/common/x86/variance_impl_sse2.asm
index 761433c11..26de5e860 100644
--- a/vp8/common/x86/variance_impl_sse2.asm
+++ b/vp8/common/x86/variance_impl_sse2.asm
@@ -13,393 +13,6 @@
%define xmm_filter_shift 7
-;unsigned int vp8_get_mb_ss_sse2
-;(
-; short *src_ptr
-;)
-global sym(vp8_get_mb_ss_sse2) PRIVATE
-sym(vp8_get_mb_ss_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 1
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
-
- mov rax, arg(0) ;[src_ptr]
- mov rcx, 8
- pxor xmm4, xmm4
-
-.NEXTROW:
- movdqa xmm0, [rax]
- movdqa xmm1, [rax+16]
- movdqa xmm2, [rax+32]
- movdqa xmm3, [rax+48]
- pmaddwd xmm0, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
-
- paddd xmm0, xmm1
- paddd xmm2, xmm3
- paddd xmm4, xmm0
- paddd xmm4, xmm2
-
- add rax, 0x40
- dec rcx
- ja .NEXTROW
-
- movdqa xmm3,xmm4
- psrldq xmm4,8
- paddd xmm4,xmm3
- movdqa xmm3,xmm4
- psrldq xmm4,4
- paddd xmm4,xmm3
- movq rax,xmm4
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
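vp8_get_mb_ss_sse2, deleted above, computes the sum of squares of a macroblock's 256 int16 residuals; the SSE2 loop consumes 64 bytes (32 values) per iteration over 8 iterations. A hedged scalar sketch of the same contract:

    #include <stdint.h>

    /* Scalar reference for the deleted vp8_get_mb_ss_sse2: sum of squares
     * of the 256 int16 residuals of one 16x16 macroblock.  Sketch only. */
    static unsigned int get_mb_ss_ref(const int16_t *src) {
      unsigned int ss = 0;
      int i;
      for (i = 0; i < 256; ++i)
        ss += (unsigned int)(src[i] * src[i]);
      return ss;
    }
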
-;unsigned int vp8_get16x16var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp8_get16x16var_sse2) PRIVATE
-sym(vp8_get16x16var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- ; Prefetch data
- lea rcx, [rax+rax*2]
- prefetcht0 [rsi]
- prefetcht0 [rsi+rax]
- prefetcht0 [rsi+rax*2]
- prefetcht0 [rsi+rcx]
- lea rbx, [rsi+rax*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rax]
- prefetcht0 [rbx+rax*2]
- prefetcht0 [rbx+rcx]
-
- lea rcx, [rdx+rdx*2]
- prefetcht0 [rdi]
- prefetcht0 [rdi+rdx]
- prefetcht0 [rdi+rdx*2]
- prefetcht0 [rdi+rcx]
- lea rbx, [rdi+rdx*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rdx]
- prefetcht0 [rbx+rdx*2]
- prefetcht0 [rbx+rcx]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
- mov rcx, 16
-
-.var16loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rdi]
-
- prefetcht0 [rsi+rax*8]
- prefetcht0 [rdi+rdx*8]
-
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
-
-
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
-
- punpcklbw xmm2, xmm0
- punpckhbw xmm4, xmm0
-
-
- psubw xmm1, xmm2
- psubw xmm3, xmm4
-
- paddw xmm7, xmm1
- pmaddwd xmm1, xmm1
-
- paddw xmm7, xmm3
- pmaddwd xmm3, xmm3
-
- paddd xmm6, xmm1
- paddd xmm6, xmm3
-
- add rsi, rax
- add rdi, rdx
-
- sub rcx, 1
- jnz .var16loop
-
-
- movdqa xmm1, xmm6
- pxor xmm6, xmm6
-
- pxor xmm5, xmm5
- punpcklwd xmm6, xmm7
-
- punpckhwd xmm5, xmm7
- psrad xmm5, 16
-
- psrad xmm6, 16
- paddd xmm6, xmm5
-
- movdqa xmm2, xmm1
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddd xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddd xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movd DWORD PTR [rax], xmm7
- movd DWORD PTR [rdi], xmm1
-
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
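The get16x16var kernel removed above (and the get8x8var one below) share one contract: widen source and reference bytes to 16 bits, then accumulate the signed difference sum and the sum of squared differences over the block. A hedged scalar reference with a generic width and height, so it stands in for both sizes:

    #include <stdint.h>

    /* Scalar sketch of the get*var contract: per-block difference sum and
     * sum of squared differences.  Reference only; the vpx_ entry points
     * used later in this patch are assumed to provide the real versions. */
    static void get_var_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            int w, int h, unsigned int *sse, int *sum) {
      int r, c;
      *sse = 0;
      *sum = 0;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int diff = src[c] - ref[c];
          *sum += diff;
          *sse += (unsigned int)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
      }
    }
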
-;unsigned int vp8_get8x8var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp8_get8x8var_sse2) PRIVATE
-sym(vp8_get8x8var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- movq xmm1, QWORD PTR [rsi]
- movq xmm2, QWORD PTR [rdi]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- psubsw xmm1, xmm2
- paddw xmm7, xmm1
-
- pmaddwd xmm1, xmm1
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movq xmm2, QWORD PTR[rsi + rax * 2]
- movq xmm3, QWORD PTR[rdi + rdx * 2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movdqa xmm6, xmm7
- punpcklwd xmm6, xmm0
-
- punpckhwd xmm7, xmm0
- movdqa xmm2, xmm1
-
- paddw xmm6, xmm7
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddw xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddw xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movq rdx, xmm7
- movsx rcx, dx
-
- mov dword ptr [rax], ecx
- movd DWORD PTR [rdi], xmm1
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
;void vp8_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,
diff --git a/vp8/common/x86/variance_mmx.c b/vp8/common/x86/variance_mmx.c
index 10a58b822..25ae5767f 100644
--- a/vp8/common/x86/variance_mmx.c
+++ b/vp8/common/x86/variance_mmx.c
@@ -35,25 +35,6 @@ extern void filter_block1d_v6_mmx
short *filter
);
-extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
-extern unsigned int vp8_get8x8var_mmx
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-extern unsigned int vp8_get4x4var_mmx
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
@@ -78,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx
unsigned int *sumsquared
);
-
-unsigned int vp8_variance4x4_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
- vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 4));
-
-}
-
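Every wrapper removed from this file applies the same identity: variance = SSE - Sum^2 / N, where N is the block's pixel count and the division is a right shift by log2(N) (4 for 4x4, 6 for 8x8, 7 for 8x16 and 16x8, 8 for 16x16). A minimal sketch of that shared step:

    /* Sketch of the variance formula used by the removed wrappers: sse is
     * the sum of squared differences, sum the signed difference sum, and
     * log2_pixel_count is log2(N) for the block size. */
    static unsigned int block_variance(unsigned int sse, int sum,
                                       int log2_pixel_count) {
      return sse - (((unsigned int)sum * sum) >> log2_pixel_count);
    }
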
-unsigned int vp8_variance8x8_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
- vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
- *sse = var;
-
- return (var - (((unsigned int)avg * avg) >> 6));
-
-}
-
-unsigned int vp8_mse16x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3;
-
-
- vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
- vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- *sse = var;
- return var;
-}
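vp8_mse16x16_mmx above tiles the 16x16 block into four 8x8 calls and discards the per-block sums, since MSE does not subtract the mean. A sketch of the tiling, with get8x8var standing in for any get8x8var-style helper (the parameter is a stand-in, not a real library symbol):

    /* Sketch of the 8x8 tiling used by the removed vp8_mse16x16_mmx. */
    static unsigned int mse16x16_from_8x8(
        const unsigned char *src, int src_stride,
        const unsigned char *ref, int ref_stride,
        void (*get8x8var)(const unsigned char *, int, const unsigned char *,
                          int, unsigned int *, int *)) {
      unsigned int sse[4];
      int sum[4];
      get8x8var(src, src_stride, ref, ref_stride, &sse[0], &sum[0]);
      get8x8var(src + 8, src_stride, ref + 8, ref_stride, &sse[1], &sum[1]);
      get8x8var(src + 8 * src_stride, src_stride,
                ref + 8 * ref_stride, ref_stride, &sse[2], &sum[2]);
      get8x8var(src + 8 * src_stride + 8, src_stride,
                ref + 8 * ref_stride + 8, ref_stride, &sse[3], &sum[3]);
      return sse[0] + sse[1] + sse[2] + sse[3];  /* sums intentionally unused */
    }
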
-
-
-unsigned int vp8_variance16x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3, avg;
-
-
- vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
- vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- avg = sum0 + sum1 + sum2 + sum3;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp8_variance16x8_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-
-unsigned int vp8_variance8x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
-
- return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-
unsigned int vp8_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,
diff --git a/vp8/common/x86/variance_sse2.c b/vp8/common/x86/variance_sse2.c
index 6c6539d8e..f6dfb2787 100644
--- a/vp8/common/x86/variance_sse2.c
+++ b/vp8/common/x86/variance_sse2.c
@@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
unsigned int *sumsquared
);
-extern unsigned int vp8_get4x4var_mmx
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-
-unsigned int vp8_get_mb_ss_sse2
-(
- const short *src_ptr
-);
-unsigned int vp8_get16x16var_sse2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-unsigned int vp8_get8x8var_sse2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
void vp8_filter_block2d_bil_var_sse2
(
const unsigned char *ref_ptr,
@@ -136,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2
unsigned int *sumsquared
);
-unsigned int vp8_variance4x4_wmt(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
- vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 4));
-
-}
-
-unsigned int vp8_variance8x8_wmt
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int var;
- int avg;
-
- vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 6));
-
-}
-
-
-unsigned int vp8_variance16x16_wmt
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0;
- int sum0;
-
-
- vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- *sse = sse0;
- return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
-}
-unsigned int vp8_mse16x16_wmt(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
-
- unsigned int sse0;
- int sum0;
- vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- *sse = sse0;
- return sse0;
-
-}
-
-
-unsigned int vp8_variance16x8_wmt
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-unsigned int vp8_variance8x16_wmt
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
unsigned int vp8_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,
diff --git a/vp8/common/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c
index d8c8da540..2a0df640a 100644
--- a/vp8/common/x86/variance_ssse3.c
+++ b/vp8/common/x86/variance_ssse3.c
@@ -13,15 +13,6 @@
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"
-extern unsigned int vp8_get16x16var_sse2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
extern void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
diff --git a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
deleted file mode 100644
index 000805d4f..000000000
--- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mse16x16_armv6|
-
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-;
-;note: Based on vp8_variance16x16_armv6. In this function, the sum is never
-; used, so that part of the calculation has been removed.
-
-|vp8_mse16x16_armv6| PROC
-
- push {r4-r9, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov r4, #0 ; initialize sse = 0
-
-loop
- ; 1st 4 pixels
- ldr r5, [r0, #0x0] ; load 4 src pixels
- ldr r6, [r2, #0x0] ; load 4 ref pixels
-
- mov lr, #0 ; constant zero
-
- usub8 r8, r5, r6 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
-
- ldr r5, [r0, #0x4] ; load 4 src pixels
-
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r6, [r2, #0x4] ; load 4 ref pixels
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r5, r6 ; calculate difference
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
- ldr r5, [r0, #0x8] ; load 4 src pixels
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r6, [r2, #0x8] ; load 4 ref pixels
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r5, r6 ; calculate difference
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
-
- ldr r5, [r0, #0xc] ; load 4 src pixels
-
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r6, [r2, #0xc] ; load 4 ref pixels
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r5, r6 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
-
- subs r12, r12, #1 ; next row
-
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r1, [sp, #28] ; get address of sse
- mov r0, r4 ; return sse
- str r4, [r1] ; store sse
-
- pop {r4-r9, pc}
-
- ENDP
-
- END
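The deleted ARMv6 loop gets per-byte absolute differences from the usub8/sel pair, splits each 4-byte group into halfword pairs with uxtb16, and accumulates two squared differences per smlad. A hedged scalar sketch of what one 4-pixel group adds to the running SSE:

    #include <stdint.h>

    /* Scalar sketch of one "4 pixels" step of the ARMv6 MSE loop above. */
    static unsigned int sse_of_4_pixels(const uint8_t *src, const uint8_t *ref,
                                        unsigned int sse) {
      int i;
      for (i = 0; i < 4; ++i) {
        const int diff = src[i] - ref[i];    /* usub8/sel keep |diff| per byte */
        sse += (unsigned int)(diff * diff);  /* smlad accumulates the squares  */
      }
      return sse;
    }
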
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c b/vp8/encoder/arm/neon/vp8_mse16x16_neon.c
deleted file mode 100644
index f806809df..000000000
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-unsigned int vp8_mse16x16_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- int64x1_t d0s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- int32x4_t q7s32, q8s32, q9s32, q10s32;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int64x2_t q1s64;
-
- q7s32 = vdupq_n_s32(0);
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) { // mse16x16_neon_loop
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
- q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
- q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q7s32 = vaddq_s32(q7s32, q8s32);
- q9s32 = vaddq_s32(q9s32, q10s32);
- q10s32 = vaddq_s32(q7s32, q9s32);
-
- q1s64 = vpaddlq_s32(q10s32);
- d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
- return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
-
-unsigned int vp8_get4x4sse_cs_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride) {
- int16x4_t d22s16, d24s16, d26s16, d28s16;
- int64x1_t d0s64;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- int32x4_t q7s32, q8s32, q9s32, q10s32;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int64x2_t q1s64;
-
- d0u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d1u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d5u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d3u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d7u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d1u8, d5u8);
- q13u16 = vsubl_u8(d2u8, d6u8);
- q14u16 = vsubl_u8(d3u8, d7u8);
-
- d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
- d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
- d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
- d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
-
- q7s32 = vmull_s16(d22s16, d22s16);
- q8s32 = vmull_s16(d24s16, d24s16);
- q9s32 = vmull_s16(d26s16, d26s16);
- q10s32 = vmull_s16(d28s16, d28s16);
-
- q7s32 = vaddq_s32(q7s32, q8s32);
- q9s32 = vaddq_s32(q9s32, q10s32);
- q9s32 = vaddq_s32(q7s32, q9s32);
-
- q1s64 = vpaddlq_s32(q9s32);
- d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 378e902c6..d381d8ddf 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -11,6 +11,7 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "encodemb.h"
#include "encodemv.h"
#include "vp8/common/common.h"
@@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
* lambda using a non-linear combination (e.g., the smallest, or second
* smallest, etc.).
*/
- act = vp8_variance16x16(x->src.y_buffer,
+ act = vpx_variance16x16(x->src.y_buffer,
x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
act = act<<4;
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index cfa4cb927..e2de5eecb 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -11,6 +11,7 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "quantize.h"
#include "vp8/common/reconintra4x4.h"
#include "encodemb.h"
@@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
}
}
- intra_pred_var = vp8_get_mb_ss(x->src_diff);
+ intra_pred_var = vpx_get_mb_ss(x->src_diff);
return intra_pred_var;
}
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index a6ff0e7a0..3deb4abb3 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -12,6 +12,7 @@
#include <limits.h>
#include <stdio.h>
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "block.h"
#include "onyx_int.h"
@@ -422,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
/* Set up pointers for this macro block raw buffer */
raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
+ d->offset);
- vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
- (unsigned int *)(raw_motion_err));
+ vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride,
+ (unsigned int *)(raw_motion_err));
/* Set up pointers for this macro block recon buffer */
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
- vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
- (unsigned int *)(best_motion_err));
+ vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
+ (unsigned int *)(best_motion_err));
}
static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
@@ -453,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
int new_mv_mode_penalty = 256;
/* override the default variance function to use MSE */
- v_fn_ptr.vf = vp8_mse16x16;
+ v_fn_ptr.vf = vpx_mse16x16;
/* Set up pointers for this macro block recon buffer */
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index c2bb23295..40e29e191 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2131,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
#endif
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
- cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
+ cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
@@ -2141,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
- cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
+ cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
@@ -2151,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
- cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
+ cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
@@ -2161,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
- cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
+ cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
@@ -2171,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
- cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
+ cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
@@ -2558,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
{
unsigned int sse;
- vp8_mse16x16(orig + col, orig_stride,
+ vpx_mse16x16(orig + col, orig_stride,
recon + col, recon_stride,
&sse);
total_sse += sse;
@@ -3384,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
int index = block_index_row + (j >> 4);
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
unsigned int sse;
- Total += vp8_mse16x16(src + j,
+ Total += vpx_mse16x16(src + j,
source->y_stride,
dst + j, dest->y_stride,
&sse);
@@ -3448,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
int index = block_index_row + (j >> 4);
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
unsigned int sse;
- const unsigned int var = vp8_variance16x16(src + j,
+ const unsigned int var = vpx_variance16x16(src + j,
ystride,
dst + j,
ystride,
@@ -3458,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
// is small (to avoid effects from lighting change).
if ((sse - var) < 128) {
unsigned int sse2;
- const unsigned int act = vp8_variance16x16(src + j,
+ const unsigned int act = vpx_variance16x16(src + j,
ystride,
const_source,
0,
@@ -5993,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
for (j = 0; j < source->y_width; j += 16)
{
unsigned int sse;
- Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+ Total += vpx_mse16x16(src + j, source->y_stride,
+ dst + j, dest->y_stride, &sse);
}
src += 16 * source->y_stride;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 98ea5a040..053bf119a 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -11,6 +11,7 @@
#include <limits.h>
#include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "onyx_int.h"
#include "modecosts.h"
#include "encodeintra.h"
@@ -215,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
}
-
-unsigned int vp8_get4x4sse_cs_c
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride
-)
-{
- int distortion = 0;
- int r, c;
-
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- int diff = src_ptr[c] - ref_ptr[c];
- distortion += diff * diff;
- }
-
- src_ptr += source_stride;
- ref_ptr += recon_stride;
- }
-
- return distortion;
-}
-
static int get_prediction_error(BLOCK *be, BLOCKD *b)
{
unsigned char *sptr;
@@ -249,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b)
sptr = (*(be->base_src) + be->src);
dptr = b->predictor;
- return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
+ return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
}
@@ -1037,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
else
{
rate2 += rate;
- distortion2 = vp8_variance16x16(
+ distortion2 = vpx_variance16x16(
*(b->base_src), b->src_stride,
x->e_mbd.predictor, 16, &sse);
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@@ -1066,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
xd->dst.y_stride,
xd->predictor,
16);
- distortion2 = vp8_variance16x16
+ distortion2 = vpx_variance16x16
(*(b->base_src), b->src_stride,
x->e_mbd.predictor, 16, &sse);
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
@@ -1547,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
xd->dst.y_stride,
xd->predictor,
16);
- distortion = vp8_variance16x16
+ distortion = vpx_variance16x16
(*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
rate = x->mbmode_cost[xd->frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index 890053dcf..875b37f68 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -9,6 +9,7 @@
*/
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "onyx_int.h"
@@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
for (j = 0; j < source->y_width; j += 16)
{
unsigned int sse;
- Total += vp8_mse16x16(src + j, source->y_stride,
+ Total += vpx_mse16x16(src + j, source->y_stride,
dst + j, dest->y_stride,
&sse);
}
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 10d340880..e8796a1fc 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1587,7 +1587,7 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
// Threshold for the average (over all macroblocks) of the pixel-sum
// residual error over 16x16 block. Should add QP dependence on threshold?
int thresh_pred_err_mb = (256 << 4);
- int pred_err_mb = cpi->mb.prediction_error / cpi->common.MBs;
+ int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs);
if (Q < thresh_qp &&
cpi->projected_frame_size > thresh_rate &&
pred_err_mb > thresh_pred_err_mb) {
@@ -1601,7 +1601,9 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
cpi->force_maxqp = 0;
return 0;
}
+ cpi->force_maxqp = 0;
return 0;
}
+ cpi->force_maxqp = 0;
return 0;
}
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 9ccd85eb9..17194f0d4 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -15,6 +15,7 @@
#include <assert.h>
#include "vpx_config.h"
#include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "tokenize.h"
#include "treewriter.h"
#include "onyx_int.h"
@@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x)
}
else
{
- vp8_variance8x8(uptr, pre_stride,
+ vpx_variance8x8(uptr, pre_stride,
upred_ptr, uv_stride, &sse2);
- vp8_variance8x8(vptr, pre_stride,
+ vpx_variance8x8(vptr, pre_stride,
vpred_ptr, uv_stride, &sse1);
sse2 += sse1;
}
@@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4],
if(threshold < x->encode_breakout)
threshold = x->encode_breakout;
- var = vp8_variance16x16
+ var = vpx_variance16x16
(*(b->base_src), b->src_stride,
x->e_mbd.predictor, 16, &sse);
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index b4c814075..c71d592f5 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
@@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 050030179..0b0f6a70a 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
#File list for media
# encoder
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
@@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index d0beaa720..66cf6600e 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -11,463 +11,415 @@
#include <stddef.h>
#include <arm_neon.h>
-void vp9_v_predictor_4x4_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint32x2_t d0u32 = vdup_n_u32(0);
- (void)left;
+void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint32x2_t d0u32 = vdup_n_u32(0);
+ (void)left;
- d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
- for (i = 0; i < 4; i++, dst += y_stride)
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
- return;
+ d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += y_stride)
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
}
-void vp9_v_predictor_8x8_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint8x8_t d0u8 = vdup_n_u8(0);
- (void)left;
+void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ (void)left;
- d0u8 = vld1_u8(above);
- for (i = 0; i < 8; i++, dst += y_stride)
- vst1_u8(dst, d0u8);
- return;
+ d0u8 = vld1_u8(above);
+ for (i = 0; i < 8; i++, dst += y_stride)
+ vst1_u8(dst, d0u8);
}
-void vp9_v_predictor_16x16_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- (void)left;
+void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ (void)left;
- q0u8 = vld1q_u8(above);
- for (i = 0; i < 16; i++, dst += y_stride)
- vst1q_u8(dst, q0u8);
- return;
+ q0u8 = vld1q_u8(above);
+ for (i = 0; i < 16; i++, dst += y_stride)
+ vst1q_u8(dst, q0u8);
}
-void vp9_v_predictor_32x32_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)left;
+void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)left;
- q0u8 = vld1q_u8(above);
- q1u8 = vld1q_u8(above + 16);
- for (i = 0; i < 32; i++, dst += y_stride) {
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- }
- return;
+ q0u8 = vld1q_u8(above);
+ q1u8 = vld1q_u8(above + 16);
+ for (i = 0; i < 32; i++, dst += y_stride) {
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ }
}
-void vp9_h_predictor_4x4_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d1u32 = vdup_n_u32(0);
- (void)above;
+void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d1u32 = vdup_n_u32(0);
+ (void)above;
- d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+ d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- return;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}
-void vp9_h_predictor_8x8_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint64x1_t d1u64 = vdup_n_u64(0);
- (void)above;
+void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint64x1_t d1u64 = vdup_n_u64(0);
+ (void)above;
- d1u64 = vld1_u64((const uint64_t *)left);
+ d1u64 = vld1_u64((const uint64_t *)left);
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
- vst1_u8(dst, d0u8);
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+ vst1_u8(dst, d0u8);
+}
+
+void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
- vst1_u8(dst, d0u8);
- return;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ }
}
-void vp9_h_predictor_16x16_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)above;
+void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+ for (k = 0; k < 2; k++, left += 16) {
q1u8 = vld1q_u8(left);
d2u8 = vget_low_u8(q1u8);
for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
}
- return;
+ }
}
-void vp9_h_predictor_32x32_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j, k;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)above;
+void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint16x8_t q1u16, q3u16;
+ int16x8_t q1s16;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d2u32 = vdup_n_u32(0);
- for (k = 0; k < 2; k++, left += 16) {
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- }
- }
- return;
+ d0u8 = vdup_n_u8(above[-1]);
+ d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+ q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+ for (i = 0; i < 4; i++, dst += y_stride) {
+ q1u16 = vdupq_n_u16((uint16_t)left[i]);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+ vreinterpretq_s16_u16(q3u16));
+ d0u8 = vqmovun_s16(q1s16);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ }
}
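The TM (TrueMotion) predictors being reindented in this hunk all compute each output pixel as clip8(left[r] + above[c] - above[-1]); the NEON code keeps (above[c] - above[-1]) widened to 16 bits and lets vqmovun_s16 perform the final clamp. A scalar sketch of the rule, with a generic block size bs for illustration:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar sketch of the TM predictor the NEON functions implement;
     * bs is the block size (4, 8, 16 or 32). */
    static void tm_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bs,
                                 const uint8_t *above, const uint8_t *left) {
      const int top_left = above[-1];
      int r, c;
      for (r = 0; r < bs; ++r) {
        for (c = 0; c < bs; ++c) {
          const int v = left[r] + above[c] - top_left;
          dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
        dst += stride;
      }
    }
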
-void vp9_tm_predictor_4x4_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint16x8_t q1u16, q3u16;
- int16x8_t q1s16;
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d2u32 = vdup_n_u32(0);
+void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j;
+ uint16x8_t q0u16, q3u16, q10u16;
+ int16x8_t q0s16;
+ uint16x4_t d20u16;
+ uint8x8_t d0u8, d2u8, d30u8;
- d0u8 = vdup_n_u8(above[-1]);
- d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
- q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
- for (i = 0; i < 4; i++, dst += y_stride) {
- q1u16 = vdupq_n_u16((uint16_t)left[i]);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
- vreinterpretq_s16_u16(q3u16));
- d0u8 = vqmovun_s16(q1s16);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- }
- return;
+ d0u8 = vdup_n_u8(above[-1]);
+ d30u8 = vld1_u8(left);
+ d2u8 = vld1_u8(above);
+ q10u16 = vmovl_u8(d30u8);
+ q3u16 = vsubl_u8(d2u8, d0u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 1);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 3);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ }
}
-void vp9_tm_predictor_8x8_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j;
- uint16x8_t q0u16, q3u16, q10u16;
- int16x8_t q0s16;
- uint16x4_t d20u16;
- uint8x8_t d0u8, d2u8, d30u8;
+void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+ uint8x16_t q0u8, q1u8;
+ int16x8_t q0s16, q1s16, q8s16, q11s16;
+ uint16x4_t d20u16;
+ uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
- d0u8 = vdup_n_u8(above[-1]);
- d30u8 = vld1_u8(left);
- d2u8 = vld1_u8(above);
- q10u16 = vmovl_u8(d30u8);
- q3u16 = vsubl_u8(d2u8, d0u8);
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ for (k = 0; k < 2; k++, left += 8) {
+ d18u8 = vld1_u8(left);
+ q10u16 = vmovl_u8(d18u8);
d20u16 = vget_low_u16(q10u16);
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += y_stride;
- q0u16 = vdupq_lane_u16(d20u16, 1);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += y_stride;
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += y_stride;
- q0u16 = vdupq_lane_u16(d20u16, 3);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += y_stride;
- }
- return;
-}
-
-void vp9_tm_predictor_16x16_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
- uint8x16_t q0u8, q1u8;
- int16x8_t q0s16, q1s16, q8s16, q11s16;
- uint16x4_t d20u16;
- uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
-
- q0u8 = vdupq_n_u8(above[-1]);
- q1u8 = vld1q_u8(above);
- q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- for (k = 0; k < 2; k++, left += 8) {
- d18u8 = vld1_u8(left);
- q10u16 = vmovl_u8(d18u8);
- d20u16 = vget_low_u16(q10u16);
- for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q8u16 = vdupq_lane_u16(d20u16, 1);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q2u16));
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q3u16));
- q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q2u16));
- q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += y_stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q8u16 = vdupq_lane_u16(d20u16, 1);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q8u16 = vdupq_lane_u16(d20u16, 3);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q2u16));
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q3u16));
- q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q2u16));
- q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += y_stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += y_stride;
- }
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q8u16 = vdupq_lane_u16(d20u16, 3);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
}
- return;
+ }
}
-void vp9_tm_predictor_32x32_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
- uint8x16_t q0u8, q1u8, q2u8;
- int16x8_t q12s16, q13s16, q14s16, q15s16;
- uint16x4_t d6u16;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+ uint8x16_t q0u8, q1u8, q2u8;
+ int16x8_t q12s16, q13s16, q14s16, q15s16;
+ uint16x4_t d6u16;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
- q0u8 = vdupq_n_u8(above[-1]);
- q1u8 = vld1q_u8(above);
- q2u8 = vld1q_u8(above + 16);
- q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
- q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
- for (k = 0; k < 4; k++, left += 8) {
- d26u8 = vld1_u8(left);
- q3u16 = vmovl_u8(d26u8);
- d6u16 = vget_low_u16(q3u16);
- for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
- q0u16 = vdupq_lane_u16(d6u16, 0);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += y_stride;
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u8 = vld1q_u8(above + 16);
+ q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+ q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+ for (k = 0; k < 4; k++, left += 8) {
+ d26u8 = vld1_u8(left);
+ q3u16 = vmovl_u8(d26u8);
+ d6u16 = vget_low_u16(q3u16);
+ for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+ q0u16 = vdupq_lane_u16(d6u16, 0);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
- q0u16 = vdupq_lane_u16(d6u16, 1);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += y_stride;
+ q0u16 = vdupq_lane_u16(d6u16, 1);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
- q0u16 = vdupq_lane_u16(d6u16, 2);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += y_stride;
+ q0u16 = vdupq_lane_u16(d6u16, 2);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
- q0u16 = vdupq_lane_u16(d6u16, 3);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += y_stride;
- }
+ q0u16 = vdupq_lane_u16(d6u16, 3);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
}
- return;
+ }
}
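All three NEON routines above implement the same TM (true-motion) prediction rule: every output pixel is left[row] + above[col] - above[-1], saturated to 8 bits; the intrinsics only vectorize this across 8, 16 or 32 columns at a time. A scalar sketch of that rule, for reference only and not part of this change:

/* Scalar reference for the TM predictor (illustrative sketch). */
static void tm_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bs,
                             const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < bs; ++r) {
    for (c = 0; c < bs; ++c) {
      const int v = left[r] + above[c] - top_left;
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* saturate to 8 bits */
    }
    dst += stride;
  }
}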
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 600cb13d8..8eda491de 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -57,6 +57,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
if (cm->seg_map_array[i] == NULL)
return 1;
}
+ cm->seg_map_alloc_size = seg_map_size;
// Init the index.
cm->seg_map_idx = 0;
@@ -118,25 +119,36 @@ void vp9_free_context_buffers(VP9_COMMON *cm) {
}
int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
- vp9_free_context_buffers(cm);
+ int new_mi_size;
vp9_set_mb_mi(cm, width, height);
- if (cm->alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows)))
- goto fail;
-
- // Create the segmentation map structure and set to 0.
- free_seg_map(cm);
- if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
- goto fail;
+ new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+ if (cm->mi_alloc_size < new_mi_size) {
+ cm->free_mi(cm);
+ if (cm->alloc_mi(cm, new_mi_size))
+ goto fail;
+ }
- cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
- 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
- sizeof(*cm->above_context));
- if (!cm->above_context) goto fail;
+ if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
+ // Create the segmentation map structure and set to 0.
+ free_seg_map(cm);
+ if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
+ goto fail;
+ }
- cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
- mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
- if (!cm->above_seg_context) goto fail;
+ if (cm->above_context_alloc_cols < cm->mi_cols) {
+ vpx_free(cm->above_context);
+ cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
+ 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
+ sizeof(*cm->above_context));
+ if (!cm->above_context) goto fail;
+
+ vpx_free(cm->above_seg_context);
+ cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
+ mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
+ if (!cm->above_seg_context) goto fail;
+ cm->above_context_alloc_cols = cm->mi_cols;
+ }
return 0;
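The reallocation strategy above is grow-only: vp9_alloc_context_buffers() now keeps each buffer across resolution changes and frees/reallocates it only when the new frame needs more space than the tracked capacity (mi_alloc_size, seg_map_alloc_size, above_context_alloc_cols). A minimal sketch of that pattern, with a hypothetical helper name, assuming the vpx_mem allocators:

/* Grow-only buffer reuse (illustrative sketch, hypothetical helper). */
typedef struct {
  uint8_t *buf;
  int alloc_size;
} grow_buf;

static int grow_buf_ensure(grow_buf *gb, int needed) {
  if (gb->alloc_size < needed) {
    vpx_free(gb->buf);                                  /* too small: replace */
    gb->buf = (uint8_t *)vpx_calloc(needed, sizeof(*gb->buf));
    if (!gb->buf) return 1;                             /* allocation failed */
    gb->alloc_size = needed;                            /* remember capacity */
  }
  return 0;                                             /* reuse existing buffer */
}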
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 097053a7d..319d34832 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -18,74 +18,28 @@
#include "vpx_scale/yv12config.h"
#include "vp9/common/vp9_common_data.h"
-#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_scale.h"
+#include "vp9/common/vp9_seg_common.h"
#ifdef __cplusplus
extern "C" {
#endif
-#define BLOCK_SIZE_GROUPS 4
-#define SKIP_CONTEXTS 3
-#define INTER_MODE_CONTEXTS 7
-
-/* Segment Feature Masks */
-#define MAX_MV_REF_CANDIDATES 2
-
-#define INTRA_INTER_CONTEXTS 4
-#define COMP_INTER_CONTEXTS 5
-#define REF_CONTEXTS 5
-
-typedef enum {
- PLANE_TYPE_Y = 0,
- PLANE_TYPE_UV = 1,
- PLANE_TYPES
-} PLANE_TYPE;
-
#define MAX_MB_PLANE 3
-typedef char ENTROPY_CONTEXT;
-
-static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
- ENTROPY_CONTEXT b) {
- return (a != 0) + (b != 0);
-}
-
typedef enum {
KEY_FRAME = 0,
INTER_FRAME = 1,
FRAME_TYPES,
} FRAME_TYPE;
-typedef enum {
- DC_PRED, // Average of above and left pixels
- V_PRED, // Vertical
- H_PRED, // Horizontal
- D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi)
- D135_PRED, // Directional 135 deg = 180 - 45
- D117_PRED, // Directional 117 deg = 180 - 63
- D153_PRED, // Directional 153 deg = 180 - 27
- D207_PRED, // Directional 207 deg = 180 + 27
- D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi)
- TM_PRED, // True-motion
- NEARESTMV,
- NEARMV,
- ZEROMV,
- NEWMV,
- MB_MODE_COUNT
-} PREDICTION_MODE;
-
static INLINE int is_inter_mode(PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
-#define INTRA_MODES (TM_PRED + 1)
-
-#define INTER_MODES (1 + NEWMV - NEARESTMV)
-
-#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
-
/* For keyframes, intra block modes are predicted by the (already decoded)
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
@@ -281,6 +235,27 @@ static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
}
+static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ memset(pd->above_context, 0,
+ sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]);
+ memset(pd->left_context, 0,
+ sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]);
+ }
+}
+
+static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi,
+ const MODE_INFO *above_mi,
+ const MODE_INFO *left_mi,
+ int block) {
+ const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
+ const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
+ return vp9_kf_y_mode_prob[above][left];
+}
+
typedef void (*foreach_transformed_block_visitor)(int plane, int block,
BLOCK_SIZE plane_bsize,
TX_SIZE tx_size,
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 5a9007b54..4e02630e6 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -14,8 +14,8 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_prob.h"
-#include "vp9/common/vp9_scan.h"
#ifdef __cplusplus
extern "C" {
@@ -137,18 +137,6 @@ struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *cm);
void vp9_adapt_coef_probs(struct VP9Common *cm);
-static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++) {
- struct macroblockd_plane *const pd = &xd->plane[i];
- const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
- memset(pd->above_context, 0,
- sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]);
- memset(pd->left_context, 0,
- sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]);
- }
-}
-
// This is the index in the scan order beyond which all coefficients for
// 8x8 transform and above are in the top band.
// This macro is currently unused but may be used by certain implementations
@@ -185,6 +173,13 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
+typedef char ENTROPY_CONTEXT;
+
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+ ENTROPY_CONTEXT b) {
+ return (a != 0) + (b != 0);
+}
+
static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
const ENTROPY_CONTEXT *l) {
ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
@@ -214,18 +209,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
return combine_entropy_contexts(above_ec, left_ec);
}
-static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
- PLANE_TYPE type, int block_idx) {
- const MODE_INFO *const mi = xd->mi[0];
-
- if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
- return &vp9_default_scan_orders[tx_size];
- } else {
- const PREDICTION_MODE mode = get_y_mode(mi, block_idx);
- return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
- }
-}
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index f4e20e1af..a0619ec6f 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -11,7 +11,7 @@
#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
#define VP9_COMMON_VP9_ENTROPYMODE_H_
-#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymv.h"
@@ -19,8 +19,12 @@
extern "C" {
#endif
+#define BLOCK_SIZE_GROUPS 4
+
#define TX_SIZE_CONTEXTS 2
+#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
+
struct VP9Common;
struct tx_probs {
@@ -97,15 +101,6 @@ void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]);
-static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi,
- const MODE_INFO *above_mi,
- const MODE_INFO *left_mi,
- int block) {
- const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
- const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
- return vp9_kf_y_mode_prob[above][left];
-}
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 7938fc10a..048202593 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -104,6 +104,44 @@ typedef enum {
VP9_ALT_FLAG = 1 << 2,
} VP9_REFFRAME;
+typedef enum {
+ PLANE_TYPE_Y = 0,
+ PLANE_TYPE_UV = 1,
+ PLANE_TYPES
+} PLANE_TYPE;
+
+typedef enum {
+ DC_PRED, // Average of above and left pixels
+ V_PRED, // Vertical
+ H_PRED, // Horizontal
+ D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi)
+ D135_PRED, // Directional 135 deg = 180 - 45
+ D117_PRED, // Directional 117 deg = 180 - 63
+ D153_PRED, // Directional 153 deg = 180 - 27
+ D207_PRED, // Directional 207 deg = 180 + 27
+ D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi)
+ TM_PRED, // True-motion
+ NEARESTMV,
+ NEARMV,
+ ZEROMV,
+ NEWMV,
+ MB_MODE_COUNT
+} PREDICTION_MODE;
+
+#define INTRA_MODES (TM_PRED + 1)
+
+#define INTER_MODES (1 + NEWMV - NEARESTMV)
+
+#define SKIP_CONTEXTS 3
+#define INTER_MODE_CONTEXTS 7
+
+/* Segment Feature Masks */
+#define MAX_MV_REF_CANDIDATES 2
+
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 5
+
#ifdef __cplusplus
} // extern "C"
#endif
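Relocating PREDICTION_MODE here keeps the derived counts self-consistent: DC_PRED through TM_PRED are the ten intra modes, so INTRA_MODES expands to 10, and NEARESTMV through NEWMV give INTER_MODES == 4. A compile-time check of that arithmetic, purely as an illustration and not part of the patch:

/* Illustrative compile-time sanity checks of the derived counts. */
typedef char intra_modes_is_10[(INTRA_MODES == 10) ? 1 : -1];
typedef char inter_modes_is_4[(INTER_MODES == 4) ? 1 : -1];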
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index 57189df16..bebb37eda 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
if (bs == BLOCK_16X16) {
- vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+ vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
} else if (bs == BLOCK_32X32) {
- vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+ vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
} else /* if (bs == BLOCK_64X64) */ {
- vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+ vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
}
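The rounding shifts above turn block-wide sums into per-pixel averages: (x + 128) >> 8, (x + 512) >> 10 and (x + 2048) >> 12 are rounded divisions by 256, 1024 and 4096, the pixel counts of 16x16, 32x32 and 64x64 blocks. A small helper expressing the same idea, for illustration only:

/* Rounded per-pixel average: (sum + 2^(k-1)) >> k == round(sum / 2^k). */
static unsigned int per_pixel_avg(unsigned int sum, int log2_pixels) {
  return (sum + (1u << (log2_pixels - 1))) >> log2_pixels;
}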
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 51e147e00..ce6952752 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -223,6 +223,6 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
break;
}
default:
- assert("Invalid block index.");
+ assert(0 && "Invalid block index.");
}
}
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 5179c6906..188b03d41 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -14,6 +14,7 @@
#include "./vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_entropy.h"
@@ -220,6 +221,7 @@ typedef struct VP9Common {
uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
uint8_t *last_frame_seg_map;
uint8_t *current_frame_seg_map;
+ int seg_map_alloc_size;
INTERP_FILTER interp_filter;
@@ -276,6 +278,7 @@ typedef struct VP9Common {
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
+ int above_context_alloc_cols;
} VP9_COMMON;
// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@@ -305,8 +308,13 @@ static INLINE int get_free_fb(VP9_COMMON *cm) {
if (frame_bufs[i].ref_count == 0)
break;
- assert(i < FRAME_BUFFERS);
- frame_bufs[i].ref_count = 1;
+ if (i != FRAME_BUFFERS) {
+ frame_bufs[i].ref_count = 1;
+ } else {
+ // Reset i to be INVALID_IDX to indicate no free buffer found.
+ i = INVALID_IDX;
+ }
+
unlock_buffer_pool(cm->buffer_pool);
return i;
}
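After this change get_free_fb() reports pool exhaustion by returning INVALID_IDX instead of asserting, so callers are expected to test the return value. A hedged sketch of the caller side, using a hypothetical wrapper; the actual encoder/decoder error handling may differ:

/* Illustrative caller-side handling of an exhausted buffer pool. */
static vpx_codec_err_t acquire_frame_buffer(VP9_COMMON *cm, int *idx) {
  const int new_fb_idx = get_free_fb(cm);
  if (new_fb_idx == INVALID_IDX)
    return VPX_CODEC_MEM_ERROR;  /* no free frame buffer available */
  *idx = new_fb_idx;
  return VPX_CODEC_OK;
}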
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 2a9736b40..30710ba00 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
# variance
-add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x4/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x4/, "$sse2_x86inc";
-
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -922,21 +877,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
-add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
-
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon/;
@@ -1141,142 +1081,6 @@ specialize qw/vp9_temporal_filter_apply sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- # variance
- add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x4/;
-
- add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance4x4/;
-
- add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
-
- add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x4/;
-
- add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance4x4/;
-
- add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
-
- add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x4/;
-
- add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance4x4/;
-
- add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
-
- add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
-
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
@@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
- add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse8x16/;
-
- add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse16x8/;
-
- add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse8x16/;
-
- add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse16x8/;
-
- add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse8x16/;
-
- add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse16x8/;
-
- add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
index 65e2aa69a..1d86b5cfe 100644
--- a/vp9/common/vp9_scan.h
+++ b/vp9/common/vp9_scan.h
@@ -38,6 +38,18 @@ static INLINE int get_coef_context(const int16_t *neighbors,
token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}
+static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+ PLANE_TYPE type, int block_idx) {
+ const MODE_INFO *const mi = xd->mi[0];
+
+ if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
+ return &vp9_default_scan_orders[tx_size];
+ } else {
+ const PREDICTION_MODE mode = get_y_mode(mi, block_idx);
+ return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index e97115823..fc77762de 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -11,13 +11,14 @@
#ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
#define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
+#include "vpx_ports/msvc.h"
+
#ifdef _MSC_VER
# include <math.h> // the ceil() definition must precede intrin.h
# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86))
# include <intrin.h>
-# define USE_MSC_INTRIN
+# define USE_MSC_INTRINSICS
# endif
-# define snprintf _snprintf
#endif
#ifdef __cplusplus
@@ -48,7 +49,7 @@ static INLINE int round(double x) {
static INLINE int get_msb(unsigned int n) {
return 31 ^ __builtin_clz(n);
}
-#elif defined(USE_MSC_INTRIN)
+#elif defined(USE_MSC_INTRINSICS)
#pragma intrinsic(_BitScanReverse)
static INLINE int get_msb(unsigned int n) {
@@ -56,7 +57,7 @@ static INLINE int get_msb(unsigned int n) {
_BitScanReverse(&first_set_bit, n);
return first_set_bit;
}
-#undef USE_MSC_INTRIN
+#undef USE_MSC_INTRINSICS
#else
// Returns (int)floor(log2(n)). n must be > 0.
static INLINE int get_msb(unsigned int n) {
diff --git a/vp9/common/x86/convolve.h b/vp9/common/x86/convolve.h
new file mode 100644
index 000000000..de2df47e5
--- /dev/null
+++ b/vp9/common/x86/convolve.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_COMMON_X86_CONVOLVE_H_
+#define VP9_COMMON_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+typedef void filter8_1dfunction (
+ const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter
+);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h) { \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (filter[0] || filter[1] || filter[2]) { \
+ while (w >= 16) { \
+ vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vp9_filter_block1d16_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_filter_block1d8_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_filter_block1d4_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h); \
+ } \
+}
+
+#define FUN_CONV_2D(avg, opt) \
+void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h) { \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+ vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h + 7); \
+ vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
+ vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h + 1); \
+ vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } \
+ } else { \
+ vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void highbd_filter8_1dfunction (
+ const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter,
+ int bd
+);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+ ptrdiff_t src_stride, \
+ uint8_t *dst8, \
+ ptrdiff_t dst_stride, \
+ const int16_t *filter_x, \
+ int x_step_q4, \
+ const int16_t *filter_y, \
+ int y_step_q4, \
+ int w, int h, int bd) { \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ if (filter[0] || filter[1] || filter[2]) { \
+ while (w >= 16) { \
+ vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h, bd); \
+ } \
+}
+
+#define HIGH_FUN_CONV_2D(avg, opt) \
+void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h, int bd) { \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+ vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), 64, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h + 7, bd); \
+ vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
+ 64, dst, dst_stride, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h, bd); \
+ } else { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+ vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), 64, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h + 1, bd); \
+ vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+ dst, dst_stride, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h, bd); \
+ } \
+ } else { \
+ vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, \
+ h, bd); \
+ } \
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VP9_COMMON_X86_CONVOLVE_H_
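The macros in this new header are meant to be expanded once per SIMD flavour by the stub/intrinsics files, producing the vp9_convolve8_* entry points from the per-width vp9_filter_block1d* kernels declared alongside them. A sketch of how such an instantiation typically looks; the exact expansions live in the ssse3/avx2 stub files and may differ in detail:

/* Illustrative instantiation of the convolve macros for one flavour. */
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, ssse3);
FUN_CONV_2D(, ssse3);     /* vp9_convolve8_ssse3 */
FUN_CONV_2D(avg_, ssse3); /* vp9_convolve8_avg_ssse3 */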
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 963023c53..fd55fb8c6 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -8,421 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <assert.h>
-
-#include "./vpx_config.h"
#include "./vp9_rtcd.h"
-#include "vpx_ports/mem.h"
-
-typedef void filter8_1dfunction (
- const unsigned char *src_ptr,
- const ptrdiff_t src_pitch,
- unsigned char *output_ptr,
- ptrdiff_t out_pitch,
- unsigned int output_height,
- const short *filter
-);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- if (step_q4 == 16 && filter[3] != 128) { \
- if (filter[0] || filter[1] || filter[2]) { \
- while (w >= 16) { \
- vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vp9_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h); \
- } \
-}
-
-#define FUN_CONV_2D(avg, opt) \
-void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
- filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \
- vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 7); \
- vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } else { \
- DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \
- vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 1); \
- vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } \
- } else { \
- vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
- } \
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-
-typedef void highbd_filter8_1dfunction (
- const uint16_t *src_ptr,
- const ptrdiff_t src_pitch,
- uint16_t *output_ptr,
- ptrdiff_t out_pitch,
- unsigned int output_height,
- const int16_t *filter,
- int bd
-);
-
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
- ptrdiff_t src_stride, \
- uint8_t *dst8, \
- ptrdiff_t dst_stride, \
- const int16_t *filter_x, \
- int x_step_q4, \
- const int16_t *filter_y, \
- int y_step_q4, \
- int w, int h, int bd) { \
- if (step_q4 == 16 && filter[3] != 128) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- if (filter[0] || filter[1] || filter[2]) { \
- while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h, bd); \
- } \
-}
-
-#define HIGH_FUN_CONV_2D(avg, opt) \
-void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h, int bd) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
- filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 7, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
- 64, dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } else { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 1, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
- dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } \
- } else { \
- vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, \
- h, bd); \
- } \
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vp9_filter_block1d16_v8_avx2;
-filter8_1dfunction vp9_filter_block1d16_h8_avx2;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
-#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
-#endif // ARCH_X86_64 / ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
-#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
-#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
-#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
-#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
-#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
-// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, avx2);
-#endif // HAVE_AX2 && HAVE_SSSE3
-#if HAVE_SSSE3
-#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
-#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
-#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
-#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#endif // ARCH_X86_64 / ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
-
-// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
- ssse3);
-
-// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_ , ssse3);
-#endif // HAVE_SSSE3
+#include "./vpx_config.h"
+#include "vp9/common/x86/convolve.h"
#if HAVE_SSE2
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
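
The FUN_CONV_2D macro removed above (now provided by vp9/common/x86/convolve.h) builds the 8-tap 2D convolution from two 1D passes through a 64-column scratch buffer, as the removed body shows. A minimal sketch of that two-pass structure, assuming hypothetical horiz_8tap()/vert_8tap() row filters (this is not the actual macro expansion):

/* Sketch of the two-pass scheme: filter horizontally into a 64-column
 * scratch buffer that also carries the 3 rows above and 4 rows below the
 * block needed by the 8-tap vertical filter, then filter that buffer
 * vertically into the destination.  horiz_8tap() and vert_8tap() are
 * hypothetical stand-ins for the per-width kernels. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef void (*pass_fn)(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter, int w, int h);

static void convolve8_2d_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x,
                                const int16_t *filter_y, int w, int h,
                                pass_fn horiz_8tap, pass_fn vert_8tap) {
  uint8_t fdata2[64 * 71];  /* 64 cols x (64 rows + 7 rows of context) */
  assert(w <= 64 && h <= 64);
  /* Pass 1: start 3 rows above the block so the vertical taps have data. */
  horiz_8tap(src - 3 * src_stride, src_stride, fdata2, 64, filter_x, w, h + 7);
  /* Pass 2: read from the row aligned with the block (skip the 3 rows). */
  vert_8tap(fdata2 + 3 * 64, 64, dst, dst_stride, filter_y, w, h);
}
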
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
index 3bc7d3918..cee8d1e76 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
@@ -8,7 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+// Due to a header conflict between math.h and the intrinsics includes around
+// ceil(), this include needs to precede immintrin.h in certain configurations
+// under VS9.
+#include "./vp9_rtcd.h"
+
#include <immintrin.h>
+
+#include "vp9/common/x86/convolve.h"
#include "vpx_ports/mem.h"
// filters for 16_h8 and 16_v8
@@ -53,23 +60,23 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // __clang__
-void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
+static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i filtersReg;
__m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
__m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
__m256i srcReg32b1, srcReg32b2, filtersReg32;
unsigned int i;
- unsigned int src_stride, dst_stride;
+ ptrdiff_t src_stride, dst_stride;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -104,9 +111,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
for (i = output_height; i > 1; i-=2) {
// load the 2 strides of source
srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr-3)));
+ _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm_loadu_si128((__m128i *)
+ _mm_loadu_si128((const __m128i *)
(src_ptr+src_pixels_per_line-3)), 1);
// filter the source buffer
@@ -135,9 +142,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
// reading 2 strides of the next 16 bytes
// (part of it was being read by earlier read)
srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+5)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm_loadu_si128((__m128i *)
+ _mm_loadu_si128((const __m128i *)
(src_ptr+src_pixels_per_line+5)), 1);
// add and saturate the results together
@@ -202,7 +209,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
__m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
__m128i srcRegFilt2, srcRegFilt3;
- srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
@@ -237,7 +244,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
// reading the next 16 bytes
// (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
// add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
@@ -297,12 +304,12 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- int16_t *filter) {
+static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i filtersReg;
__m256i addFilterReg64;
__m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
@@ -310,11 +317,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
__m256i srcReg32b11, srcReg32b12, filtersReg32;
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
unsigned int i;
- unsigned int src_stride, dst_stride;
+ ptrdiff_t src_stride, dst_stride;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the
// same data in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -344,19 +351,19 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
// load 16 bytes 7 times in stride of src_pitch
srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr)));
+ _mm_loadu_si128((const __m128i *)(src_ptr)));
srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
srcReg32b3 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
srcReg32b4 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
srcReg32b5 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
srcReg32b6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
srcReg32b7 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
// have each consecutive loads on the same 256 register
srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
@@ -393,11 +400,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
// load the last 2 loads of 16 bytes and have every two
// consecutive loads in the same 256 bit register
srcReg32b8 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
_mm256_castsi256_si128(srcReg32b8), 1);
srcReg32b9 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
_mm256_castsi256_si128(srcReg32b9), 1);
@@ -476,7 +483,7 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
__m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
__m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
// load the last 16 bytes
- srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+ srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
// merge the last 2 results together
srcRegFilt4 = _mm_unpacklo_epi8(
@@ -542,3 +549,54 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
}
}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+#if ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+#else // ARCH_X86
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#endif // ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
+#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
+#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
+#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
+#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
+#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
+#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
+// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, avx2);
+#endif // HAVE_AVX2 && HAVE_SSSE3
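
The FUN_CONV_1D(..., avx2) instantiations just above wire the block1d kernels into vp9_convolve8_horiz_avx2/vp9_convolve8_vert_avx2. The macro (defined in convolve.h, and visible in its removed form earlier in this patch) walks the block in 16-, 8- and 4-pixel-wide column strips, calling the matching kernel for each strip, and hands any leftover width to the C fallback. A rough sketch of that dispatch, assuming hypothetical filter_w16/filter_w8/filter_w4 strip kernels:

/* Rough sketch of the FUN_CONV_1D dispatch (assumed shape, not the actual
 * macro body): process the block in 16/8/4-wide column strips and fall back
 * to a scalar routine for whatever width remains. */
#include <stddef.h>
#include <stdint.h>

typedef void (*strip_fn)(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         int h, const int16_t *filter);

static void convolve8_1d_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter, int w, int h,
                                strip_fn filter_w16, strip_fn filter_w8,
                                strip_fn filter_w4, strip_fn c_fallback) {
  while (w >= 16) {
    filter_w16(src, src_stride, dst, dst_stride, h, filter);
    src += 16; dst += 16; w -= 16;
  }
  while (w >= 8) {
    filter_w8(src, src_stride, dst, dst_stride, h, filter);
    src += 8; dst += 8; w -= 8;
  }
  while (w >= 4) {
    filter_w4(src, src_stride, dst, dst_stride, h, filter);
    src += 4; dst += 4; w -= 4;
  }
  if (w)  /* e.g. w == 2: let the plain C version handle it */
    c_fallback(src, src_stride, dst, dst_stride, h, filter);
}
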
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
index 4ab49e772..5fd2857e1 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -8,9 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+// Due to a header conflict between math.h and the intrinsics includes around
+// ceil(), this include needs to precede tmmintrin.h in certain configurations
+// under VS9.
+#include "./vp9_rtcd.h"
+
#include <tmmintrin.h>
-#include "./vp9_rtcd.h"
+#include "vp9/common/x86/convolve.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
@@ -40,12 +45,17 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
+// These are reused by the avx2 intrinsics.
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+
+void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i firstFilters, secondFilters, shuffle1, shuffle2;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
__m128i addFilterReg64, filtersReg, srcReg, minReg;
@@ -53,7 +63,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -74,7 +84,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
@@ -111,12 +121,12 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
+void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
__m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
@@ -125,7 +135,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -149,7 +159,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
@@ -191,12 +201,12 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
+static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
__m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
@@ -205,7 +215,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -229,7 +239,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
for (i = 0; i < output_height; i++) {
- srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
@@ -256,7 +266,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
// reading the next 16 bytes.
// (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
// add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
@@ -308,12 +318,12 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- int16_t *filter) {
+void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i addFilterReg64, filtersReg, minReg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
@@ -323,7 +333,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -338,17 +348,17 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
// load the first 7 rows of 8 bytes
- srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
- srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
- srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
- srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
- srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
- srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
- srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
+ srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
for (i = 0; i < output_height; i++) {
// load the last 8 bytes
- srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
+ srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
// merge the result together
srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
@@ -396,12 +406,12 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- int16_t *filter) {
+static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
__m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
@@ -411,7 +421,7 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -426,17 +436,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
// load the first 7 rows of 16 bytes
- srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
- srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
- srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
- srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
- srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
- srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
- srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
for (i = 0; i < output_height; i++) {
// load the last 16 bytes
- srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
+ srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
// merge the result together
srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
@@ -510,3 +520,82 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
output_ptr+=out_pitch;
}
}
+
+#if ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+#else // ARCH_X86
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#endif // ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
+
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
+
+// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+ ssse3);
+
+// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_ , ssse3);
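
For reference, the block1d*_h8 kernels instantiated above all compute the same per-pixel value that the SIMD code assembles with shuffles and multiply-adds: an 8-tap dot product over src[-3..+4], rounded by 64 (the constant loaded into addFilterReg64) and shifted right by 7, then saturated to 8 bits. A scalar model of one output pixel, as a sketch (filter8_pixel_sketch is an illustrative name, not a library function):

/* Scalar model of one horizontal 8-tap output pixel. */
#include <stdint.h>

static uint8_t filter8_pixel_sketch(const uint8_t *src,
                                    const int16_t filter[8]) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k)
    sum += filter[k] * src[k - 3];
  sum = (sum + 64) >> 7;   /* round and scale: the taps sum to 128 */
  if (sum < 0) sum = 0;    /* saturate to the 8-bit pixel range */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}
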
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index fcf480b86..0e9b1c523 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -699,7 +699,8 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
#if CONFIG_SIZE_LIMIT
if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Width and height beyond allowed size.");
+ "Dimensions of %dx%d beyond allowed size of %dx%d.",
+ width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
#endif
if (cm->width != width || cm->height != height) {
const int new_mi_rows =
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 288d8690c..7991a39e6 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -50,7 +50,6 @@ static void initialize_dec(void) {
static void vp9_dec_setup_mi(VP9_COMMON *cm) {
cm->mi = cm->mip + cm->mi_stride + 1;
- memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
memset(cm->mi_grid_base, 0,
cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
@@ -212,6 +211,9 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
// Find an empty frame buffer.
const int free_fb = get_free_fb(cm);
+ if (cm->new_fb_idx == INVALID_IDX)
+ return VPX_CODEC_MEM_ERROR;
+
// Decrease ref_count since it will be increased again in
// ref_cnt_fb() below.
--frame_bufs[free_fb].ref_count;
@@ -299,7 +301,10 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
&& frame_bufs[cm->new_fb_idx].ref_count == 0)
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+  // Find a free frame buffer. Return an error if none can be found.
cm->new_fb_idx = get_free_fb(cm);
+ if (cm->new_fb_idx == INVALID_IDX)
+ return VPX_CODEC_MEM_ERROR;
// Assign a MV array to the frame buffer.
cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index bb8c66fc0..63269844c 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -17,6 +17,7 @@
#if CONFIG_COEFFICIENT_RANGE_CHECKING
#include "vp9/common/vp9_idct.h"
#endif
+#include "vp9/common/vp9_scan.h"
#include "vp9/decoder/vp9_detokenize.h"
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
index cf82dd75d..166156af7 100644
--- a/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -10,6 +10,7 @@
#include <arm_neon.h>
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
@@ -20,82 +21,6 @@
#include "vp9/encoder/vp9_variance.h"
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
- const int32x4_t a = vpaddlq_s16(v_16x8);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
- const int64x2_t b = vpaddlq_s32(v_32x4);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h, uint32_t *sse, int *sum) {
- int i, j;
- int16x8_t v_sum = vdupq_n_s16(0);
- int32x4_t v_sse_lo = vdupq_n_s32(0);
- int32x4_t v_sse_hi = vdupq_n_s32(0);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- const uint8x8_t v_a = vld1_u8(&a[j]);
- const uint8x8_t v_b = vld1_u8(&b[j]);
- const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
- const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
- v_sum = vaddq_s16(v_sum, sv_diff);
- v_sse_lo = vmlal_s16(v_sse_lo,
- vget_low_s16(sv_diff),
- vget_low_s16(sv_diff));
- v_sse_hi = vmlal_s16(v_sse_hi,
- vget_high_s16(sv_diff),
- vget_high_s16(sv_diff));
- }
- a += a_stride;
- b += b_stride;
- }
-
- *sum = horizontal_add_s16x8(v_sum);
- *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse, int *sum) {
- variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
- 8, sse, sum);
-}
-
-unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
- return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8
-}
-
-void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse, int *sum) {
- variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
- 16, sse, sum);
-}
-
-unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
- return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16
-}
-
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
uint8_t *output_ptr,
unsigned int src_pixels_per_line,
@@ -162,7 +87,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
8, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+ return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
@@ -180,77 +105,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
16, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse, int *sum) {
- variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
- 32, sse, sum);
-}
-
-unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
- return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32
-}
-
-unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
- variance_neon_w8(a + (32 * a_stride), a_stride,
- b + (32 * b_stride), b_stride, 32, 32,
- &sse2, &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
-}
-
-unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride,
- b + (16 * b_stride), b_stride, 64, 16,
- &sse2, &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
-}
-
-unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
-
- variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride,
- b + (16 * b_stride), b_stride, 64, 16,
- &sse2, &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
- b + (16 * 2 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
- b + (16 * 3 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
+ return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
@@ -268,7 +123,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
32, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+ return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
@@ -286,5 +141,5 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
64, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+ return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}
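
The vp9_variance*_neon wrappers deleted from this file (their callers now use the vpx_variance*_neon versions in vpx_dsp) all reduce to the same identity: variance = SSE - sum^2 / (w * h), with the division done as a shift because w * h is a power of two, which is what the ">> 6", ">> 8" and ">> 12" comments encode. A scalar restatement of that computation, as a sketch:

/* Scalar sketch of what the deleted NEON wrappers return:
 * variance = SSE - sum^2 / (w * h).  For the fixed block sizes the divide
 * is a shift, e.g. 8x8 -> >> 6, 16x16 -> >> 8, 64x64 -> >> 12. */
#include <stdint.h>

static uint32_t variance_sketch(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                int w, int h, uint32_t *sse) {
  int64_t sum = 0, sq = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = a[j] - b[j];
      sum += d;
      sq += d * d;
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (uint32_t)sq;
  return (uint32_t)(sq - ((sum * sum) / (w * h)));
}
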
diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c
index 55c964903..9e5d9ee6a 100644
--- a/vp9/encoder/vp9_aq_variance.c
+++ b/vp9/encoder/vp9_aq_variance.c
@@ -98,9 +98,9 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
int avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride,
- CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
- &sse, &avg);
+ highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
+ &sse, &avg);
sse >>= 2 * (xd->bd - 8);
avg >>= (xd->bd - 8);
} else {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3d310f955..a8adca9ec 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
@@ -463,46 +464,55 @@ static int set_vt_partitioning(VP9_COMP *cpi,
return 0;
}
-void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
+// Set the variance split thresholds for the following block sizes:
+// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16,
+// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to a 4x4 partition) is
+// currently only used on key frames.
+static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ const int threshold_multiplier = is_key_frame ? 20 : 1;
+ const int64_t threshold_base = (int64_t)(threshold_multiplier *
+ cpi->y_dequant[q][1]);
+ if (is_key_frame) {
+ thresholds[0] = threshold_base;
+ thresholds[1] = threshold_base >> 2;
+ thresholds[2] = threshold_base >> 2;
+ thresholds[3] = threshold_base << 2;
+ } else {
+ thresholds[1] = threshold_base;
+ if (cm->width <= 352 && cm->height <= 288) {
+ thresholds[0] = threshold_base >> 2;
+ thresholds[2] = threshold_base << 3;
+ } else {
+ thresholds[0] = threshold_base;
+ thresholds[1] = (5 * threshold_base) >> 2;
+ if (cm->width >= 1920 && cm->height >= 1080)
+ thresholds[1] = (7 * threshold_base) >> 2;
+ thresholds[2] = threshold_base << cpi->oxcf.speed;
+ }
+ }
+}
+
+void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) {
+ VP9_COMMON *const cm = &cpi->common;
SPEED_FEATURES *const sf = &cpi->sf;
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
if (sf->partition_search_type != VAR_BASED_PARTITION &&
sf->partition_search_type != REFERENCE_PARTITION) {
return;
} else {
- VP9_COMMON *const cm = &cpi->common;
- const int is_key_frame = (cm->frame_type == KEY_FRAME);
- const int threshold_multiplier = is_key_frame ? 20 : 1;
- const int64_t threshold_base = (int64_t)(threshold_multiplier *
- cpi->y_dequant[q][1]);
-
- // TODO(marpan): Allow 4x4 partitions for inter-frames.
- // use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
- // If 4x4 partition is not used, then 8x8 partition will be selected
- // if variance of 16x16 block is very high, so use larger threshold
- // for 16x16 (threshold_bsize_min) in that case.
-
- // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
- // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);
+ // The thresholds below are not changed locally.
if (is_key_frame) {
- cpi->vbp_thresholds[0] = threshold_base;
- cpi->vbp_thresholds[1] = threshold_base >> 2;
- cpi->vbp_thresholds[2] = threshold_base >> 2;
- cpi->vbp_thresholds[3] = threshold_base << 2;
cpi->vbp_threshold_sad = 0;
cpi->vbp_bsize_min = BLOCK_8X8;
} else {
- cpi->vbp_thresholds[1] = threshold_base;
- if (cm->width <= 352 && cm->height <= 288) {
- cpi->vbp_thresholds[0] = threshold_base >> 2;
- cpi->vbp_thresholds[2] = threshold_base << 3;
+ if (cm->width <= 352 && cm->height <= 288)
cpi->vbp_threshold_sad = 100;
- } else {
- cpi->vbp_thresholds[0] = threshold_base;
- cpi->vbp_thresholds[1] = (5 * threshold_base) >> 2;
- cpi->vbp_thresholds[2] = threshold_base << cpi->oxcf.speed;
+ else
cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ?
(cpi->y_dequant[q][1] << 1) : 1000;
- }
cpi->vbp_bsize_min = BLOCK_16X16;
}
cpi->vbp_threshold_minmax = 15 + (q >> 3);
@@ -551,23 +561,6 @@ static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
return (minmax_max - minmax_min);
}
-static void modify_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
- VP9_COMMON *const cm = &cpi->common;
- const int64_t threshold_base = (int64_t)(cpi->y_dequant[q][1]);
-
- // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
- // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
- thresholds[1] = threshold_base;
- if (cm->width <= 352 && cm->height <= 288) {
- thresholds[0] = threshold_base >> 2;
- thresholds[2] = threshold_base << 3;
- } else {
- thresholds[0] = threshold_base;
- thresholds[1] = (5 * threshold_base) >> 2;
- thresholds[2] = threshold_base << cpi->oxcf.speed;
- }
-}
-
static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
int dp, int x8_idx, int y8_idx, v8x8 *vst,
#if CONFIG_VP9_HIGHBITDEPTH
@@ -680,7 +673,7 @@ static int choose_partitioning(VP9_COMP *cpi,
if (cyclic_refresh_segment_id_boosted(segment_id)) {
int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
- modify_vbp_thresholds(cpi, thresholds, q);
+ set_vbp_thresholds(cpi, thresholds, q);
}
}
@@ -724,7 +717,7 @@ static int choose_partitioning(VP9_COMP *cpi,
mbmi->mv[0].as_int = 0;
mbmi->interp_filter = BILINEAR;
- y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+ y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
if (y_sad_g < y_sad) {
vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
&cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -3672,15 +3665,15 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
if (cm->use_highbitdepth) {
switch (cm->bit_depth) {
case VPX_BITS_8:
- vp9_highbd_get16x16var(src, src_stride, last_src, last_stride,
+ vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
break;
case VPX_BITS_10:
- vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
+ vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
break;
case VPX_BITS_12:
- vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
+ vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
break;
default:
@@ -3689,11 +3682,11 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
return -1;
}
} else {
- vp9_get16x16var(src, src_stride, last_src, last_stride,
+ vpx_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
}
#else
- vp9_get16x16var(src, src_stride, last_src, last_stride,
+ vpx_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
#endif // CONFIG_VP9_HIGHBITDEPTH
var16->var = var16->sse -
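
The set_vbp_thresholds() helper added above derives all four variance-partition thresholds from one base value, cpi->y_dequant[q][1], scaled by 20 on key frames and then adjusted per block size, frame size and speed. A standalone sketch of the same derivation (using a plain dequant argument in place of the cpi fields; vbp_thresholds_sketch is an illustrative name):

/* Standalone sketch of the threshold table: index 0 gates 64x64, 1 gates
 * 32x32, 2 gates 16x16, and 3 gates the 8x8 -> 4x4 split (key frames only,
 * so it is left untouched in the inter branch, as in the patch). */
#include <stdint.h>

static void vbp_thresholds_sketch(int64_t thresholds[4], int64_t dequant,
                                  int is_key_frame, int width, int height,
                                  int speed) {
  const int64_t base = (is_key_frame ? 20 : 1) * dequant;
  if (is_key_frame) {
    thresholds[0] = base;
    thresholds[1] = base >> 2;
    thresholds[2] = base >> 2;
    thresholds[3] = base << 2;
  } else {
    thresholds[1] = base;
    if (width <= 352 && height <= 288) {
      thresholds[0] = base >> 2;
      thresholds[2] = base << 3;
    } else {
      thresholds[0] = base;
      thresholds[1] = (5 * base) >> 2;
      if (width >= 1920 && height >= 1080)
        thresholds[1] = (7 * base) >> 2;
      thresholds[2] = base << speed;
    }
  }
}
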
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 1acde0283..6aaa56463 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -40,7 +40,7 @@ void vp9_init_tile_data(struct VP9_COMP *cpi);
void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
int tile_row, int tile_col);
-void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q);
+void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q);
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index b115e0ef9..2829365e5 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -18,6 +18,7 @@
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_encodemb.h"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index df7094949..2fdf4082a 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1000,7 +1000,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16,
vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8,
- vp9_highbd_variance32x16,
+ vpx_highbd_8_variance32x16,
vp9_highbd_sub_pixel_variance32x16,
vp9_highbd_sub_pixel_avg_variance32x16,
NULL,
@@ -1010,7 +1010,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X32,
vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8,
- vp9_highbd_variance16x32,
+ vpx_highbd_8_variance16x32,
vp9_highbd_sub_pixel_variance16x32,
vp9_highbd_sub_pixel_avg_variance16x32,
NULL,
@@ -1020,7 +1020,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X32,
vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8,
- vp9_highbd_variance64x32,
+ vpx_highbd_8_variance64x32,
vp9_highbd_sub_pixel_variance64x32,
vp9_highbd_sub_pixel_avg_variance64x32,
NULL,
@@ -1030,7 +1030,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X64,
vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8,
- vp9_highbd_variance32x64,
+ vpx_highbd_8_variance32x64,
vp9_highbd_sub_pixel_variance32x64,
vp9_highbd_sub_pixel_avg_variance32x64,
NULL,
@@ -1040,7 +1040,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X32,
vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8,
- vp9_highbd_variance32x32,
+ vpx_highbd_8_variance32x32,
vp9_highbd_sub_pixel_variance32x32,
vp9_highbd_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits8,
@@ -1050,7 +1050,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X64,
vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8,
- vp9_highbd_variance64x64,
+ vpx_highbd_8_variance64x64,
vp9_highbd_sub_pixel_variance64x64,
vp9_highbd_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits8,
@@ -1060,7 +1060,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X16,
vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8,
- vp9_highbd_variance16x16,
+ vpx_highbd_8_variance16x16,
vp9_highbd_sub_pixel_variance16x16,
vp9_highbd_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits8,
@@ -1070,7 +1070,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X8,
vpx_highbd_sad16x8_bits8,
vpx_highbd_sad16x8_avg_bits8,
- vp9_highbd_variance16x8,
+ vpx_highbd_8_variance16x8,
vp9_highbd_sub_pixel_variance16x8,
vp9_highbd_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits8,
@@ -1080,7 +1080,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X16,
vpx_highbd_sad8x16_bits8,
vpx_highbd_sad8x16_avg_bits8,
- vp9_highbd_variance8x16,
+ vpx_highbd_8_variance8x16,
vp9_highbd_sub_pixel_variance8x16,
vp9_highbd_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits8,
@@ -1090,7 +1090,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X8,
vpx_highbd_sad8x8_bits8,
vpx_highbd_sad8x8_avg_bits8,
- vp9_highbd_variance8x8,
+ vpx_highbd_8_variance8x8,
vp9_highbd_sub_pixel_variance8x8,
vp9_highbd_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits8,
@@ -1100,7 +1100,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X4,
vpx_highbd_sad8x4_bits8,
vpx_highbd_sad8x4_avg_bits8,
- vp9_highbd_variance8x4,
+ vpx_highbd_8_variance8x4,
vp9_highbd_sub_pixel_variance8x4,
vp9_highbd_sub_pixel_avg_variance8x4,
NULL,
@@ -1110,7 +1110,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X8,
vpx_highbd_sad4x8_bits8,
vpx_highbd_sad4x8_avg_bits8,
- vp9_highbd_variance4x8,
+ vpx_highbd_8_variance4x8,
vp9_highbd_sub_pixel_variance4x8,
vp9_highbd_sub_pixel_avg_variance4x8,
NULL,
@@ -1120,7 +1120,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X4,
vpx_highbd_sad4x4_bits8,
vpx_highbd_sad4x4_avg_bits8,
- vp9_highbd_variance4x4,
+ vpx_highbd_8_variance4x4,
vp9_highbd_sub_pixel_variance4x4,
vp9_highbd_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits8,
@@ -1132,7 +1132,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16,
vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10,
- vp9_highbd_10_variance32x16,
+ vpx_highbd_10_variance32x16,
vp9_highbd_10_sub_pixel_variance32x16,
vp9_highbd_10_sub_pixel_avg_variance32x16,
NULL,
@@ -1142,7 +1142,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X32,
vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10,
- vp9_highbd_10_variance16x32,
+ vpx_highbd_10_variance16x32,
vp9_highbd_10_sub_pixel_variance16x32,
vp9_highbd_10_sub_pixel_avg_variance16x32,
NULL,
@@ -1152,7 +1152,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X32,
vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10,
- vp9_highbd_10_variance64x32,
+ vpx_highbd_10_variance64x32,
vp9_highbd_10_sub_pixel_variance64x32,
vp9_highbd_10_sub_pixel_avg_variance64x32,
NULL,
@@ -1162,7 +1162,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X64,
vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10,
- vp9_highbd_10_variance32x64,
+ vpx_highbd_10_variance32x64,
vp9_highbd_10_sub_pixel_variance32x64,
vp9_highbd_10_sub_pixel_avg_variance32x64,
NULL,
@@ -1172,7 +1172,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X32,
vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10,
- vp9_highbd_10_variance32x32,
+ vpx_highbd_10_variance32x32,
vp9_highbd_10_sub_pixel_variance32x32,
vp9_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits10,
@@ -1182,7 +1182,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X64,
vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10,
- vp9_highbd_10_variance64x64,
+ vpx_highbd_10_variance64x64,
vp9_highbd_10_sub_pixel_variance64x64,
vp9_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits10,
@@ -1192,7 +1192,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X16,
vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10,
- vp9_highbd_10_variance16x16,
+ vpx_highbd_10_variance16x16,
vp9_highbd_10_sub_pixel_variance16x16,
vp9_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits10,
@@ -1202,7 +1202,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X8,
vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10,
- vp9_highbd_10_variance16x8,
+ vpx_highbd_10_variance16x8,
vp9_highbd_10_sub_pixel_variance16x8,
vp9_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits10,
@@ -1212,7 +1212,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X16,
vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10,
- vp9_highbd_10_variance8x16,
+ vpx_highbd_10_variance8x16,
vp9_highbd_10_sub_pixel_variance8x16,
vp9_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits10,
@@ -1222,7 +1222,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X8,
vpx_highbd_sad8x8_bits10,
vpx_highbd_sad8x8_avg_bits10,
- vp9_highbd_10_variance8x8,
+ vpx_highbd_10_variance8x8,
vp9_highbd_10_sub_pixel_variance8x8,
vp9_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits10,
@@ -1232,7 +1232,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X4,
vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10,
- vp9_highbd_10_variance8x4,
+ vpx_highbd_10_variance8x4,
vp9_highbd_10_sub_pixel_variance8x4,
vp9_highbd_10_sub_pixel_avg_variance8x4,
NULL,
@@ -1242,7 +1242,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X8,
vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10,
- vp9_highbd_10_variance4x8,
+ vpx_highbd_10_variance4x8,
vp9_highbd_10_sub_pixel_variance4x8,
vp9_highbd_10_sub_pixel_avg_variance4x8,
NULL,
@@ -1252,7 +1252,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X4,
vpx_highbd_sad4x4_bits10,
vpx_highbd_sad4x4_avg_bits10,
- vp9_highbd_10_variance4x4,
+ vpx_highbd_10_variance4x4,
vp9_highbd_10_sub_pixel_variance4x4,
vp9_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits10,
@@ -1264,7 +1264,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16,
vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12,
- vp9_highbd_12_variance32x16,
+ vpx_highbd_12_variance32x16,
vp9_highbd_12_sub_pixel_variance32x16,
vp9_highbd_12_sub_pixel_avg_variance32x16,
NULL,
@@ -1274,7 +1274,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X32,
vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12,
- vp9_highbd_12_variance16x32,
+ vpx_highbd_12_variance16x32,
vp9_highbd_12_sub_pixel_variance16x32,
vp9_highbd_12_sub_pixel_avg_variance16x32,
NULL,
@@ -1284,7 +1284,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X32,
vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12,
- vp9_highbd_12_variance64x32,
+ vpx_highbd_12_variance64x32,
vp9_highbd_12_sub_pixel_variance64x32,
vp9_highbd_12_sub_pixel_avg_variance64x32,
NULL,
@@ -1294,7 +1294,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X64,
vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12,
- vp9_highbd_12_variance32x64,
+ vpx_highbd_12_variance32x64,
vp9_highbd_12_sub_pixel_variance32x64,
vp9_highbd_12_sub_pixel_avg_variance32x64,
NULL,
@@ -1304,7 +1304,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X32,
vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12,
- vp9_highbd_12_variance32x32,
+ vpx_highbd_12_variance32x32,
vp9_highbd_12_sub_pixel_variance32x32,
vp9_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits12,
@@ -1314,7 +1314,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X64,
vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12,
- vp9_highbd_12_variance64x64,
+ vpx_highbd_12_variance64x64,
vp9_highbd_12_sub_pixel_variance64x64,
vp9_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits12,
@@ -1324,7 +1324,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X16,
vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12,
- vp9_highbd_12_variance16x16,
+ vpx_highbd_12_variance16x16,
vp9_highbd_12_sub_pixel_variance16x16,
vp9_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits12,
@@ -1334,7 +1334,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X8,
vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12,
- vp9_highbd_12_variance16x8,
+ vpx_highbd_12_variance16x8,
vp9_highbd_12_sub_pixel_variance16x8,
vp9_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits12,
@@ -1344,7 +1344,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X16,
vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12,
- vp9_highbd_12_variance8x16,
+ vpx_highbd_12_variance8x16,
vp9_highbd_12_sub_pixel_variance8x16,
vp9_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits12,
@@ -1354,7 +1354,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X8,
vpx_highbd_sad8x8_bits12,
vpx_highbd_sad8x8_avg_bits12,
- vp9_highbd_12_variance8x8,
+ vpx_highbd_12_variance8x8,
vp9_highbd_12_sub_pixel_variance8x8,
vp9_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits12,
@@ -1364,7 +1364,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X4,
vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12,
- vp9_highbd_12_variance8x4,
+ vpx_highbd_12_variance8x4,
vp9_highbd_12_sub_pixel_variance8x4,
vp9_highbd_12_sub_pixel_avg_variance8x4,
NULL,
@@ -1374,7 +1374,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X8,
vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12,
- vp9_highbd_12_variance4x8,
+ vpx_highbd_12_variance4x8,
vp9_highbd_12_sub_pixel_variance4x8,
vp9_highbd_12_sub_pixel_avg_variance4x8,
NULL,
@@ -1384,7 +1384,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X4,
vpx_highbd_sad4x4_bits12,
vpx_highbd_sad4x4_avg_bits12,
- vp9_highbd_12_variance4x4,
+ vpx_highbd_12_variance4x4,
vp9_highbd_12_sub_pixel_variance4x4,
vp9_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits12,
@@ -1807,61 +1807,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
- vp9_variance32x16, vp9_sub_pixel_variance32x16,
+ vpx_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
- vp9_variance16x32, vp9_sub_pixel_variance16x32,
+ vpx_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
- vp9_variance64x32, vp9_sub_pixel_variance64x32,
+ vpx_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
- vp9_variance32x64, vp9_sub_pixel_variance32x64,
+ vpx_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
- vp9_variance32x32, vp9_sub_pixel_variance32x32,
+ vpx_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_sad32x32x4d)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
- vp9_variance64x64, vp9_sub_pixel_variance64x64,
+ vpx_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_sad64x64x4d)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
- vp9_variance16x16, vp9_sub_pixel_variance16x16,
+ vpx_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_sad16x16x4d)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
- vp9_variance16x8, vp9_sub_pixel_variance16x8,
+ vpx_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8,
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
- vp9_variance8x16, vp9_sub_pixel_variance8x16,
+ vpx_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16,
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
- vp9_variance8x8, vp9_sub_pixel_variance8x8,
+ vpx_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8,
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
- vp9_variance8x4, vp9_sub_pixel_variance8x4,
+ vpx_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
- vp9_variance4x8, vp9_sub_pixel_variance4x8,
+ vpx_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
- vp9_variance4x4, vp9_sub_pixel_variance4x4,
+ vpx_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4,
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
@@ -2081,7 +2081,7 @@ static int64_t get_sse(const uint8_t *a, int a_stride,
const uint8_t *pa = a;
const uint8_t *pb = b;
for (x = 0; x < width / 16; ++x) {
- vp9_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
pa += 16;
@@ -2126,21 +2126,21 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
unsigned int sse = 0;
int sum = 0;
if (dw > 0) {
- highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
- dw, height, &sse, &sum);
+ highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+ dw, height, &sse, &sum);
total_sse += sse;
}
if (dh > 0) {
- highbd_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
+ highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh, &sse, &sum);
total_sse += sse;
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
for (x = 0; x < width / 16; ++x) {
- vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
pa += 16;
pb += 16;
@@ -2716,7 +2716,10 @@ void vp9_scale_references(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
const int new_fb = get_free_fb(cm);
- RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
+ RefCntBuffer *new_fb_ptr = NULL;
+ if (cm->new_fb_idx == INVALID_IDX)
+ return;
+ new_fb_ptr = &pool->frame_bufs[new_fb];
cm->cur_frame = &pool->frame_bufs[new_fb];
vp9_realloc_frame_buffer(&pool->frame_bufs[new_fb].buf,
cm->width, cm->height,
@@ -2728,7 +2731,10 @@ void vp9_scale_references(VP9_COMP *cpi) {
#else
if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
const int new_fb = get_free_fb(cm);
- RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
+ RefCntBuffer *new_fb_ptr = NULL;
+ if (cm->new_fb_idx == INVALID_IDX)
+ return;
+ new_fb_ptr = &pool->frame_bufs[new_fb];
vp9_realloc_frame_buffer(&new_fb_ptr->buf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -2797,7 +2803,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10u %dx%d %10d %10d %10d %10d"
+ fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
"%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d "
"%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
"%6d %6d %5d %5d %5d "
@@ -2805,6 +2811,8 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
"%10lf %8u %10"PRId64" %10d %10d\n",
cpi->common.current_video_frame,
cm->width, cm->height,
+ cpi->rc.source_alt_ref_pending,
+ cpi->rc.source_alt_ref_active,
cpi->rc.this_frame_target,
cpi->rc.projected_frame_size,
cpi->rc.projected_frame_size / cpi->common.MBs,
@@ -3031,7 +3039,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi) {
set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
vp9_set_quantizer(cm, q);
- vp9_set_vbp_thresholds(cpi, q);
+ vp9_set_variance_partition_thresholds(cpi, q);
setup_frame(cpi);
@@ -3476,34 +3484,41 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
}
if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
- // Use the last frame context for the empty frame.
+ // Use context 0 for intra only empty frame, but the last frame context
+ // for other empty frames.
+ if (cpi->svc.encode_empty_frame_state == ENCODING) {
+ if (cpi->svc.encode_intra_empty_frame != 0)
+ cm->frame_context_idx = 0;
+ else
+ cm->frame_context_idx = FRAME_CONTEXTS - 1;
+ } else {
cm->frame_context_idx =
- (cpi->svc.encode_empty_frame_state == ENCODING) ? FRAME_CONTEXTS - 1 :
cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
cpi->svc.temporal_layer_id;
+ }
+
+ cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
// The probs will be updated based on the frame type of its previous
// frame if frame_parallel_decoding_mode is 0. The type may vary for
// the frame after a key frame in base layer since we may drop enhancement
// layers. So set frame_parallel_decoding_mode to 1 in this case.
- if (cpi->svc.number_temporal_layers == 1) {
- if (cpi->svc.spatial_layer_id == 0 &&
- cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
- cm->frame_parallel_decoding_mode = 1;
- else
- cm->frame_parallel_decoding_mode = 0;
- } else if (cpi->svc.spatial_layer_id == 0) {
- // Find the 2nd frame in temporal base layer and 1st frame in temporal
- // enhancement layers from the key frame.
- int i;
- for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
- if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
+ if (cm->frame_parallel_decoding_mode == 0) {
+ if (cpi->svc.number_temporal_layers == 1) {
+ if (cpi->svc.spatial_layer_id == 0 &&
+ cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
cm->frame_parallel_decoding_mode = 1;
- break;
+ } else if (cpi->svc.spatial_layer_id == 0) {
+ // Find the 2nd frame in temporal base layer and 1st frame in temporal
+ // enhancement layers from the key frame.
+ int i;
+ for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
+ if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
+ cm->frame_parallel_decoding_mode = 1;
+ break;
+ }
}
}
- if (i == cpi->svc.number_temporal_layers)
- cm->frame_parallel_decoding_mode = 0;
}
}
@@ -3968,6 +3983,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
cm->show_frame = 0;
+ cm->intra_only = 0;
cpi->refresh_alt_ref_frame = 1;
cpi->refresh_golden_frame = 0;
cpi->refresh_last_frame = 0;
@@ -4310,8 +4326,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#endif
if (is_two_pass_svc(cpi)) {
- if (cpi->svc.encode_empty_frame_state == ENCODING)
+ if (cpi->svc.encode_empty_frame_state == ENCODING) {
cpi->svc.encode_empty_frame_state = ENCODED;
+ cpi->svc.encode_intra_empty_frame = 0;
+ }
if (cm->show_frame) {
++cpi->svc.spatial_layer_to_encode;
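
For reference, the vpx_-prefixed variance functions substituted into the fn_ptr tables above compute the same quantity as the vp9_ versions they replace: sum and sse are accumulated over the W x H block and the squared mean is subtracted. A minimal C sketch (illustrative name only, not the library's exact implementation):

#include <stdint.h>

/* Sketch: per-block variance as wired into cpi->fn_ptr by the BFP macros.
 * Accumulate sum and sse over the w x h block, then subtract the mean
 * contribution: sse - sum*sum/(w*h).  Illustrative only. */
static unsigned int variance_sketch(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    int w, int h, unsigned int *sse) {
  int i, j, sum = 0;
  *sse = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += diff;
      *sse += diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
  return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
}
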
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 88b10307d..942eac911 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -12,6 +12,7 @@
#include <math.h>
#include <stdio.h>
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_mem/vpx_mem.h"
@@ -267,13 +268,13 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
switch (bsize) {
case BLOCK_8X8:
- return vp9_mse8x8;
+ return vpx_mse8x8;
case BLOCK_16X8:
- return vp9_mse16x8;
+ return vpx_mse16x8;
case BLOCK_8X16:
- return vp9_mse8x16;
+ return vpx_mse8x16;
default:
- return vp9_mse16x16;
+ return vpx_mse16x16;
}
}
@@ -293,37 +294,37 @@ static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
default:
switch (bsize) {
case BLOCK_8X8:
- return vp9_highbd_mse8x8;
+ return vpx_highbd_8_mse8x8;
case BLOCK_16X8:
- return vp9_highbd_mse16x8;
+ return vpx_highbd_8_mse16x8;
case BLOCK_8X16:
- return vp9_highbd_mse8x16;
+ return vpx_highbd_8_mse8x16;
default:
- return vp9_highbd_mse16x16;
+ return vpx_highbd_8_mse16x16;
}
break;
case 10:
switch (bsize) {
case BLOCK_8X8:
- return vp9_highbd_10_mse8x8;
+ return vpx_highbd_10_mse8x8;
case BLOCK_16X8:
- return vp9_highbd_10_mse16x8;
+ return vpx_highbd_10_mse16x8;
case BLOCK_8X16:
- return vp9_highbd_10_mse8x16;
+ return vpx_highbd_10_mse8x16;
default:
- return vp9_highbd_10_mse16x16;
+ return vpx_highbd_10_mse16x16;
}
break;
case 12:
switch (bsize) {
case BLOCK_8X8:
- return vp9_highbd_12_mse8x8;
+ return vpx_highbd_12_mse8x8;
case BLOCK_16X8:
- return vp9_highbd_12_mse16x8;
+ return vpx_highbd_12_mse16x8;
case BLOCK_8X16:
- return vp9_highbd_12_mse8x16;
+ return vpx_highbd_12_mse8x16;
default:
- return vp9_highbd_12_mse16x16;
+ return vpx_highbd_12_mse16x16;
}
break;
}
@@ -634,7 +635,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
xd->mi[0]->mbmi.tx_size = use_dc_pred ?
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
vp9_encode_intra_block_plane(x, bsize, 0);
- this_error = vp9_get_mb_ss(x->plane[0].src_diff);
+ this_error = vpx_get_mb_ss(x->plane[0].src_diff);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
switch (cm->bit_depth) {
@@ -1696,7 +1697,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
// Allocate bits to the other frames in the group.
- for (i = 0; i < rc->baseline_gf_interval - 1; ++i) {
+ for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
int arf_idx = 0;
if (EOF == input_stats(twopass, &frame_stats))
break;
@@ -1934,8 +1935,26 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Was the group length constrained by the requirement for a new KF?
rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+ // Should we use the alternate reference frame.
+ if (allow_alt_ref &&
+ (i < cpi->oxcf.lag_in_frames) &&
+ (i >= rc->min_gf_interval)) {
+ // Calculate the boost for alt ref.
+ rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+ &b_boost);
+ rc->source_alt_ref_pending = 1;
+
+ // Test to see if multi arf is appropriate.
+ cpi->multi_arf_enabled =
+ (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
+ (zero_motion_accumulator < 0.995)) ? 1 : 0;
+ } else {
+ rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST);
+ rc->source_alt_ref_pending = 0;
+ }
+
// Set the interval until the next gf.
- if (is_key_frame || rc->source_alt_ref_active)
+ if (is_key_frame || rc->source_alt_ref_pending)
rc->baseline_gf_interval = i - 1;
else
rc->baseline_gf_interval = i;
@@ -1960,24 +1979,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
- // Should we use the alternate reference frame.
- if (allow_alt_ref &&
- (i < cpi->oxcf.lag_in_frames) &&
- (i >= rc->min_gf_interval)) {
- // Calculate the boost for alt ref.
- rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
- &b_boost);
- rc->source_alt_ref_pending = 1;
-
- // Test to see if multi arf is appropriate.
- cpi->multi_arf_enabled =
- (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
- (zero_motion_accumulator < 0.995)) ? 1 : 0;
- } else {
- rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST);
- rc->source_alt_ref_pending = 0;
- }
-
// Reset the file position.
reset_fpf_position(twopass, start_pos);
@@ -2581,9 +2582,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
cpi->ref_frame_flags &=
(~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
lc->frames_from_key_frame = 0;
- // Reset the empty frame resolution since we have a key frame.
- cpi->svc.empty_frame_width = cm->width;
- cpi->svc.empty_frame_height = cm->height;
+ // Encode an intra only empty frame since we have a key frame.
+ cpi->svc.encode_intra_empty_frame = 1;
}
} else {
cm->frame_type = INTER_FRAME;
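
The vpx_get_mb_ss call used for this_error above is simply the sum of squared residual coefficients over the 16x16 block. A minimal sketch (illustrative name, assuming a 256-sample int16_t residual as in x->plane[0].src_diff):

#include <stdint.h>

/* Sketch: sum of squares of the 16x16 residual (256 int16_t samples). */
static unsigned int mb_ss_sketch(const int16_t *src_diff) {
  unsigned int i, sum = 0;
  for (i = 0; i < 256; ++i)
    sum += src_diff[i] * src_diff[i];
  return sum;
}
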
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 8bdd4286a..15f95829f 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -13,11 +13,13 @@
#include <stdio.h>
#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_reconinter.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_mcomp.h"
@@ -303,13 +305,13 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
if (second_pred != NULL) {
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
- vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+ vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
y_stride);
besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
sse1);
} else {
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
}
} else {
@@ -321,7 +323,7 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
(void) xd;
if (second_pred != NULL) {
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
} else {
besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
@@ -1789,8 +1791,11 @@ static const MV search_pos[4] = {
};
unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize) {
+ BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
DECLARE_ALIGNED(16, int16_t, hbuf[128]);
DECLARE_ALIGNED(16, int16_t, vbuf[128]);
DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
@@ -1807,12 +1812,34 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
unsigned int best_sad, tmp_sad, this_sad[4];
MV this_mv;
const int norm_factor = 3 + (bw >> 5);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[i] = xd->plane[i].pre[0];
+ vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+ }
#if CONFIG_VP9_HIGHBITDEPTH
- tmp_mv->row = 0;
- tmp_mv->col = 0;
- return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
- xd->plane[0].pre[0].buf, ref_stride);
+ {
+ unsigned int this_sad;
+ tmp_mv->row = 0;
+ tmp_mv->col = 0;
+ this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return this_sad;
+ }
#endif
// Set up prediction 1-D reference set
@@ -1890,6 +1917,12 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
tmp_mv->row *= 8;
tmp_mv->col *= 8;
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[0] = backup_yv12[i];
+ }
+
return best_sad;
}
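
vpx_comp_avg_pred, now used in setup_center_error, forms the compound prediction by rounding-averaging the two predictors pixel by pixel, matching the removed vp9_comp_avg_pred. A minimal sketch (illustrative name only):

#include <stdint.h>

/* Sketch: each compound prediction pixel is (pred + ref + 1) >> 1,
 * i.e. ROUND_POWER_OF_TWO(pred + ref, 1). */
static void comp_avg_pred_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                 int width, int height,
                                 const uint8_t *ref, int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j)
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
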
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index dd8a46079..99c1afa28 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -83,7 +83,8 @@ int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
// Perform integral projection based motion estimation.
unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
MACROBLOCK *x,
- BLOCK_SIZE bsize);
+ BLOCK_SIZE bsize,
+ int mi_row, int mi_col);
typedef int (integer_mv_pattern_search_fn) (
const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 1e917159f..b0e255de8 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -14,6 +14,7 @@
#include <stdio.h>
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@@ -24,6 +25,7 @@
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encoder.h"
@@ -215,7 +217,7 @@ static void block_variance(const uint8_t *src, int src_stride,
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
- vp9_get8x8var(src + src_stride * i + j, src_stride,
+ vpx_get8x8var(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride,
&sse8x8[k], &sum8x8[k]);
*sse += sse8x8[k];
@@ -1248,7 +1250,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (bsize < BLOCK_16X16)
continue;
- tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+ tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
continue;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index fabe36296..7211e9992 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1207,11 +1207,9 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
// this frame refreshes means next frames don't unless specified by user
rc->frames_since_golden = 0;
- if (cpi->oxcf.pass == 2) {
- if (!rc->source_alt_ref_pending &&
- cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD)
- rc->source_alt_ref_active = 0;
- } else if (!rc->source_alt_ref_pending) {
+ // If we are not using alt ref in the upcoming group, clear the arf
+ // active flag.
+ if (!rc->source_alt_ref_pending) {
rc->source_alt_ref_active = 0;
}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 6eb8f6cb5..9fa258c61 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -25,6 +25,7 @@
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_systemdependent.h"
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index b3491a27a..2d2e95d9c 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -15,6 +15,8 @@
#include "vp9/encoder/vp9_extend.h"
#define SMALL_FRAME_FB_IDX 7
+#define SMALL_FRAME_WIDTH 16
+#define SMALL_FRAME_HEIGHT 16
void vp9_init_layer_context(VP9_COMP *const cpi) {
SVC *const svc = &cpi->svc;
@@ -33,7 +35,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
if (vp9_realloc_frame_buffer(&cpi->svc.empty_frame.img,
- cpi->common.width, cpi->common.height,
+ SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT,
cpi->common.subsampling_x,
cpi->common.subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
@@ -48,8 +50,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
cpi->svc.empty_frame.img.buffer_alloc_sz);
- cpi->svc.empty_frame_width = cpi->common.width;
- cpi->svc.empty_frame_height = cpi->common.height;
}
}
@@ -362,20 +362,11 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
cpi->lst_fb_idx =
cpi->gld_fb_idx = cpi->alt_fb_idx = SMALL_FRAME_FB_IDX;
- // Gradually make the empty frame smaller to save bits. Make it half of
- // its previous size because of the scaling factor restriction.
- cpi->svc.empty_frame_width >>= 1;
- cpi->svc.empty_frame_width = (cpi->svc.empty_frame_width + 1) & ~1;
- if (cpi->svc.empty_frame_width < 16)
- cpi->svc.empty_frame_width = 16;
+ if (cpi->svc.encode_intra_empty_frame != 0)
+ cpi->common.intra_only = 1;
- cpi->svc.empty_frame_height >>= 1;
- cpi->svc.empty_frame_height = (cpi->svc.empty_frame_height + 1) & ~1;
- if (cpi->svc.empty_frame_height < 16)
- cpi->svc.empty_frame_height = 16;
-
- width = cpi->svc.empty_frame_width;
- height = cpi->svc.empty_frame_height;
+ width = SMALL_FRAME_WIDTH;
+ height = SMALL_FRAME_HEIGHT;
}
}
}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index e9645ce9f..5063d521f 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -57,8 +57,7 @@ typedef struct {
NEED_TO_ENCODE
}encode_empty_frame_state;
struct lookahead_entry empty_frame;
- int empty_frame_width;
- int empty_frame_height;
+ int encode_intra_empty_frame;
// Store scaled source frames to be used for temporal filter to generate
// an alt ref frame.
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 862be4d38..35920313a 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -17,6 +17,7 @@
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_scan.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/encoder/vp9_cost.h"
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index f38f96d6c..1f6b083c4 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -9,6 +9,7 @@
*/
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
@@ -18,26 +19,6 @@
#include "vp9/encoder/vp9_variance.h"
-void variance(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
- int i, j;
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
-}
-
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// first-pass of 2-D separable filter.
@@ -100,25 +81,6 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
}
}
-unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
- unsigned int i, sum = 0;
-
- for (i = 0; i < 256; ++i) {
- sum += src_ptr[i] * src_ptr[i];
- }
-
- return sum;
-}
-
-#define VAR(W, H) \
-unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- unsigned int *sse) { \
- int sum; \
- variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (((int64_t)sum * sum) / (W * H)); \
-}
-
#define SUBPIX_VAR(W, H) \
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
@@ -133,7 +95,7 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
- return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
+ return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
}
#define SUBPIX_AVG_VAR(W, H) \
@@ -152,178 +114,51 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
- vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+ vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
- return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
-}
-
-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse, int *sum) {
- variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
+ return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
}
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse, int *sum) {
- variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
-}
-
-unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
- return *sse;
-}
-
-unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
- return *sse;
-}
-
-unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
- return *sse;
-}
-
-unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
- return *sse;
-}
-
-VAR(4, 4)
SUBPIX_VAR(4, 4)
SUBPIX_AVG_VAR(4, 4)
-VAR(4, 8)
SUBPIX_VAR(4, 8)
SUBPIX_AVG_VAR(4, 8)
-VAR(8, 4)
SUBPIX_VAR(8, 4)
SUBPIX_AVG_VAR(8, 4)
-VAR(8, 8)
SUBPIX_VAR(8, 8)
SUBPIX_AVG_VAR(8, 8)
-VAR(8, 16)
SUBPIX_VAR(8, 16)
SUBPIX_AVG_VAR(8, 16)
-VAR(16, 8)
SUBPIX_VAR(16, 8)
SUBPIX_AVG_VAR(16, 8)
-VAR(16, 16)
SUBPIX_VAR(16, 16)
SUBPIX_AVG_VAR(16, 16)
-VAR(16, 32)
SUBPIX_VAR(16, 32)
SUBPIX_AVG_VAR(16, 32)
-VAR(32, 16)
SUBPIX_VAR(32, 16)
SUBPIX_AVG_VAR(32, 16)
-VAR(32, 32)
SUBPIX_VAR(32, 32)
SUBPIX_AVG_VAR(32, 32)
-VAR(32, 64)
SUBPIX_VAR(32, 64)
SUBPIX_AVG_VAR(32, 64)
-VAR(64, 32)
SUBPIX_VAR(64, 32)
SUBPIX_AVG_VAR(64, 32)
-VAR(64, 64)
SUBPIX_VAR(64, 64)
SUBPIX_AVG_VAR(64, 64)
-void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride) {
- int i, j;
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- const int tmp = pred[j] + ref[j];
- comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
- }
- comp_pred += width;
- pred += width;
- ref += ref_stride;
- }
-}
-
#if CONFIG_VP9_HIGHBITDEPTH
-void highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, uint64_t *sse,
- uint64_t *sum) {
- int i, j;
-
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
- a += a_stride;
- b += b_stride;
- }
-}
-
-void highbd_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, unsigned int *sse,
- int *sum) {
- uint64_t sse_long = 0;
- uint64_t sum_long = 0;
- highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
-}
-
-void highbd_10_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, unsigned int *sse,
- int *sum) {
- uint64_t sse_long = 0;
- uint64_t sum_long = 0;
- highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-void highbd_12_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, unsigned int *sse,
- int *sum) {
- uint64_t sse_long = 0;
- uint64_t sum_long = 0;
- highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
-}
-
static void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8,
uint16_t *output_ptr,
@@ -374,35 +209,6 @@ static void highbd_var_filter_block2d_bil_second_pass(
}
}
-#define HIGHBD_VAR(W, H) \
-unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- unsigned int *sse) { \
- int sum; \
- highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (((int64_t)sum * sum) / (W * H)); \
-} \
-\
-unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- unsigned int *sse) { \
- int sum; \
- highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (((int64_t)sum * sum) / (W * H)); \
-} \
-\
-unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- unsigned int *sse) { \
- int sum; \
- highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (((int64_t)sum * sum) / (W * H)); \
-}
-
#define HIGHBD_SUBPIX_VAR(W, H) \
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
@@ -417,7 +223,7 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
- return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
dst_stride, sse); \
} \
\
@@ -434,7 +240,7 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
- return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
} \
\
@@ -451,7 +257,7 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
- return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
}
@@ -471,10 +277,10 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
- vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
- return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
dst_stride, sse); \
} \
\
@@ -493,10 +299,10 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
- vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
- return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
} \
\
@@ -515,137 +321,49 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
- vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
- return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
}
-#define HIGHBD_GET_VAR(S) \
-void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- unsigned int *sse, int *sum) { \
- highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- unsigned int *sse, int *sum) { \
- highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- unsigned int *sse, int *sum) { \
- highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-}
-
-#define HIGHBD_MSE(W, H) \
-unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- unsigned int *sse) { \
- int sum; \
- highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
- return *sse; \
-} \
-\
-unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- unsigned int *sse) { \
- int sum; \
- highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
- return *sse; \
-} \
-\
-unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- unsigned int *sse) { \
- int sum; \
- highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
- return *sse; \
-}
-
-HIGHBD_GET_VAR(8)
-HIGHBD_GET_VAR(16)
-
-HIGHBD_MSE(16, 16)
-HIGHBD_MSE(16, 8)
-HIGHBD_MSE(8, 16)
-HIGHBD_MSE(8, 8)
-
-HIGHBD_VAR(4, 4)
HIGHBD_SUBPIX_VAR(4, 4)
HIGHBD_SUBPIX_AVG_VAR(4, 4)
-HIGHBD_VAR(4, 8)
HIGHBD_SUBPIX_VAR(4, 8)
HIGHBD_SUBPIX_AVG_VAR(4, 8)
-HIGHBD_VAR(8, 4)
HIGHBD_SUBPIX_VAR(8, 4)
HIGHBD_SUBPIX_AVG_VAR(8, 4)
-HIGHBD_VAR(8, 8)
HIGHBD_SUBPIX_VAR(8, 8)
HIGHBD_SUBPIX_AVG_VAR(8, 8)
-HIGHBD_VAR(8, 16)
HIGHBD_SUBPIX_VAR(8, 16)
HIGHBD_SUBPIX_AVG_VAR(8, 16)
-HIGHBD_VAR(16, 8)
HIGHBD_SUBPIX_VAR(16, 8)
HIGHBD_SUBPIX_AVG_VAR(16, 8)
-HIGHBD_VAR(16, 16)
HIGHBD_SUBPIX_VAR(16, 16)
HIGHBD_SUBPIX_AVG_VAR(16, 16)
-HIGHBD_VAR(16, 32)
HIGHBD_SUBPIX_VAR(16, 32)
HIGHBD_SUBPIX_AVG_VAR(16, 32)
-HIGHBD_VAR(32, 16)
HIGHBD_SUBPIX_VAR(32, 16)
HIGHBD_SUBPIX_AVG_VAR(32, 16)
-HIGHBD_VAR(32, 32)
HIGHBD_SUBPIX_VAR(32, 32)
HIGHBD_SUBPIX_AVG_VAR(32, 32)
-HIGHBD_VAR(32, 64)
HIGHBD_SUBPIX_VAR(32, 64)
HIGHBD_SUBPIX_AVG_VAR(32, 64)
-HIGHBD_VAR(64, 32)
HIGHBD_SUBPIX_VAR(64, 32)
HIGHBD_SUBPIX_AVG_VAR(64, 32)
-HIGHBD_VAR(64, 64)
HIGHBD_SUBPIX_VAR(64, 64)
HIGHBD_SUBPIX_AVG_VAR(64, 64)
-
-void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride) {
- int i, j;
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- const int tmp = pred[j] + ref[j];
- comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
- }
- comp_pred += width;
- pred += width;
- ref += ref_stride;
- }
-}
#endif // CONFIG_VP9_HIGHBITDEPTH
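
The vp9_mse* kernels removed from this file return the raw sum of squared differences, i.e. the same accumulation as the variance kernels without subtracting the squared mean; their vpx_mse* replacements behave the same way. A minimal sketch (illustrative name only):

#include <stdint.h>

/* Sketch: MSE-style kernel -- accumulate squared differences only and
 * return *sse, discarding the block sum used by the variance kernels. */
static unsigned int mse_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h, unsigned int *sse) {
  int i, j;
  *sse = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      *sse += diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
  return *sse;
}
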
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 53148f23c..8fc47a850 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -12,31 +12,64 @@
#define VP9_ENCODER_VP9_VARIANCE_H_
#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
-void variance(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h,
- unsigned int *sse, int *sum);
+// TODO(johannkoenig): All functions which depend on
+// [highbd_][8|10|12_]variance should be refactored or moved to vpx_dsp.
+static void variance(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
#if CONFIG_VP9_HIGHBITDEPTH
-void highbd_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h,
- unsigned int *sse, int *sum);
-
-void highbd_10_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h,
- unsigned int *sse, int *sum);
-
-void highbd_12_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h,
- unsigned int *sse, int *sum);
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, uint64_t *sse, uint64_t *sum) {
+ int i, j;
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+}
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (unsigned int)sse_long;
+ *sum = (int)sum_long;
+}
#endif
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
@@ -95,15 +128,6 @@ typedef struct vp9_variance_vtable {
vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
-void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred,
- int width, int height,
- const uint8_t *ref, int ref_stride);
-#endif
-
#ifdef __cplusplus
} // extern "C"
#endif
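
The 10- and 12-bit high-bit-depth variance variants removed above rescale the 64-bit accumulators back to an 8-bit-equivalent range before the usual sse - sum^2/N step: the sum is shifted by 2 and the sse by 4 for 10-bit input, and by 4 and 8 for 12-bit input. A minimal sketch of the 10-bit rescaling (ROUND_POWER_OF_TWO defined locally to match the vpx_ports macro; names illustrative):

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Sketch: fold the 10-bit accumulators down to 8-bit-equivalent values. */
static void highbd_10_scale_sketch(uint64_t sse_long, uint64_t sum_long,
                                   unsigned int *sse, int *sum) {
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
}
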
diff --git a/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
index 4bc3e7e2d..29b7b2782 100644
--- a/vp9/encoder/x86/vp9_highbd_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
@@ -13,237 +13,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
-typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum);
-
-uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum);
-
-uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum);
-
-static void highbd_variance_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- int w, int h, uint32_t *sse, int *sum,
- high_variance_fn_t var_fn, int block_size) {
- int i, j;
-
- *sse = 0;
- *sum = 0;
-
- for (i = 0; i < h; i += block_size) {
- for (j = 0; j < w; j += block_size) {
- unsigned int sse0;
- int sum0;
- var_fn(src + src_stride * i + j, src_stride,
- ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
- *sse += sse0;
- *sum += sum0;
- }
- }
-}
-
-static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- int w, int h, uint32_t *sse, int *sum,
- high_variance_fn_t var_fn, int block_size) {
- int i, j;
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
-
- for (i = 0; i < h; i += block_size) {
- for (j = 0; j < w; j += block_size) {
- unsigned int sse0;
- int sum0;
- var_fn(src + src_stride * i + j, src_stride,
- ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
- sse_long += sse0;
- sum_long += sum0;
- }
- }
- *sum = ROUND_POWER_OF_TWO(sum_long, 2);
- *sse = ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- int w, int h, uint32_t *sse, int *sum,
- high_variance_fn_t var_fn, int block_size) {
- int i, j;
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
-
- for (i = 0; i < h; i += block_size) {
- for (j = 0; j < w; j += block_size) {
- unsigned int sse0;
- int sum0;
- var_fn(src + src_stride * i + j, src_stride,
- ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
- sse_long += sse0;
- sum_long += sum0;
- }
- }
- *sum = ROUND_POWER_OF_TWO(sum_long, 4);
- *sse = ROUND_POWER_OF_TWO(sse_long, 8);
-}
-
-
-#define HIGH_GET_VAR(S) \
-void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, \
- uint32_t *sse, int *sum) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
- sse, sum); \
-} \
-\
-void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, \
- uint32_t *sse, int *sum) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
- sse, sum); \
- *sum = ROUND_POWER_OF_TWO(*sum, 2); \
- *sse = ROUND_POWER_OF_TWO(*sse, 4); \
-} \
-\
-void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, \
- uint32_t *sse, int *sum) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
- sse, sum); \
- *sum = ROUND_POWER_OF_TWO(*sum, 4); \
- *sse = ROUND_POWER_OF_TWO(*sse, 8); \
-}
-
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
-
-#undef HIGH_GET_VAR
-
-#define VAR_FN(w, h, block_size, shift) \
-uint32_t vp9_highbd_variance##w##x##h##_sse2( \
- const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
- int sum; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- vp9_highbd_calc##block_size##x##block_size##var_sse2, \
- block_size); \
- return *sse - (((int64_t)sum * sum) >> shift); \
-} \
-\
-uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
- const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
- int sum; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_10_variance_sse2( \
- src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
- return *sse - (((int64_t)sum * sum) >> shift); \
-} \
-\
-uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
- const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
- int sum; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_12_variance_sse2( \
- src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
- return *sse - (((int64_t)sum * sum) >> shift); \
-}
-
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-
-#undef VAR_FN
-
-unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
- return *sse;
-}
-
-unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
- return *sse;
-}
-
-unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
- return *sse;
-}
-
-unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
- return *sse;
-}
-
-unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
- return *sse;
-}
-
-unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
- return *sse;
-}
-
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
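
The removed SSE2 (and, below, AVX2) wrappers all follow the same pattern: tile the block with a small optimized kernel, accumulate the partial sse/sum pairs, and normalize with a shift equal to log2(w*h), which matches the divide in the C reference. A minimal sketch under those assumptions (get_var_fn is a hypothetical kernel type standing in for the calcNxNvar/getNxNvar primitives):

#include <stdint.h>

/* Sketch: tile a w x h block with a block_size x block_size kernel and
 * normalize with a shift, e.g. shift = 12 for 64x64, 10 for 32x32. */
typedef void (*get_var_fn)(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum);

static unsigned int tiled_variance_sketch(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          int w, int h, int block_size,
                                          int shift, get_var_fn var_fn,
                                          unsigned int *sse) {
  int i, j, sum = 0;
  *sse = 0;
  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      sum += sum0;
    }
  }
  return *sse - (unsigned int)(((int64_t)sum * sum) >> shift);
}
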
diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index ff9f7cca3..8cd071de5 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -13,18 +13,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
-typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum);
-
-void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum);
-
-void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum);
-
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
@@ -42,81 +30,6 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int height,
unsigned int *sseptr);
-static void variance_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int w, int h, unsigned int *sse, int *sum,
- get_var_avx2 var_fn, int block_size) {
- int i, j;
-
- *sse = 0;
- *sum = 0;
-
- for (i = 0; i < h; i += 16) {
- for (j = 0; j < w; j += block_size) {
- unsigned int sse0;
- int sum0;
- var_fn(&src[src_stride * i + j], src_stride,
- &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
- *sse += sse0;
- *sum += sum0;
- }
- }
-}
-
-
-unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vp9_get16x16var_avx2, 16);
- return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse;
-}
-
-unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
- sse, &sum, vp9_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 9);
-}
-
-unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
- sse, &sum, vp9_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 10);
-}
-
-unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
- sse, &sum, vp9_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 12);
-}
-
-unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
- sse, &sum, vp9_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 11);
-}
-
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index cacee7442..961efe34e 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -16,299 +16,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
-typedef void (*variance_fn_t)(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
-
-unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
- __m128i vsum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 32; ++i) {
- const __m128i v = _mm_loadu_si128((const __m128i *)src);
- vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
- src += 8;
- }
-
- vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
- vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
- return _mm_cvtsi128_si32(vsum);
-}
-
-#define READ64(p, stride, i) \
- _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
- _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
-
-static void get4x4var_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
- const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
- const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
- const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
- const __m128i diff0 = _mm_sub_epi16(src0, ref0);
- const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
- // sum
- __m128i vsum = _mm_add_epi16(diff0, diff1);
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
- *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-
- // sse
- vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
- _mm_madd_epi16(diff1, diff1));
- vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
- vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
- *sse = _mm_cvtsi128_si32(vsum);
-}
-
-void vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum) {
- const __m128i zero = _mm_setzero_si128();
- __m128i vsum = _mm_setzero_si128();
- __m128i vsse = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 8; i += 2) {
- const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
- (const __m128i *)(src + i * src_stride)), zero);
- const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
- (const __m128i *)(ref + i * ref_stride)), zero);
- const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-
- const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
- (const __m128i *)(src + (i + 1) * src_stride)), zero);
- const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
- (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
- const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
- vsum = _mm_add_epi16(vsum, diff0);
- vsum = _mm_add_epi16(vsum, diff1);
- vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
- vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
- }
-
- // sum
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
- *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-
- // sse
- vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
- vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
- *sse = _mm_cvtsi128_si32(vsse);
-}
-
-void vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum) {
- const __m128i zero = _mm_setzero_si128();
- __m128i vsum = _mm_setzero_si128();
- __m128i vsse = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 16; ++i) {
- const __m128i s = _mm_loadu_si128((const __m128i *)src);
- const __m128i r = _mm_loadu_si128((const __m128i *)ref);
-
- const __m128i src0 = _mm_unpacklo_epi8(s, zero);
- const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
- const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-
- const __m128i src1 = _mm_unpackhi_epi8(s, zero);
- const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
- const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
- vsum = _mm_add_epi16(vsum, diff0);
- vsum = _mm_add_epi16(vsum, diff1);
- vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
- vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
-
- src += src_stride;
- ref += ref_stride;
- }
-
- // sum
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
- *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
- (int16_t)_mm_extract_epi16(vsum, 1);
-
- // sse
- vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
- vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
- *sse = _mm_cvtsi128_si32(vsse);
-}
-
-
-static void variance_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- int w, int h, unsigned int *sse, int *sum,
- variance_fn_t var_fn, int block_size) {
- int i, j;
-
- *sse = 0;
- *sum = 0;
-
- for (i = 0; i < h; i += block_size) {
- for (j = 0; j < w; j += block_size) {
- unsigned int sse0;
- int sum0;
- var_fn(src + src_stride * i + j, src_stride,
- ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
- *sse += sse0;
- *sum += sum0;
- }
- }
-}
-
-unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse - (((unsigned int)sum * sum) >> 4);
-}
-
-unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
- sse, &sum, get4x4var_sse2, 4);
- return *sse - (((unsigned int)sum * sum) >> 5);
-}
-
-unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
- sse, &sum, get4x4var_sse2, 4);
- return *sse - (((unsigned int)sum * sum) >> 5);
-}
-
-unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse - (((unsigned int)sum * sum) >> 6);
-}
-
-unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
- sse, &sum, vp9_get8x8var_sse2, 8);
- return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
- sse, &sum, vp9_get8x8var_sse2, 8);
- return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
- sse, &sum, vp9_get16x16var_sse2, 16);
- return *sse - (((int64_t)sum * sum) >> 10);
-}
-
-unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
- sse, &sum, vp9_get16x16var_sse2, 16);
- return *sse - (((int64_t)sum * sum) >> 9);
-}
-
-unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
- sse, &sum, vp9_get16x16var_sse2, 16);
- return *sse - (((int64_t)sum * sum) >> 9);
-}
-
-unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
- sse, &sum, vp9_get16x16var_sse2, 16);
- return *sse - (((int64_t)sum * sum) >> 12);
-}
-
-unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
- sse, &sum, vp9_get16x16var_sse2, 16);
- return *sse - (((int64_t)sum * sum) >> 11);
-}
-
-unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
- sse, &sum, vp9_get16x16var_sse2, 16);
- return *sse - (((int64_t)sum * sum) >> 11);
-}
-
-unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
- return *sse;
-}
-
-unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
- return *sse;
-}
-
-unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
- return *sse;
-}
-
-unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
- return *sse;
-}
-
// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index f629d98b8..cbc04888b 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -69,6 +69,7 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h
VP9_COMMON_SRCS-yes += common/vp9_scan.c
VP9_COMMON_SRCS-yes += common/vp9_scan.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index bd0d18c17..5415215c2 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -102,13 +102,11 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
diff --git a/vpx_dsp/arm/variance_media.asm b/vpx_dsp/arm/variance_media.asm
new file mode 100644
index 000000000..f7f9e14b0
--- /dev/null
+++ b/vpx_dsp/arm/variance_media.asm
@@ -0,0 +1,358 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_variance16x16_media|
+ EXPORT |vpx_variance8x8_media|
+ EXPORT |vpx_mse16x16_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vpx_variance16x16_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+
+loop16x16
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop16x16
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vpx_variance8x8_media| PROC
+
+ push {r4-r10, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #8 ; set loop counter to 8 (=block height)
+ mov r4, #0 ; initialize sum = 0
+ mov r5, #0 ; initialize sse = 0
+
+loop8x8
+ ; 1st 4 pixels
+ ldr r6, [r0, #0x0] ; load 4 src pixels
+ ldr r7, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r6, r7 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+ sub r4, r4, r7 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r0, #0x4] ; load 4 src pixels
+ ldr r7, [r2, #0x4] ; load 4 ref pixels
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r6, r7 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+ sub r4, r4, r7 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1 ; next row
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ bne loop8x8
+
+ ; return stuff
+ ldr r8, [sp, #32] ; get address of sse
+ mul r1, r4, r4 ; sum * sum
+ str r5, [r8] ; store sse
+ sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vpx_variance16x16_media. In this function the sum is never
+;      used, so that part of the calculation is omitted.
+
+|vpx_mse16x16_media| PROC
+
+ push {r4-r9, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov r4, #0 ; initialize sse = 0
+
+loopmse
+ ; 1st 4 pixels
+ ldr r5, [r0, #0x0] ; load 4 src pixels
+ ldr r6, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r5, r6 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0x4] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r2, #0x4] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+ ldr r5, [r0, #0x8] ; load 4 src pixels
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r6, [r2, #0x8] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0xc] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r6, [r2, #0xc] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ subs r12, r12, #1 ; next row
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ bne loopmse
+
+ ; return stuff
+ ldr r1, [sp, #28] ; get address of sse
+ mov r0, r4 ; return sse
+ str r4, [r1] ; store sse
+
+ pop {r4-r9, pc}
+
+ ENDP
+
+ END
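For readers less familiar with the ARMv6 media instructions used above, the following is a rough scalar model of one 4-pixel step of vpx_variance16x16_media. It is not part of the patch; the helper name and layout are illustrative only. It shows how usub8/sel split the differences into positive and negative parts, how usad8 sums each part, and how uxtb16/smlad accumulate the squared differences.

#include <stdint.h>

/* Illustrative scalar model of one 4-pixel step in vpx_variance16x16_media.
 * (Hypothetical helper for exposition; the assembly works on packed bytes.) */
static void accumulate_4px(const uint8_t *src, const uint8_t *ref,
                           int *sum, unsigned int *sse) {
  int k;
  for (k = 0; k < 4; ++k) {
    const int pos = src[k] >= ref[k] ? src[k] - ref[k] : 0;  /* usub8 + sel            */
    const int neg = ref[k] >= src[k] ? ref[k] - src[k] : 0;  /* usub8 + sel (reversed) */
    const int abs_diff = pos | neg;                          /* orr of the two masks   */
    *sum += pos;                          /* usad8 over positive differences */
    *sum -= neg;                          /* usad8 over negative differences */
    *sse += abs_diff * abs_diff;          /* uxtb16 + smlad                  */
  }
}

Accumulating sum and sse this way lets the epilogue form the 16x16 result as sse - ((sum * sum) >> 8), exactly as the routine does before returning.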
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c
new file mode 100644
index 000000000..1a9792e6c
--- /dev/null
+++ b/vpx_dsp/arm/variance_neon.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+ const int32x4_t a = vpaddlq_s16(v_16x8);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+ const int64x2_t b = vpaddlq_s32(v_32x4);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+// w * h must be less than 2048 or local variable v_sum may overflow.
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ int i, j;
+ int16x8_t v_sum = vdupq_n_s16(0);
+ int32x4_t v_sse_lo = vdupq_n_s32(0);
+ int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const uint8x8_t v_a = vld1_u8(&a[j]);
+ const uint8x8_t v_b = vld1_u8(&b[j]);
+ const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+ const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+ v_sum = vaddq_s16(v_sum, sv_diff);
+ v_sse_lo = vmlal_s16(v_sse_lo,
+ vget_low_s16(sv_diff),
+ vget_low_s16(sv_diff));
+ v_sse_hi = vmlal_s16(v_sse_hi,
+ vget_high_s16(sv_diff),
+ vget_high_s16(sv_diff));
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(v_sum);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse, int *sum) {
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+}
+
+void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse, int *sum) {
+ variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
+}
+
+unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+ return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / (8 * 8)
+}
+
+unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+ return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / (16 * 16)
+}
+
+unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+ return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / (32 * 32)
+}
+
+unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+ variance_neon_w8(a + (32 * a_stride), a_stride,
+ b + (32 * b_stride), b_stride, 32, 32,
+ &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / (32 * 64)
+}
+
+unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride,
+ b + (16 * b_stride), b_stride, 64, 16,
+ &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / (64 * 32)
+}
+
+unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride,
+ b + (16 * b_stride), b_stride, 64, 16,
+ &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
+ b + (16 * 2 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
+ b + (16 * 3 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / (64 * 64)
+}
+
+unsigned int vpx_variance16x8_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 4; i++) {
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
+
+ q2u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_variance8x16_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d2u8, d4u8, d6u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint16x8_t q11u16, q12u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) {
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
+
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d2u8, d6u8);
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_mse16x16_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ int64x1_t d0s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ int32x4_t q7s32, q8s32, q9s32, q10s32;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int64x2_t q1s64;
+
+ q7s32 = vdupq_n_s32(0);
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) { // mse16x16_neon_loop
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q2u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+ q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+ q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q7s32 = vaddq_s32(q7s32, q8s32);
+ q9s32 = vaddq_s32(q9s32, q10s32);
+ q10s32 = vaddq_s32(q7s32, q9s32);
+
+ q1s64 = vpaddlq_s32(q10s32);
+ d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+ return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
+
+unsigned int vpx_get4x4sse_cs_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride) {
+ int16x4_t d22s16, d24s16, d26s16, d28s16;
+ int64x1_t d0s64;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int32x4_t q7s32, q8s32, q9s32, q10s32;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int64x2_t q1s64;
+
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d1u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d5u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d3u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d7u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d1u8, d5u8);
+ q13u16 = vsubl_u8(d2u8, d6u8);
+ q14u16 = vsubl_u8(d3u8, d7u8);
+
+ d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
+ d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+ d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+ d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+
+ q7s32 = vmull_s16(d22s16, d22s16);
+ q8s32 = vmull_s16(d24s16, d24s16);
+ q9s32 = vmull_s16(d26s16, d26s16);
+ q10s32 = vmull_s16(d28s16, d28s16);
+
+ q7s32 = vaddq_s32(q7s32, q8s32);
+ q9s32 = vaddq_s32(q9s32, q10s32);
+ q9s32 = vaddq_s32(q7s32, q9s32);
+
+ q1s64 = vpaddlq_s32(q9s32);
+ d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
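variance_neon_w8 above keeps its running sum in 16-bit NEON lanes, which is why the source bounds w * h and why the larger block sizes call the helper once per strip (64x16 or 32x32) and merge the partial results before applying sse - sum^2 / (w * h). The sketch below makes that composition explicit; it is not part of the patch, and strip_var is a scalar stand-in for variance_neon_w8.

#include <stdint.h>

/* Scalar stand-in for variance_neon_w8(): accumulate sum and sse over one
 * w x h strip. The NEON version bounds the strip size so its 16-bit sum
 * lanes cannot overflow. */
static void strip_var(const uint8_t *a, int a_stride,
                      const uint8_t *b, int b_stride,
                      int w, int h, unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
}

/* Composition pattern mirrored from vpx_variance64x64_neon: process 64x16
 * strips, merge the partials, then subtract sum^2 / (64 * 64). */
static unsigned int variance64x64_by_strips(const uint8_t *a, int a_stride,
                                            const uint8_t *b, int b_stride,
                                            unsigned int *sse) {
  unsigned int sse_part;
  int sum = 0, sum_part, row;
  *sse = 0;
  for (row = 0; row < 64; row += 16) {
    strip_var(a + row * a_stride, a_stride,
              b + row * b_stride, b_stride, 64, 16, &sse_part, &sum_part);
    *sse += sse_part;
    sum += sum_part;
  }
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);  // >> 12 = / (64 * 64)
}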
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 9783e4363..c0c3ff996 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -33,6 +33,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
return sad;
}
+// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
* The function averages every corresponding element of the buffers and stores
* the value in a third buffer, comp_pred.
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
new file mode 100644
index 000000000..084dd7b7e
--- /dev/null
+++ b/vpx_dsp/variance.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride) {
+ int distortion = 0;
+ int r, c;
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int diff = a[c] - b[c];
+ distortion += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ return distortion;
+}
+
+unsigned int vpx_get_mb_ss_c(const int16_t *a) {
+ unsigned int i, sum = 0;
+
+ for (i = 0; i < 256; ++i) {
+ sum += a[i] * a[i];
+ }
+
+ return sum;
+}
+
+static void variance(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+#define VAR(W, H) \
+unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value via pass-by-reference instead of returning
+ * sse - sum^2 / (w * h).
+ */
+#define GET_VAR(W, H) \
+void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse, int *sum) { \
+ variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+}
+
+/* Identical to the variance call except it does not calculate
+ * sse - sum^2 / (w * h) and returns sse in addition to modifying the
+ * passed-in variable.
+ */
+#define MSE(W, H) \
+unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse; \
+}
+
+VAR(64, 64)
+VAR(64, 32)
+VAR(32, 64)
+VAR(32, 32)
+VAR(32, 16)
+VAR(16, 32)
+VAR(16, 16)
+VAR(16, 8)
+VAR(8, 16)
+VAR(8, 8)
+VAR(8, 4)
+VAR(4, 8)
+VAR(4, 4)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int i, j;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, uint64_t *sse, uint64_t *sum) {
+ int i, j;
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (unsigned int)sse_long;
+ *sum = (int)sum_long;
+}
+
+static void highbd_10_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static void highbd_12_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
+
+#define HIGHBD_VAR(W, H) \
+unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ const uint8_t *b, \
+ int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ const uint8_t *b, \
+ int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ const uint8_t *b, \
+ int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define HIGHBD_GET_VAR(S) \
+void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse, int *sum) { \
+ highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse, int *sum) { \
+ highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse, int *sum) { \
+ highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+
+#define HIGHBD_MSE(W, H) \
+unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
+ int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+} \
+\
+unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
+ int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+} \
+\
+unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
+ int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+}
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+HIGHBD_VAR(64, 64)
+HIGHBD_VAR(64, 32)
+HIGHBD_VAR(32, 64)
+HIGHBD_VAR(32, 32)
+HIGHBD_VAR(32, 16)
+HIGHBD_VAR(16, 32)
+HIGHBD_VAR(16, 16)
+HIGHBD_VAR(16, 8)
+HIGHBD_VAR(8, 16)
+HIGHBD_VAR(8, 8)
+HIGHBD_VAR(8, 4)
+HIGHBD_VAR(4, 8)
+HIGHBD_VAR(4, 4)
+
+void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ int i, j;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
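The VAR, GET_VAR and MSE macros above (and their highbd counterparts) all lean on the same identity, variance = sse - sum^2 / (w * h), so a single pass over the block yields the variance, the MSE, and the raw (sse, sum) pair. A tiny self-contained check of that identity follows; it is illustrative only, the sample data is made up, and the macros themselves truncate via integer division or shifts rather than computing in floating point.

#include <stdint.h>
#include <stdio.h>

/* Check that sse - sum^2/N matches the mean-subtracted definition of the
 * (unnormalized) variance on a small 4x4 example. */
int main(void) {
  const uint8_t a[16] = { 10, 12, 13, 11,  9, 10, 14, 12,
                          11, 13, 10,  9, 12, 11, 10, 13 };
  const uint8_t b[16] = {  9, 12, 15, 10,  9, 11, 12, 12,
                          10, 14, 10,  8, 13, 11,  9, 12 };
  int64_t sum = 0, sse = 0;
  double mean, direct = 0.0;
  int i;

  for (i = 0; i < 16; ++i) {
    const int diff = a[i] - b[i];
    sum += diff;
    sse += diff * diff;
  }
  /* identity form used by the VAR macros (before integer truncation) */
  printf("sse - sum^2/N     = %f\n", (double)sse - (double)(sum * sum) / 16.0);

  /* direct form: sum of squared, mean-subtracted differences */
  mean = (double)sum / 16.0;
  for (i = 0; i < 16; ++i) {
    const int diff = a[i] - b[i];
    direct += (diff - mean) * (diff - mean);
  }
  printf("sum((d - mean)^2) = %f\n", direct);
  return 0;
}

Both lines print the same value, since sum((d - mean)^2) expands to sse - sum^2/N.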
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 606515d2c..f23534adc 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -17,6 +17,7 @@ DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
+
DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
@@ -29,9 +30,28 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS
+ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
+DSP_SRCS-yes += variance.c
+
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
+
+DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
+DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
+endif # CONFIG_VP9_HIGHBITDEPTH
+endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ebec9ec06..55271cf9c 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -392,4 +392,212 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS
+if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+
+add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance64x64 sse2 avx2 neon/;
+
+add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance64x32 sse2 avx2 neon/;
+
+add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance32x64 sse2 neon/;
+
+add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance32x32 sse2 avx2 neon/;
+
+add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance32x16 sse2 avx2/;
+
+add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance16x32 sse2/;
+
+add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/;
+
+add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance16x8 mmx sse2 neon/;
+
+add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance8x16 mmx sse2 neon/;
+
+add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance8x8 mmx sse2 media neon/;
+
+add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance8x4 sse2/;
+
+add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance4x8 sse2/;
+
+add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance4x4 mmx sse2/;
+
+
+add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_get16x16var sse2 avx2 neon/;
+
+add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_get8x8var mmx sse2 neon/;
+
+add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/;
+
+add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_mse16x8 sse2/;
+
+add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_mse8x16 sse2/;
+
+add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_mse8x8 sse2/;
+
+add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
+ specialize qw/vpx_get_mb_ss mmx sse2/;
+
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
+ specialize qw/vpx_get4x4sse_cs neon/;
+
+add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance64x64 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance64x32 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance32x64 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance32x32 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance32x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance16x32 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance16x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance16x8 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance8x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance8x8 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance64x64 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance64x32 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance32x64 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance32x32 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance32x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance16x32 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance16x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance16x8 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance8x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance8x8 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance64x64 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance64x32 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance32x64 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance32x32 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance32x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance16x32 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance16x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance16x8 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance8x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance8x8 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse16x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse8x8 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse16x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse8x8 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse16x16 sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse8x8 sse2/;
+
+ add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+} # CONFIG_VP9_HIGHBITDEPTH
+} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+
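+# Note on the entries above (explanatory, not part of the generated API):
+# add_proto registers a prototype with the run-time CPU detection (RTCD)
+# framework, and specialize lists the SIMD flavours that may be dispatched to
+# at run time, e.g.
+#   add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "...";
+#   specialize qw/vpx_highbd_10_variance16x16 sse2/;
+# selects the SSE2 kernel when available and the C fallback otherwise;
+# prototypes without a specialize line only have the C implementation.
+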
1;
diff --git a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_variance_impl_sse2.asm
index 821dd0660..923418a99 100644
--- a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_variance_impl_sse2.asm
@@ -11,7 +11,7 @@
%include "vpx_ports/x86_abi_support.asm"
-;unsigned int vp9_highbd_calc16x16var_sse2
+;unsigned int vpx_highbd_calc16x16var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
@@ -20,8 +20,8 @@
; unsigned int * SSE,
; int * Sum
;)
-global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
-sym(vp9_highbd_calc16x16var_sse2):
+global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
+sym(vpx_highbd_calc16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -164,7 +164,7 @@ sym(vp9_highbd_calc16x16var_sse2):
ret
-;unsigned int vp9_highbd_calc8x8var_sse2
+;unsigned int vpx_highbd_calc8x8var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
@@ -173,8 +173,8 @@ sym(vp9_highbd_calc16x16var_sse2):
; unsigned int * SSE,
; int * Sum
;)
-global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
-sym(vp9_highbd_calc8x8var_sse2):
+global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
+sym(vpx_highbd_calc8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 000000000..343c0478b
--- /dev/null
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
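+/* For 10- and 12-bit input the accumulated sum and sse are scaled back by
+ * 2^(bd - 8) and 2^(2 * (bd - 8)) respectively (sum >> 2, sse >> 4 for
+ * 10-bit; sum >> 4, sse >> 8 for 12-bit) so the 32-bit outputs stay in the
+ * same range as the 8-bit metrics. */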
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+
+#define HIGH_GET_VAR(S) \
+void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+} \
+\
+void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+\
+void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+
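+/* Instantiate the 16x16 and 8x8 get-var wrappers: each calls the
+ * corresponding vpx_highbd_calc*var_sse2 assembly kernel, and the 10- and
+ * 12-bit variants additionally round-shift sum and sse as described above. */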
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, \
+ block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+}
+
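+/* The last VAR_FN argument is log2(w * h), so each generated function returns
+ * sse - sum^2 / (w * h); e.g. VAR_FN(64, 64, 16, 12) covers 64 * 64 = 4096
+ * pixels and therefore shifts by 12. */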
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
+
+unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+ sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+ sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+ sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
new file mode 100644
index 000000000..82cef4af0
--- /dev/null
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_dsp_rtcd.h"
+
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
+
+void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
+
+static void variance_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, unsigned int *sse, int *sum,
+ get_var_avx2 var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += 16) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(&src[src_stride * i + j], src_stride,
+ &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+
+unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vpx_get16x16var_avx2, 16);
+ return *sse - (((unsigned int)sum * sum) >> 8);
+}
+
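+// vpx_mse16x16_avx2() returns the raw sum of squared differences; unlike the
+// variance functions, the sum^2 / N mean term is not subtracted.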
+unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+ return *sse;
+}
+
+unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
+ sse, &sum, vpx_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
+ sse, &sum, vpx_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
+ sse, &sum, vpx_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
+ sse, &sum, vpx_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 11);
+}
diff --git a/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c b/vpx_dsp/x86/variance_impl_avx2.c
index ee76a315f..0e40959aa 100644
--- a/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c
+++ b/vpx_dsp/x86/variance_impl_avx2.c
@@ -10,9 +10,9 @@
#include <immintrin.h> // AVX2
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
-void vp9_get16x16var_avx2(const unsigned char *src_ptr,
+void vpx_get16x16var_avx2(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
@@ -123,7 +123,7 @@ void vp9_get16x16var_avx2(const unsigned char *src_ptr,
}
}
-void vp9_get32x32var_avx2(const unsigned char *src_ptr,
+void vpx_get32x32var_avx2(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm
new file mode 100644
index 000000000..a8d7d99db
--- /dev/null
+++ b/vpx_dsp/x86/variance_impl_mmx.asm
@@ -0,0 +1,424 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
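+; Returns the sum of squares of 256 int16 values (one 16x16 macroblock of
+; residuals): 16 iterations of 16 values each, accumulated with pmaddwd into
+; mm4 and then folded to a scalar.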
+global sym(vpx_get_mb_ss_mmx) PRIVATE
+sym(vpx_get_mb_ss_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 8
+ ; end prolog
+
+ mov rax, arg(0) ;src_ptr
+ mov rcx, 16
+ pxor mm4, mm4
+
+.NEXTROW:
+ movq mm0, [rax]
+ movq mm1, [rax+8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+
+ paddd mm4, mm0
+ paddd mm4, mm1
+ paddd mm4, mm2
+ paddd mm4, mm3
+
+ add rax, 32
+ dec rcx
+ ja .NEXTROW
+ movq QWORD PTR [rsp], mm4
+
+ ;return sum[0]+sum[1];
+ movsxd rax, dword ptr [rsp]
+ movsxd rcx, dword ptr [rsp+4]
+ add rax, rcx
+
+
+ ; begin epilog
+ add rsp, 8
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vpx_get8x8var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vpx_get8x8var_mmx) PRIVATE
+sym(vpx_get8x8var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+ pxor mm5, mm5 ; Blank mm5 (accumulates sum of differences)
+ pxor mm6, mm6 ; Blank mm6 (zero register for unpacking)
+ pxor mm7, mm7 ; Blank mm7 (accumulates sum of squared differences)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 5
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ ; movq mm4, [rbx + rdx]
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 6
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 7
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 8
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void
+;vpx_get4x4var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vpx_get4x4var_mmx) PRIVATE
+sym(vpx_get4x4var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+ pxor mm5, mm5 ; Blank mm5 (accumulates sum of differences)
+ pxor mm6, mm6 ; Blank mm6 (zero register for unpacking)
+ pxor mm7, mm7 ; Blank mm7 (accumulates sum of squared differences)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movd mm0, [rax] ; Copy four bytes to mm0
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Row 2
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movd mm0, [rax] ; Copy four bytes to mm0
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c
new file mode 100644
index 000000000..99dd741bc
--- /dev/null
+++ b/vpx_dsp/x86/variance_mmx.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+
+extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse, int *sum);
+
+unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
+ *sse = var;
+ return (var - (((unsigned int)avg * avg) >> 4));
+}
+
+unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
+ *sse = var;
+
+ return (var - (((unsigned int)avg * avg) >> 6));
+}
+
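+/* The 16x16, 16x8 and 8x16 functions below are built from the 8x8 MMX kernel:
+ * the per-quadrant (or per-half) sums and SSEs are accumulated before the
+ * final variance or MSE is derived. */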
+unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3;
+
+ vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+ vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+ vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+ b + 8 * b_stride, b_stride, &sse2, &sum2);
+ vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
+ b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ *sse = var;
+ return var;
+}
+
+unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+ vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+ vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+ vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+ b + 8 * b_stride, b_stride, &sse2, &sum2);
+ vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
+ b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - (((unsigned int)avg * avg) >> 8));
+}
+
+unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+ vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - (((unsigned int)avg * avg) >> 7));
+}
+
+unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+ vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+ b + 8 * b_stride, b_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+
+ return (var - (((unsigned int)avg * avg) >> 7));
+}
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
new file mode 100644
index 000000000..6256bc536
--- /dev/null
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+
+typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse, int *sum);
+
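+// Sum of squares of 256 int16 values (one 16x16 macroblock of residuals):
+// 32 loads of 8 values each, squared and accumulated with pmaddwd, then
+// horizontally reduced.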
+unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
+ __m128i vsum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 32; ++i) {
+ const __m128i v = _mm_loadu_si128((const __m128i *)src);
+ vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+ src += 8;
+ }
+
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ return _mm_cvtsi128_si32(vsum);
+}
+
+#define READ64(p, stride, i) \
+ _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+ _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
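+// READ64 packs two 4-pixel rows (rows i and i + 1) into the low 8 bytes of an
+// xmm register; the byte interleaving is harmless because only sums of
+// differences and of squared differences are needed.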
+
+static void get4x4var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
+ const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ // sum
+ __m128i vsum = _mm_add_epi16(diff0, diff1);
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
+ _mm_madd_epi16(diff1, diff1));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ *sse = _mm_cvtsi128_si32(vsum);
+}
+
+void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; i += 2) {
+ const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + i * src_stride)), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + i * ref_stride)), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + (i + 1) * src_stride)), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+ }
+
+ // sum
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+}
+
+void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ const __m128i s = _mm_loadu_si128((const __m128i *)src);
+ const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+
+ const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+ const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ // sum
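+ // The full 16x16 sum of differences can reach +/-16 * 16 * 255, which
+ // overflows int16_t, so the horizontal reduction stops at two 16-bit lanes
+ // and the final addition is done on sign-extended ints.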
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
+ (int16_t)_mm_extract_epi16(vsum, 1);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+}
+
+
+static void variance_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ int w, int h, unsigned int *sse, int *sum,
+ getNxMvar_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+ return *sse - (((unsigned int)sum * sum) >> 4);
+}
+
+unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
+ sse, &sum, get4x4var_sse2, 4);
+ return *sse - (((unsigned int)sum * sum) >> 5);
+}
+
+unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
+ sse, &sum, get4x4var_sse2, 4);
+ return *sse - (((unsigned int)sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+ return *sse - (((unsigned int)sum * sum) >> 6);
+}
+
+unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
+ sse, &sum, vpx_get8x8var_sse2, 8);
+ return *sse - (((unsigned int)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
+ sse, &sum, vpx_get8x8var_sse2, 8);
+ return *sse - (((unsigned int)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
+ const unsigned char *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+ return *sse - (((unsigned int)sum * sum) >> 8);
+}
+
+unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
+ sse, &sum, vpx_get16x16var_sse2, 16);
+ return *sse - (((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
+ sse, &sum, vpx_get16x16var_sse2, 16);
+ return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
+ sse, &sum, vpx_get16x16var_sse2, 16);
+ return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
+ sse, &sum, vpx_get16x16var_sse2, 16);
+ return *sse - (((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
+ sse, &sum, vpx_get16x16var_sse2, 16);
+ return *sse - (((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
+ sse, &sum, vpx_get16x16var_sse2, 16);
+ return *sse - (((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
diff --git a/vpx_ports/msvc.h b/vpx_ports/msvc.h
new file mode 100644
index 000000000..43a36e761
--- /dev/null
+++ b/vpx_ports/msvc.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_MSVC_H_
+#define VPX_PORTS_MSVC_H_
+#ifdef _MSC_VER
+
+#include "./vpx_config.h"
+
+# if _MSC_VER < 1900 // VS2015 provides snprintf
+# define snprintf _snprintf
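+// Note: unlike C99 snprintf, _snprintf does not guarantee NUL termination
+// when the output is truncated, so callers should not rely on it here.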
+# endif // _MSC_VER < 1900
+
+#endif // _MSC_VER
+#endif // VPX_PORTS_MSVC_H_
diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk
index dfc75ab6f..ab7fc4ac7 100644
--- a/vpx_ports/vpx_ports.mk
+++ b/vpx_ports/vpx_ports.mk
@@ -12,6 +12,7 @@
PORTS_SRCS-yes += vpx_ports.mk
PORTS_SRCS-yes += mem.h
+PORTS_SRCS-yes += msvc.h
PORTS_SRCS-yes += vpx_timer.h
ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)