diff options
91 files changed, 4480 insertions, 5399 deletions
@@ -14,9 +14,7 @@ #include <limits.h> #include "args.h" -#ifdef _MSC_VER -#define snprintf _snprintf -#endif +#include "vpx_ports/msvc.h" #if defined(__GNUC__) && __GNUC__ extern void die(const char *fmt, ...) __attribute__((noreturn)); @@ -184,6 +184,10 @@ if [ ${doxy_major:-0} -ge 1 ]; then [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen fi +# disable codecs when their source directory does not exist +[ -d "${source_path}/vp8" ] || disable_feature vp8 +[ -d "${source_path}/vp9" ] || disable_feature vp9 + # install everything except the sources, by default. sources will have # to be enabled when doing dist builds, since that's no longer a common # case. @@ -199,31 +203,16 @@ enable_feature multithread enable_feature os_support enable_feature temporal_denoising -[ -d "${source_path}/../include" ] && enable_feature alt_tree_layout -for d in vp8 vp9; do - [ -d "${source_path}/${d}" ] && disable_feature alt_tree_layout; -done - -if ! enabled alt_tree_layout; then -# development environment -[ -d "${source_path}/vp8" ] && CODECS="${CODECS} vp8_encoder vp8_decoder" -[ -d "${source_path}/vp9" ] && CODECS="${CODECS} vp9_encoder vp9_decoder" -else -# customer environment -[ -f "${source_path}/../include/vpx/vp8cx.h" ] && CODECS="${CODECS} vp8_encoder" -[ -f "${source_path}/../include/vpx/vp8dx.h" ] && CODECS="${CODECS} vp8_decoder" -[ -f "${source_path}/../include/vpx/vp9cx.h" ] && CODECS="${CODECS} vp9_encoder" -[ -f "${source_path}/../include/vpx/vp9dx.h" ] && CODECS="${CODECS} vp9_decoder" -[ -f "${source_path}/../include/vpx/vp8cx.h" ] || disable_feature vp8_encoder -[ -f "${source_path}/../include/vpx/vp8dx.h" ] || disable_feature vp8_decoder -[ -f "${source_path}/../include/vpx/vp9cx.h" ] || disable_feature vp9_encoder -[ -f "${source_path}/../include/vpx/vp9dx.h" ] || disable_feature vp9_decoder - -[ -f "${source_path}/../lib/*/*mt.lib" ] && soft_enable static_msvcrt -fi - -CODECS="$(echo ${CODECS} | tr ' ' '\n')" -CODEC_FAMILIES="$(for c in ${CODECS}; do echo ${c%_*}; done | sort | uniq)" +CODECS=" + vp8_encoder + vp8_decoder + vp9_encoder + vp9_decoder +" +CODEC_FAMILIES=" + vp8 + vp9 +" ARCH_LIST=" arm @@ -255,7 +244,6 @@ HAVE_LIST=" ${ARCH_EXT_LIST} vpx_ports stdint_h - alt_tree_layout pthread_h sys_mman_h unistd_h diff --git a/examples.mk b/examples.mk index b92507a6f..174c71d10 100644 --- a/examples.mk +++ b/examples.mk @@ -56,6 +56,7 @@ UTILS-$(CONFIG_DECODERS) += vpxdec.c vpxdec.SRCS += md5_utils.c md5_utils.h vpxdec.SRCS += vpx_ports/mem_ops.h vpxdec.SRCS += vpx_ports/mem_ops_aligned.h +vpxdec.SRCS += vpx_ports/msvc.h vpxdec.SRCS += vpx_ports/vpx_timer.h vpxdec.SRCS += vpx/vpx_integer.h vpxdec.SRCS += args.c args.h @@ -80,6 +81,7 @@ vpxenc.SRCS += tools_common.c tools_common.h vpxenc.SRCS += warnings.c warnings.h vpxenc.SRCS += vpx_ports/mem_ops.h vpxenc.SRCS += vpx_ports/mem_ops_aligned.h +vpxenc.SRCS += vpx_ports/msvc.h vpxenc.SRCS += vpx_ports/vpx_timer.h vpxenc.SRCS += vpxstats.c vpxstats.h ifeq ($(CONFIG_LIBYUV),yes) @@ -98,6 +100,7 @@ ifeq ($(CONFIG_SPATIAL_SVC),yes) vp9_spatial_svc_encoder.SRCS += tools_common.c tools_common.h vp9_spatial_svc_encoder.SRCS += video_common.h vp9_spatial_svc_encoder.SRCS += video_writer.h video_writer.c + vp9_spatial_svc_encoder.SRCS += vpx_ports/msvc.h vp9_spatial_svc_encoder.SRCS += vpxstats.c vpxstats.h vp9_spatial_svc_encoder.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder @@ -112,6 +115,7 @@ vpx_temporal_svc_encoder.SRCS += ivfenc.c ivfenc.h vpx_temporal_svc_encoder.SRCS += tools_common.c tools_common.h vpx_temporal_svc_encoder.SRCS += video_common.h vpx_temporal_svc_encoder.SRCS += video_writer.h video_writer.c +vpx_temporal_svc_encoder.SRCS += vpx_ports/msvc.h vpx_temporal_svc_encoder.GUID = B18C08F2-A439-4502-A78E-849BE3D60947 vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder EXAMPLES-$(CONFIG_DECODERS) += simple_decoder.c @@ -122,6 +126,7 @@ simple_decoder.SRCS += video_common.h simple_decoder.SRCS += video_reader.h video_reader.c simple_decoder.SRCS += vpx_ports/mem_ops.h simple_decoder.SRCS += vpx_ports/mem_ops_aligned.h +simple_decoder.SRCS += vpx_ports/msvc.h simple_decoder.DESCRIPTION = Simplified decoder loop EXAMPLES-$(CONFIG_DECODERS) += postproc.c postproc.SRCS += ivfdec.h ivfdec.c @@ -130,6 +135,7 @@ postproc.SRCS += video_common.h postproc.SRCS += video_reader.h video_reader.c postproc.SRCS += vpx_ports/mem_ops.h postproc.SRCS += vpx_ports/mem_ops_aligned.h +postproc.SRCS += vpx_ports/msvc.h postproc.GUID = 65E33355-F35E-4088-884D-3FD4905881D7 postproc.DESCRIPTION = Decoder postprocessor control EXAMPLES-$(CONFIG_DECODERS) += decode_to_md5.c @@ -140,6 +146,7 @@ decode_to_md5.SRCS += video_common.h decode_to_md5.SRCS += video_reader.h video_reader.c decode_to_md5.SRCS += vpx_ports/mem_ops.h decode_to_md5.SRCS += vpx_ports/mem_ops_aligned.h +decode_to_md5.SRCS += vpx_ports/msvc.h decode_to_md5.GUID = 59120B9B-2735-4BFE-B022-146CA340FE42 decode_to_md5.DESCRIPTION = Frame by frame MD5 checksum EXAMPLES-$(CONFIG_ENCODERS) += simple_encoder.c @@ -147,6 +154,7 @@ simple_encoder.SRCS += ivfenc.h ivfenc.c simple_encoder.SRCS += tools_common.h tools_common.c simple_encoder.SRCS += video_common.h simple_encoder.SRCS += video_writer.h video_writer.c +simple_encoder.SRCS += vpx_ports/msvc.h simple_encoder.GUID = 4607D299-8A71-4D2C-9B1D-071899B6FBFD simple_encoder.DESCRIPTION = Simplified encoder loop EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9_lossless_encoder.c @@ -154,6 +162,7 @@ vp9_lossless_encoder.SRCS += ivfenc.h ivfenc.c vp9_lossless_encoder.SRCS += tools_common.h tools_common.c vp9_lossless_encoder.SRCS += video_common.h vp9_lossless_encoder.SRCS += video_writer.h video_writer.c +vp9_lossless_encoder.SRCS += vpx_ports/msvc.h vp9_lossless_encoder.GUID = B63C7C88-5348-46DC-A5A6-CC151EF93366 vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder EXAMPLES-$(CONFIG_ENCODERS) += twopass_encoder.c @@ -161,6 +170,7 @@ twopass_encoder.SRCS += ivfenc.h ivfenc.c twopass_encoder.SRCS += tools_common.h tools_common.c twopass_encoder.SRCS += video_common.h twopass_encoder.SRCS += video_writer.h video_writer.c +twopass_encoder.SRCS += vpx_ports/msvc.h twopass_encoder.GUID = 73494FA6-4AF9-4763-8FBB-265C92402FD8 twopass_encoder.DESCRIPTION = Two-pass encoder loop EXAMPLES-$(CONFIG_DECODERS) += decode_with_drops.c @@ -170,6 +180,7 @@ decode_with_drops.SRCS += video_common.h decode_with_drops.SRCS += video_reader.h video_reader.c decode_with_drops.SRCS += vpx_ports/mem_ops.h decode_with_drops.SRCS += vpx_ports/mem_ops_aligned.h +decode_with_drops.SRCS += vpx_ports/msvc.h decode_with_drops.GUID = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26 decode_with_drops.DESCRIPTION = Drops frames while decoding EXAMPLES-$(CONFIG_ENCODERS) += set_maps.c @@ -177,6 +188,7 @@ set_maps.SRCS += ivfenc.h ivfenc.c set_maps.SRCS += tools_common.h tools_common.c set_maps.SRCS += video_common.h set_maps.SRCS += video_writer.h video_writer.c +set_maps.SRCS += vpx_ports/msvc.h set_maps.GUID = ECB2D24D-98B8-4015-A465-A4AF3DCC145F set_maps.DESCRIPTION = Set active and ROI maps EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c @@ -184,6 +196,7 @@ vp8cx_set_ref.SRCS += ivfenc.h ivfenc.c vp8cx_set_ref.SRCS += tools_common.h tools_common.c vp8cx_set_ref.SRCS += video_common.h vp8cx_set_ref.SRCS += video_writer.h video_writer.c +vp8cx_set_ref.SRCS += vpx_ports/msvc.h vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame @@ -194,6 +207,7 @@ EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_multi_resolution_encoder.c vp8_multi_resolution_encoder.SRCS += ivfenc.h ivfenc.c vp8_multi_resolution_encoder.SRCS += tools_common.h tools_common.c vp8_multi_resolution_encoder.SRCS += video_writer.h video_writer.c +vp8_multi_resolution_encoder.SRCS += vpx_ports/msvc.h vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS) vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 48a8006af..b37d8e353 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -20,6 +20,7 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 5c0b09bb3..4c12bb49b 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -20,6 +20,7 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" diff --git a/test/test_vectors.cc b/test/test_vectors.cc index 07d306ff4..434a38251 100644 --- a/test/test_vectors.cc +++ b/test/test_vectors.cc @@ -165,7 +165,10 @@ const char *const kVP9TestVectors[] = { "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm", "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf", "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf", +#if !CONFIG_SIZE_LIMIT || \ + (DECODE_WIDTH_LIMIT >= 20400 && DECODE_HEIGHT_LIMIT >= 120) "vp90-2-13-largescaling.webm", +#endif "vp90-2-14-resize-fp-tiles-1-16.webm", "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm", "vp90-2-14-resize-fp-tiles-1-2.webm", "vp90-2-14-resize-fp-tiles-1-4.webm", diff --git a/test/variance_test.cc b/test/variance_test.cc index e4e27af7c..e45d90fae 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -21,22 +21,45 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#if CONFIG_VP8_ENCODER -# include "./vp8_rtcd.h" -# include "vp8/common/variance.h" -#endif #if CONFIG_VP9_ENCODER # include "./vp9_rtcd.h" # include "vp9/encoder/vp9_variance.h" -#endif +#endif // CONFIG_VP9_ENCODER +#include "./vpx_dsp_rtcd.h" namespace { +typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); +typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); + + using ::std::tr1::get; using ::std::tr1::make_tuple; using ::std::tr1::tuple; using libvpx_test::ACMRandom; +// Truncate high bit depth results by downshifting (with rounding) by: +// 2 * (bit_depth - 8) for sse +// (bit_depth - 8) for se +static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) { + switch (bit_depth) { + case VPX_BITS_12: + *sse = (*sse + 128) >> 8; + *se = (*se + 8) >> 4; + break; + case VPX_BITS_10: + *sse = (*sse + 8) >> 4; + *se = (*se + 2) >> 2; + break; + case VPX_BITS_8: + default: + break; + } +} + static unsigned int mb_ss_ref(const int16_t *src) { unsigned int res = 0; for (int i = 0; i < 256; ++i) { @@ -50,7 +73,6 @@ static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref, int ref_stride_coeff, uint32_t *sse_ptr, bool use_high_bit_depth_, vpx_bit_depth_t bit_depth) { -#if CONFIG_VP9_HIGHBITDEPTH int64_t se = 0; uint64_t sse = 0; const int w = 1 << l2w; @@ -63,32 +85,17 @@ static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref, src[w * y * src_stride_coeff + x]; se += diff; sse += diff * diff; +#if CONFIG_VP9_HIGHBITDEPTH } else { diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] - CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x]; se += diff; sse += diff * diff; +#endif // CONFIG_VP9_HIGHBITDEPTH } } } - if (bit_depth > VPX_BITS_8) { - sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8)); - se = ROUND_POWER_OF_TWO(se, bit_depth - 8); - } -#else - int se = 0; - unsigned int sse = 0; - const int w = 1 << l2w; - const int h = 1 << l2h; - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - int diff = ref[w * y * ref_stride_coeff + x] - - src[w * y * src_stride_coeff + x]; - se += diff; - sse += diff * diff; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH + RoundHighBitDepth(bit_depth, &se, &sse); *sse_ptr = sse; return sse - (((int64_t) se * se) >> (l2w + l2h)); } @@ -98,7 +105,6 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, unsigned int *sse_ptr, bool use_high_bit_depth_, vpx_bit_depth_t bit_depth) { -#if CONFIG_VP9_HIGHBITDEPTH int64_t se = 0; uint64_t sse = 0; const int w = 1 << l2w; @@ -117,6 +123,7 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, const int diff = r - src[w * y + x]; se += diff; sse += diff * diff; +#if CONFIG_VP9_HIGHBITDEPTH } else { uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); uint16_t *src16 = CONVERT_TO_SHORTPTR(src); @@ -130,34 +137,11 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, const int diff = r - src16[w * y + x]; se += diff; sse += diff * diff; +#endif // CONFIG_VP9_HIGHBITDEPTH } } } - if (bit_depth > VPX_BITS_8) { - sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8)); - se = ROUND_POWER_OF_TWO(se, bit_depth - 8); - } -#else - int se = 0; - unsigned int sse = 0; - const int w = 1 << l2w; - const int h = 1 << l2h; - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - // Bilinear interpolation at a 16th pel step. - const int a1 = ref[(w + 1) * (y + 0) + x + 0]; - const int a2 = ref[(w + 1) * (y + 0) + x + 1]; - const int b1 = ref[(w + 1) * (y + 1) + x + 0]; - const int b2 = ref[(w + 1) * (y + 1) + x + 1]; - const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); - const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); - const int r = a + (((b - a) * yoff + 8) >> 4); - const int diff = r - src[w * y + x]; - se += diff; - sse += diff * diff; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH + RoundHighBitDepth(bit_depth, &se, &sse); *sse_ptr = sse; return sse - (((int64_t) se * se) >> (l2w + l2h)); } @@ -229,36 +213,30 @@ class VarianceTest rnd_.Reset(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2)); ref_ = new uint8_t[block_size_ * 2]; +#if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t)))); ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2)); - ref_ = new uint8_t[block_size_ * 2]; -#endif ASSERT_TRUE(src_ != NULL); ASSERT_TRUE(ref_ != NULL); } virtual void TearDown() { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { vpx_free(src_); delete[] ref_; +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_free(CONVERT_TO_SHORTPTR(src_)); delete[] CONVERT_TO_SHORTPTR(ref_); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - vpx_free(src_); - delete[] ref_; -#endif libvpx_test::ClearSystemState(); } @@ -283,27 +261,23 @@ class VarianceTest template<typename VarianceFunctionType> void VarianceTest<VarianceFunctionType>::ZeroTest() { for (int i = 0; i <= 255; ++i) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { memset(src_, i, block_size_); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8), block_size_); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - memset(src_, i, block_size_); -#endif for (int j = 0; j <= 255; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { memset(ref_, j, block_size_); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j << (bit_depth_ - 8), block_size_); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - memset(ref_, j, block_size_); -#endif unsigned int sse; unsigned int var; ASM_REGISTER_STATE_CHECK( @@ -317,18 +291,15 @@ template<typename VarianceFunctionType> void VarianceTest<VarianceFunctionType>::RefTest() { for (int i = 0; i < 10; ++i) { for (int j = 0; j < block_size_; j++) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { src_[j] = rnd_.Rand8(); ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH } else { CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() && mask_; CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() && mask_; +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - src_[j] = rnd_.Rand8(); - ref_[j] = rnd_.Rand8(); -#endif } unsigned int sse1, sse2; unsigned int var1; @@ -352,18 +323,15 @@ void VarianceTest<VarianceFunctionType>::RefStrideTest() { for (int j = 0; j < block_size_; j++) { int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_; int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { src_[src_ind] = rnd_.Rand8(); ref_[ref_ind] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH } else { CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() && mask_; CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() && mask_; +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - src_[src_ind] = rnd_.Rand8(); - ref_[ref_ind] = rnd_.Rand8(); -#endif } unsigned int sse1, sse2; unsigned int var1; @@ -383,22 +351,18 @@ void VarianceTest<VarianceFunctionType>::RefStrideTest() { template<typename VarianceFunctionType> void VarianceTest<VarianceFunctionType>::OneQuarterTest() { const int half = block_size_ / 2; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { memset(src_, 255, block_size_); memset(ref_, 255, half); memset(ref_ + half, 0, half); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8), block_size_); vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - memset(src_, 255, block_size_); - memset(ref_, 255, half); - memset(ref_ + half, 0, half); -#endif unsigned int sse; unsigned int var; ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse)); @@ -406,7 +370,6 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() { EXPECT_EQ(expected, var); } -#if CONFIG_VP8_ENCODER template<typename MseFunctionType> class MseTest : public ::testing::TestWithParam<tuple<int, int, MseFunctionType> > { @@ -500,9 +463,7 @@ void MseTest<MseFunctionType>::MaxTest_sse() { const unsigned int expected = block_size_ * 255 * 255; EXPECT_EQ(expected, var); } -#endif -#if CONFIG_VP9_ENCODER unsigned int subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, @@ -511,7 +472,6 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref, unsigned int *sse_ptr, bool use_high_bit_depth, vpx_bit_depth_t bit_depth) { -#if CONFIG_VP9_HIGHBITDEPTH int64_t se = 0; uint64_t sse = 0; const int w = 1 << l2w; @@ -530,6 +490,7 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref, const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; se += diff; sse += diff * diff; +#if CONFIG_VP9_HIGHBITDEPTH } else { uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); uint16_t *src16 = CONVERT_TO_SHORTPTR(src); @@ -544,34 +505,11 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref, const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x]; se += diff; sse += diff * diff; +#endif // CONFIG_VP9_HIGHBITDEPTH } } } - if (bit_depth > 8) { - sse = ROUND_POWER_OF_TWO(sse, 2*(bit_depth-8)); - se = ROUND_POWER_OF_TWO(se, bit_depth-8); - } -#else - int se = 0; - unsigned int sse = 0; - const int w = 1 << l2w; - const int h = 1 << l2h; - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - // bilinear interpolation at a 16th pel step - const int a1 = ref[(w + 1) * (y + 0) + x + 0]; - const int a2 = ref[(w + 1) * (y + 0) + x + 1]; - const int b1 = ref[(w + 1) * (y + 1) + x + 0]; - const int b2 = ref[(w + 1) * (y + 1) + x + 1]; - const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); - const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); - const int r = a + (((b - a) * yoff + 8) >> 4); - const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; - se += diff; - sse += diff * diff; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH + RoundHighBitDepth(bit_depth, &se, &sse); *sse_ptr = sse; return sse - (((int64_t) se * se) >> (l2w + l2h)); } @@ -600,11 +538,11 @@ class SubpelVarianceTest rnd_.Reset(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; +#if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR( reinterpret_cast<uint16_t *>( @@ -614,33 +552,25 @@ class SubpelVarianceTest vpx_memalign(16, block_size_*sizeof(uint16_t)))); ref_ = CONVERT_TO_BYTEPTR( new uint16_t[block_size_ + width_ + height_ + 1]); - } -#else - src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); - sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); - ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; #endif // CONFIG_VP9_HIGHBITDEPTH + } ASSERT_TRUE(src_ != NULL); ASSERT_TRUE(sec_ != NULL); ASSERT_TRUE(ref_ != NULL); } virtual void TearDown() { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { vpx_free(src_); delete[] ref_; vpx_free(sec_); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_free(CONVERT_TO_SHORTPTR(src_)); delete[] CONVERT_TO_SHORTPTR(ref_); vpx_free(CONVERT_TO_SHORTPTR(sec_)); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - vpx_free(src_); - delete[] ref_; - vpx_free(sec_); -#endif libvpx_test::ClearSystemState(); } @@ -664,7 +594,6 @@ template<typename SubpelVarianceFunctionType> void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { for (int j = 0; j < block_size_; j++) { src_[j] = rnd_.Rand8(); @@ -672,6 +601,7 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() { for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { ref_[j] = rnd_.Rand8(); } +#if CONFIG_VP9_HIGHBITDEPTH } else { for (int j = 0; j < block_size_; j++) { CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; @@ -679,15 +609,8 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() { for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; } - } -#else - for (int j = 0; j < block_size_; j++) { - src_[j] = rnd_.Rand8(); - } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - ref_[j] = rnd_.Rand8(); - } #endif // CONFIG_VP9_HIGHBITDEPTH + } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y, @@ -710,25 +633,20 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { const int half = block_size_ / 2; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { memset(src_, 0, half); memset(src_ + half, 255, half); memset(ref_, 255, half); memset(ref_ + half, 0, half + width_ + height_ + 1); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half); vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_, half + width_ + height_ + 1); - } -#else - memset(src_, 0, half); - memset(src_ + half, 255, half); - memset(ref_, 255, half); - memset(ref_ + half, 0, half + width_ + height_ + 1); #endif // CONFIG_VP9_HIGHBITDEPTH + } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( @@ -742,11 +660,11 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() { } } +#if CONFIG_VP9_ENCODER template<> void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { for (int j = 0; j < block_size_; j++) { src_[j] = rnd_.Rand8(); @@ -755,6 +673,7 @@ void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() { for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { ref_[j] = rnd_.Rand8(); } +#if CONFIG_VP9_HIGHBITDEPTH } else { for (int j = 0; j < block_size_; j++) { CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; @@ -763,16 +682,8 @@ void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() { for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; } +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - for (int j = 0; j < block_size_; j++) { - src_[j] = rnd_.Rand8(); - sec_[j] = rnd_.Rand8(); - } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - ref_[j] = rnd_.Rand8(); - } -#endif unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( @@ -788,272 +699,407 @@ void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() { } } } - #endif // CONFIG_VP9_ENCODER -// ----------------------------------------------------------------------------- -// VP8 test cases. - -namespace vp8 { - -#if CONFIG_VP8_ENCODER -typedef unsigned int (*vp8_sse_fn_t)(const unsigned char *src_ptr, - int source_stride, const unsigned char *ref_ptr, int ref_stride); - -typedef MseTest<vp8_sse_fn_t> VP8SseTest; -typedef MseTest<vp8_variance_fn_t> VP8MseTest; -typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest; - -TEST_P(VP8SseTest, Ref_sse) { RefTest_sse(); } -TEST_P(VP8SseTest, Max_sse) { MaxTest_sse(); } -TEST_P(VP8MseTest, Ref_mse) { RefTest_mse(); } -TEST_P(VP8MseTest, Max_mse) { MaxTest_mse(); } -TEST_P(VP8VarianceTest, Zero) { ZeroTest(); } -TEST_P(VP8VarianceTest, Ref) { RefTest(); } -TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); } +typedef MseTest<Get4x4SseFunc> VpxSseTest; +typedef MseTest<VarianceMxNFunc> VpxMseTest; +typedef VarianceTest<VarianceMxNFunc> VpxVarianceTest; + +TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); } +TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); } +TEST_P(VpxMseTest, Ref_mse) { RefTest_mse(); } +TEST_P(VpxMseTest, Max_mse) { MaxTest_mse(); } +TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } +TEST_P(VpxVarianceTest, Ref) { RefTest(); } +TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } +TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(SumOfSquaresTest, Const) { ConstTest(); } +TEST_P(SumOfSquaresTest, Ref) { RefTest(); } -const vp8_sse_fn_t get4x4sse_cs_c = vp8_get4x4sse_cs_c; -INSTANTIATE_TEST_CASE_P( - C, VP8SseTest, - ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c))); +INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_c)); + +const Get4x4SseFunc get4x4sse_cs_c = vpx_get4x4sse_cs_c; +INSTANTIATE_TEST_CASE_P(C, VpxSseTest, + ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c))); + +const VarianceMxNFunc mse16x16_c = vpx_mse16x16_c; +const VarianceMxNFunc mse16x8_c = vpx_mse16x8_c; +const VarianceMxNFunc mse8x16_c = vpx_mse8x16_c; +const VarianceMxNFunc mse8x8_c = vpx_mse8x8_c; +INSTANTIATE_TEST_CASE_P(C, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_c), + make_tuple(4, 3, mse16x8_c), + make_tuple(3, 4, mse8x16_c), + make_tuple(3, 3, mse8x8_c))); + +const VarianceMxNFunc variance64x64_c = vpx_variance64x64_c; +const VarianceMxNFunc variance64x32_c = vpx_variance64x32_c; +const VarianceMxNFunc variance32x64_c = vpx_variance32x64_c; +const VarianceMxNFunc variance32x32_c = vpx_variance32x32_c; +const VarianceMxNFunc variance32x16_c = vpx_variance32x16_c; +const VarianceMxNFunc variance16x32_c = vpx_variance16x32_c; +const VarianceMxNFunc variance16x16_c = vpx_variance16x16_c; +const VarianceMxNFunc variance16x8_c = vpx_variance16x8_c; +const VarianceMxNFunc variance8x16_c = vpx_variance8x16_c; +const VarianceMxNFunc variance8x8_c = vpx_variance8x8_c; +const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c; +const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c; +const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c; -const vp8_variance_fn_t mse16x16_c = vp8_mse16x16_c; INSTANTIATE_TEST_CASE_P( - C, VP8MseTest, - ::testing::Values(make_tuple(4, 4, mse16x16_c))); - -const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c; -const vp8_variance_fn_t variance8x8_c = vp8_variance8x8_c; -const vp8_variance_fn_t variance8x16_c = vp8_variance8x16_c; -const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c; -const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c; -INSTANTIATE_TEST_CASE_P( - C, VP8VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_c, 0), - make_tuple(3, 3, variance8x8_c, 0), - make_tuple(3, 4, variance8x16_c, 0), + C, VpxVarianceTest, + ::testing::Values(make_tuple(6, 6, variance64x64_c, 0), + make_tuple(6, 5, variance64x32_c, 0), + make_tuple(5, 6, variance32x64_c, 0), + make_tuple(5, 5, variance32x32_c, 0), + make_tuple(5, 4, variance32x16_c, 0), + make_tuple(4, 5, variance16x32_c, 0), + make_tuple(4, 4, variance16x16_c, 0), make_tuple(4, 3, variance16x8_c, 0), - make_tuple(4, 4, variance16x16_c, 0))); + make_tuple(3, 4, variance8x16_c, 0), + make_tuple(3, 3, variance8x8_c, 0), + make_tuple(3, 2, variance8x4_c, 0), + make_tuple(2, 3, variance4x8_c, 0), + make_tuple(2, 2, variance4x4_c, 0))); -#if HAVE_NEON -const vp8_sse_fn_t get4x4sse_cs_neon = vp8_get4x4sse_cs_neon; -INSTANTIATE_TEST_CASE_P( - NEON, VP8SseTest, - ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon))); +#if CONFIG_VP9_HIGHBITDEPTH +typedef MseTest<VarianceMxNFunc> VpxHBDMseTest; +typedef VarianceTest<VarianceMxNFunc> VpxHBDVarianceTest; + +TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); } +TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); } +TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); } +TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } +TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } +TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } + +/* TODO(debargha): This test does not support the highbd version +const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c; +const VarianceMxNFunc highbd_12_mse16x8_c = vpx_highbd_12_mse16x8_c; +const VarianceMxNFunc highbd_12_mse8x16_c = vpx_highbd_12_mse8x16_c; +const VarianceMxNFunc highbd_12_mse8x8_c = vpx_highbd_12_mse8x8_c; + +const VarianceMxNFunc highbd_10_mse16x16_c = vpx_highbd_10_mse16x16_c; +const VarianceMxNFunc highbd_10_mse16x8_c = vpx_highbd_10_mse16x8_c; +const VarianceMxNFunc highbd_10_mse8x16_c = vpx_highbd_10_mse8x16_c; +const VarianceMxNFunc highbd_10_mse8x8_c = vpx_highbd_10_mse8x8_c; + +const VarianceMxNFunc highbd_8_mse16x16_c = vpx_highbd_8_mse16x16_c; +const VarianceMxNFunc highbd_8_mse16x8_c = vpx_highbd_8_mse16x8_c; +const VarianceMxNFunc highbd_8_mse8x16_c = vpx_highbd_8_mse8x16_c; +const VarianceMxNFunc highbd_8_mse8x8_c = vpx_highbd_8_mse8x8_c; -const vp8_variance_fn_t mse16x16_neon = vp8_mse16x16_neon; INSTANTIATE_TEST_CASE_P( - NEON, VP8MseTest, - ::testing::Values(make_tuple(4, 4, mse16x16_neon))); - -const vp8_variance_fn_t variance8x8_neon = vp8_variance8x8_neon; -const vp8_variance_fn_t variance8x16_neon = vp8_variance8x16_neon; -const vp8_variance_fn_t variance16x8_neon = vp8_variance16x8_neon; -const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon; + C, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_c), + make_tuple(4, 4, highbd_12_mse16x8_c), + make_tuple(4, 4, highbd_12_mse8x16_c), + make_tuple(4, 4, highbd_12_mse8x8_c), + make_tuple(4, 4, highbd_10_mse16x16_c), + make_tuple(4, 4, highbd_10_mse16x8_c), + make_tuple(4, 4, highbd_10_mse8x16_c), + make_tuple(4, 4, highbd_10_mse8x8_c), + make_tuple(4, 4, highbd_8_mse16x16_c), + make_tuple(4, 4, highbd_8_mse16x8_c), + make_tuple(4, 4, highbd_8_mse8x16_c), + make_tuple(4, 4, highbd_8_mse8x8_c))); +*/ + + +const VarianceMxNFunc highbd_12_variance64x64_c = vpx_highbd_12_variance64x64_c; +const VarianceMxNFunc highbd_12_variance64x32_c = vpx_highbd_12_variance64x32_c; +const VarianceMxNFunc highbd_12_variance32x64_c = vpx_highbd_12_variance32x64_c; +const VarianceMxNFunc highbd_12_variance32x32_c = vpx_highbd_12_variance32x32_c; +const VarianceMxNFunc highbd_12_variance32x16_c = vpx_highbd_12_variance32x16_c; +const VarianceMxNFunc highbd_12_variance16x32_c = vpx_highbd_12_variance16x32_c; +const VarianceMxNFunc highbd_12_variance16x16_c = vpx_highbd_12_variance16x16_c; +const VarianceMxNFunc highbd_12_variance16x8_c = vpx_highbd_12_variance16x8_c; +const VarianceMxNFunc highbd_12_variance8x16_c = vpx_highbd_12_variance8x16_c; +const VarianceMxNFunc highbd_12_variance8x8_c = vpx_highbd_12_variance8x8_c; +const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c; +const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c; +const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c; + +const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c; +const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c; +const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c; +const VarianceMxNFunc highbd_10_variance32x32_c = vpx_highbd_10_variance32x32_c; +const VarianceMxNFunc highbd_10_variance32x16_c = vpx_highbd_10_variance32x16_c; +const VarianceMxNFunc highbd_10_variance16x32_c = vpx_highbd_10_variance16x32_c; +const VarianceMxNFunc highbd_10_variance16x16_c = vpx_highbd_10_variance16x16_c; +const VarianceMxNFunc highbd_10_variance16x8_c = vpx_highbd_10_variance16x8_c; +const VarianceMxNFunc highbd_10_variance8x16_c = vpx_highbd_10_variance8x16_c; +const VarianceMxNFunc highbd_10_variance8x8_c = vpx_highbd_10_variance8x8_c; +const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c; +const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c; +const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c; + +const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c; +const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c; +const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c; +const VarianceMxNFunc highbd_8_variance32x32_c = vpx_highbd_8_variance32x32_c; +const VarianceMxNFunc highbd_8_variance32x16_c = vpx_highbd_8_variance32x16_c; +const VarianceMxNFunc highbd_8_variance16x32_c = vpx_highbd_8_variance16x32_c; +const VarianceMxNFunc highbd_8_variance16x16_c = vpx_highbd_8_variance16x16_c; +const VarianceMxNFunc highbd_8_variance16x8_c = vpx_highbd_8_variance16x8_c; +const VarianceMxNFunc highbd_8_variance8x16_c = vpx_highbd_8_variance8x16_c; +const VarianceMxNFunc highbd_8_variance8x8_c = vpx_highbd_8_variance8x8_c; +const VarianceMxNFunc highbd_8_variance8x4_c = vpx_highbd_8_variance8x4_c; +const VarianceMxNFunc highbd_8_variance4x8_c = vpx_highbd_8_variance4x8_c; +const VarianceMxNFunc highbd_8_variance4x4_c = vpx_highbd_8_variance4x4_c; INSTANTIATE_TEST_CASE_P( - NEON, VP8VarianceTest, - ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0), - make_tuple(3, 4, variance8x16_neon, 0), - make_tuple(4, 3, variance16x8_neon, 0), - make_tuple(4, 4, variance16x16_neon, 0))); -#endif + C, VpxHBDVarianceTest, + ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_c, 12), + make_tuple(6, 5, highbd_12_variance64x32_c, 12), + make_tuple(5, 6, highbd_12_variance32x64_c, 12), + make_tuple(5, 5, highbd_12_variance32x32_c, 12), + make_tuple(5, 4, highbd_12_variance32x16_c, 12), + make_tuple(4, 5, highbd_12_variance16x32_c, 12), + make_tuple(4, 4, highbd_12_variance16x16_c, 12), + make_tuple(4, 3, highbd_12_variance16x8_c, 12), + make_tuple(3, 4, highbd_12_variance8x16_c, 12), + make_tuple(3, 3, highbd_12_variance8x8_c, 12), + make_tuple(3, 2, highbd_12_variance8x4_c, 12), + make_tuple(2, 3, highbd_12_variance4x8_c, 12), + make_tuple(2, 2, highbd_12_variance4x4_c, 12), + make_tuple(6, 6, highbd_10_variance64x64_c, 10), + make_tuple(6, 5, highbd_10_variance64x32_c, 10), + make_tuple(5, 6, highbd_10_variance32x64_c, 10), + make_tuple(5, 5, highbd_10_variance32x32_c, 10), + make_tuple(5, 4, highbd_10_variance32x16_c, 10), + make_tuple(4, 5, highbd_10_variance16x32_c, 10), + make_tuple(4, 4, highbd_10_variance16x16_c, 10), + make_tuple(4, 3, highbd_10_variance16x8_c, 10), + make_tuple(3, 4, highbd_10_variance8x16_c, 10), + make_tuple(3, 3, highbd_10_variance8x8_c, 10), + make_tuple(3, 2, highbd_10_variance8x4_c, 10), + make_tuple(2, 3, highbd_10_variance4x8_c, 10), + make_tuple(2, 2, highbd_10_variance4x4_c, 10), + make_tuple(6, 6, highbd_8_variance64x64_c, 8), + make_tuple(6, 5, highbd_8_variance64x32_c, 8), + make_tuple(5, 6, highbd_8_variance32x64_c, 8), + make_tuple(5, 5, highbd_8_variance32x32_c, 8), + make_tuple(5, 4, highbd_8_variance32x16_c, 8), + make_tuple(4, 5, highbd_8_variance16x32_c, 8), + make_tuple(4, 4, highbd_8_variance16x16_c, 8), + make_tuple(4, 3, highbd_8_variance16x8_c, 8), + make_tuple(3, 4, highbd_8_variance8x16_c, 8), + make_tuple(3, 3, highbd_8_variance8x8_c, 8), + make_tuple(3, 2, highbd_8_variance8x4_c, 8), + make_tuple(2, 3, highbd_8_variance4x8_c, 8), + make_tuple(2, 2, highbd_8_variance4x4_c, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_MMX -const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx; -const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx; -const vp8_variance_fn_t variance8x16_mmx = vp8_variance8x16_mmx; -const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx; -const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx; +const VarianceMxNFunc mse16x16_mmx = vpx_mse16x16_mmx; +INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_mmx))); + +INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_mmx)); + +const VarianceMxNFunc variance16x16_mmx = vpx_variance16x16_mmx; +const VarianceMxNFunc variance16x8_mmx = vpx_variance16x8_mmx; +const VarianceMxNFunc variance8x16_mmx = vpx_variance8x16_mmx; +const VarianceMxNFunc variance8x8_mmx = vpx_variance8x8_mmx; +const VarianceMxNFunc variance4x4_mmx = vpx_variance4x4_mmx; INSTANTIATE_TEST_CASE_P( - MMX, VP8VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_mmx, 0), - make_tuple(3, 3, variance8x8_mmx, 0), - make_tuple(3, 4, variance8x16_mmx, 0), + MMX, VpxVarianceTest, + ::testing::Values(make_tuple(4, 4, variance16x16_mmx, 0), make_tuple(4, 3, variance16x8_mmx, 0), - make_tuple(4, 4, variance16x16_mmx, 0))); -#endif + make_tuple(3, 4, variance8x16_mmx, 0), + make_tuple(3, 3, variance8x8_mmx, 0), + make_tuple(2, 2, variance4x4_mmx, 0))); +#endif // HAVE_MMX #if HAVE_SSE2 -const vp8_variance_fn_t variance4x4_wmt = vp8_variance4x4_wmt; -const vp8_variance_fn_t variance8x8_wmt = vp8_variance8x8_wmt; -const vp8_variance_fn_t variance8x16_wmt = vp8_variance8x16_wmt; -const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt; -const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt; +INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_sse2)); + +const VarianceMxNFunc mse16x16_sse2 = vpx_mse16x16_sse2; +const VarianceMxNFunc mse16x8_sse2 = vpx_mse16x8_sse2; +const VarianceMxNFunc mse8x16_sse2 = vpx_mse8x16_sse2; +const VarianceMxNFunc mse8x8_sse2 = vpx_mse8x8_sse2; +INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_sse2), + make_tuple(4, 3, mse16x8_sse2), + make_tuple(3, 4, mse8x16_sse2), + make_tuple(3, 3, mse8x8_sse2))); + +const VarianceMxNFunc variance64x64_sse2 = vpx_variance64x64_sse2; +const VarianceMxNFunc variance64x32_sse2 = vpx_variance64x32_sse2; +const VarianceMxNFunc variance32x64_sse2 = vpx_variance32x64_sse2; +const VarianceMxNFunc variance32x32_sse2 = vpx_variance32x32_sse2; +const VarianceMxNFunc variance32x16_sse2 = vpx_variance32x16_sse2; +const VarianceMxNFunc variance16x32_sse2 = vpx_variance16x32_sse2; +const VarianceMxNFunc variance16x16_sse2 = vpx_variance16x16_sse2; +const VarianceMxNFunc variance16x8_sse2 = vpx_variance16x8_sse2; +const VarianceMxNFunc variance8x16_sse2 = vpx_variance8x16_sse2; +const VarianceMxNFunc variance8x8_sse2 = vpx_variance8x8_sse2; +const VarianceMxNFunc variance8x4_sse2 = vpx_variance8x4_sse2; +const VarianceMxNFunc variance4x8_sse2 = vpx_variance4x8_sse2; +const VarianceMxNFunc variance4x4_sse2 = vpx_variance4x4_sse2; INSTANTIATE_TEST_CASE_P( - SSE2, VP8VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_wmt, 0), - make_tuple(3, 3, variance8x8_wmt, 0), - make_tuple(3, 4, variance8x16_wmt, 0), - make_tuple(4, 3, variance16x8_wmt, 0), - make_tuple(4, 4, variance16x16_wmt, 0))); -#endif -#endif // CONFIG_VP8_ENCODER - -} // namespace vp8 + SSE2, VpxVarianceTest, + ::testing::Values(make_tuple(6, 6, variance64x64_sse2, 0), + make_tuple(6, 5, variance64x32_sse2, 0), + make_tuple(5, 6, variance32x64_sse2, 0), + make_tuple(5, 5, variance32x32_sse2, 0), + make_tuple(5, 4, variance32x16_sse2, 0), + make_tuple(4, 5, variance16x32_sse2, 0), + make_tuple(4, 4, variance16x16_sse2, 0), + make_tuple(4, 3, variance16x8_sse2, 0), + make_tuple(3, 4, variance8x16_sse2, 0), + make_tuple(3, 3, variance8x8_sse2, 0), + make_tuple(3, 2, variance8x4_sse2, 0), + make_tuple(2, 3, variance4x8_sse2, 0), + make_tuple(2, 2, variance4x4_sse2, 0))); +#if CONFIG_VP9_HIGHBITDEPTH +/* TODO(debargha): This test does not support the highbd version +const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2; +const VarianceMxNFunc highbd_12_mse16x8_sse2 = vpx_highbd_12_mse16x8_sse2; +const VarianceMxNFunc highbd_12_mse8x16_sse2 = vpx_highbd_12_mse8x16_sse2; +const VarianceMxNFunc highbd_12_mse8x8_sse2 = vpx_highbd_12_mse8x8_sse2; + +const VarianceMxNFunc highbd_10_mse16x16_sse2 = vpx_highbd_10_mse16x16_sse2; +const VarianceMxNFunc highbd_10_mse16x8_sse2 = vpx_highbd_10_mse16x8_sse2; +const VarianceMxNFunc highbd_10_mse8x16_sse2 = vpx_highbd_10_mse8x16_sse2; +const VarianceMxNFunc highbd_10_mse8x8_sse2 = vpx_highbd_10_mse8x8_sse2; + +const VarianceMxNFunc highbd_8_mse16x16_sse2 = vpx_highbd_8_mse16x16_sse2; +const VarianceMxNFunc highbd_8_mse16x8_sse2 = vpx_highbd_8_mse16x8_sse2; +const VarianceMxNFunc highbd_8_mse8x16_sse2 = vpx_highbd_8_mse8x16_sse2; +const VarianceMxNFunc highbd_8_mse8x8_sse2 = vpx_highbd_8_mse8x8_sse2; -// ----------------------------------------------------------------------------- -// VP9 test cases. +INSTANTIATE_TEST_CASE_P( + SSE2, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_sse2), + make_tuple(4, 3, highbd_12_mse16x8_sse2), + make_tuple(3, 4, highbd_12_mse8x16_sse2), + make_tuple(3, 3, highbd_12_mse8x8_sse2), + make_tuple(4, 4, highbd_10_mse16x16_sse2), + make_tuple(4, 3, highbd_10_mse16x8_sse2), + make_tuple(3, 4, highbd_10_mse8x16_sse2), + make_tuple(3, 3, highbd_10_mse8x8_sse2), + make_tuple(4, 4, highbd_8_mse16x16_sse2), + make_tuple(4, 3, highbd_8_mse16x8_sse2), + make_tuple(3, 4, highbd_8_mse8x16_sse2), + make_tuple(3, 3, highbd_8_mse8x8_sse2))); +*/ + +const VarianceMxNFunc highbd_12_variance64x64_sse2 = + vpx_highbd_12_variance64x64_sse2; +const VarianceMxNFunc highbd_12_variance64x32_sse2 = + vpx_highbd_12_variance64x32_sse2; +const VarianceMxNFunc highbd_12_variance32x64_sse2 = + vpx_highbd_12_variance32x64_sse2; +const VarianceMxNFunc highbd_12_variance32x32_sse2 = + vpx_highbd_12_variance32x32_sse2; +const VarianceMxNFunc highbd_12_variance32x16_sse2 = + vpx_highbd_12_variance32x16_sse2; +const VarianceMxNFunc highbd_12_variance16x32_sse2 = + vpx_highbd_12_variance16x32_sse2; +const VarianceMxNFunc highbd_12_variance16x16_sse2 = + vpx_highbd_12_variance16x16_sse2; +const VarianceMxNFunc highbd_12_variance16x8_sse2 = + vpx_highbd_12_variance16x8_sse2; +const VarianceMxNFunc highbd_12_variance8x16_sse2 = + vpx_highbd_12_variance8x16_sse2; +const VarianceMxNFunc highbd_12_variance8x8_sse2 = + vpx_highbd_12_variance8x8_sse2; +const VarianceMxNFunc highbd_10_variance64x64_sse2 = + vpx_highbd_10_variance64x64_sse2; +const VarianceMxNFunc highbd_10_variance64x32_sse2 = + vpx_highbd_10_variance64x32_sse2; +const VarianceMxNFunc highbd_10_variance32x64_sse2 = + vpx_highbd_10_variance32x64_sse2; +const VarianceMxNFunc highbd_10_variance32x32_sse2 = + vpx_highbd_10_variance32x32_sse2; +const VarianceMxNFunc highbd_10_variance32x16_sse2 = + vpx_highbd_10_variance32x16_sse2; +const VarianceMxNFunc highbd_10_variance16x32_sse2 = + vpx_highbd_10_variance16x32_sse2; +const VarianceMxNFunc highbd_10_variance16x16_sse2 = + vpx_highbd_10_variance16x16_sse2; +const VarianceMxNFunc highbd_10_variance16x8_sse2 = + vpx_highbd_10_variance16x8_sse2; +const VarianceMxNFunc highbd_10_variance8x16_sse2 = + vpx_highbd_10_variance8x16_sse2; +const VarianceMxNFunc highbd_10_variance8x8_sse2 = + vpx_highbd_10_variance8x8_sse2; +const VarianceMxNFunc highbd_8_variance64x64_sse2 = + vpx_highbd_8_variance64x64_sse2; +const VarianceMxNFunc highbd_8_variance64x32_sse2 = + vpx_highbd_8_variance64x32_sse2; +const VarianceMxNFunc highbd_8_variance32x64_sse2 = + vpx_highbd_8_variance32x64_sse2; +const VarianceMxNFunc highbd_8_variance32x32_sse2 = + vpx_highbd_8_variance32x32_sse2; +const VarianceMxNFunc highbd_8_variance32x16_sse2 = + vpx_highbd_8_variance32x16_sse2; +const VarianceMxNFunc highbd_8_variance16x32_sse2 = + vpx_highbd_8_variance16x32_sse2; +const VarianceMxNFunc highbd_8_variance16x16_sse2 = + vpx_highbd_8_variance16x16_sse2; +const VarianceMxNFunc highbd_8_variance16x8_sse2 = + vpx_highbd_8_variance16x8_sse2; +const VarianceMxNFunc highbd_8_variance8x16_sse2 = + vpx_highbd_8_variance8x16_sse2; +const VarianceMxNFunc highbd_8_variance8x8_sse2 = + vpx_highbd_8_variance8x8_sse2; -namespace vp9 { +INSTANTIATE_TEST_CASE_P( + SSE2, VpxHBDVarianceTest, + ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_sse2, 12), + make_tuple(6, 5, highbd_12_variance64x32_sse2, 12), + make_tuple(5, 6, highbd_12_variance32x64_sse2, 12), + make_tuple(5, 5, highbd_12_variance32x32_sse2, 12), + make_tuple(5, 4, highbd_12_variance32x16_sse2, 12), + make_tuple(4, 5, highbd_12_variance16x32_sse2, 12), + make_tuple(4, 4, highbd_12_variance16x16_sse2, 12), + make_tuple(4, 3, highbd_12_variance16x8_sse2, 12), + make_tuple(3, 4, highbd_12_variance8x16_sse2, 12), + make_tuple(3, 3, highbd_12_variance8x8_sse2, 12), + make_tuple(6, 6, highbd_10_variance64x64_sse2, 10), + make_tuple(6, 5, highbd_10_variance64x32_sse2, 10), + make_tuple(5, 6, highbd_10_variance32x64_sse2, 10), + make_tuple(5, 5, highbd_10_variance32x32_sse2, 10), + make_tuple(5, 4, highbd_10_variance32x16_sse2, 10), + make_tuple(4, 5, highbd_10_variance16x32_sse2, 10), + make_tuple(4, 4, highbd_10_variance16x16_sse2, 10), + make_tuple(4, 3, highbd_10_variance16x8_sse2, 10), + make_tuple(3, 4, highbd_10_variance8x16_sse2, 10), + make_tuple(3, 3, highbd_10_variance8x8_sse2, 10), + make_tuple(6, 6, highbd_8_variance64x64_sse2, 8), + make_tuple(6, 5, highbd_8_variance64x32_sse2, 8), + make_tuple(5, 6, highbd_8_variance32x64_sse2, 8), + make_tuple(5, 5, highbd_8_variance32x32_sse2, 8), + make_tuple(5, 4, highbd_8_variance32x16_sse2, 8), + make_tuple(4, 5, highbd_8_variance16x32_sse2, 8), + make_tuple(4, 4, highbd_8_variance16x16_sse2, 8), + make_tuple(4, 3, highbd_8_variance16x8_sse2, 8), + make_tuple(3, 4, highbd_8_variance8x16_sse2, 8), + make_tuple(3, 3, highbd_8_variance8x8_sse2, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 #if CONFIG_VP9_ENCODER -TEST_P(SumOfSquaresTest, Const) { ConstTest(); } -TEST_P(SumOfSquaresTest, Ref) { RefTest(); } - -INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, - ::testing::Values(vp9_get_mb_ss_c)); - -typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest; typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest; typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest; -TEST_P(VP9VarianceTest, Zero) { ZeroTest(); } -TEST_P(VP9VarianceTest, Ref) { RefTest(); } -TEST_P(VP9VarianceTest, RefStride) { RefStrideTest(); } TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); } TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); } -TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); } #if CONFIG_VP9_HIGHBITDEPTH -typedef VarianceTest<vp9_variance_fn_t> VP9VarianceHighTest; typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceHighTest; typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceHighTest; -TEST_P(VP9VarianceHighTest, Zero) { ZeroTest(); } -TEST_P(VP9VarianceHighTest, Ref) { RefTest(); } -TEST_P(VP9VarianceHighTest, RefStride) { RefStrideTest(); } TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); } TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); } -TEST_P(VP9VarianceHighTest, OneQuarter) { OneQuarterTest(); } #endif // CONFIG_VP9_HIGHBITDEPTH -const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c; -const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c; -const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c; -const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c; -const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c; -const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c; -const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c; -const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c; -const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c; -const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c; -const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c; -const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c; -const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c; -INSTANTIATE_TEST_CASE_P( - C, VP9VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_c, 0), - make_tuple(2, 3, variance4x8_c, 0), - make_tuple(3, 2, variance8x4_c, 0), - make_tuple(3, 3, variance8x8_c, 0), - make_tuple(3, 4, variance8x16_c, 0), - make_tuple(4, 3, variance16x8_c, 0), - make_tuple(4, 4, variance16x16_c, 0), - make_tuple(4, 5, variance16x32_c, 0), - make_tuple(5, 4, variance32x16_c, 0), - make_tuple(5, 5, variance32x32_c, 0), - make_tuple(5, 6, variance32x64_c, 0), - make_tuple(6, 5, variance64x32_c, 0), - make_tuple(6, 6, variance64x64_c, 0))); -#if CONFIG_VP9_HIGHBITDEPTH -const vp9_variance_fn_t highbd_10_variance4x4_c = vp9_highbd_10_variance4x4_c; -const vp9_variance_fn_t highbd_10_variance4x8_c = vp9_highbd_10_variance4x8_c; -const vp9_variance_fn_t highbd_10_variance8x4_c = vp9_highbd_10_variance8x4_c; -const vp9_variance_fn_t highbd_10_variance8x8_c = vp9_highbd_10_variance8x8_c; -const vp9_variance_fn_t highbd_10_variance8x16_c = vp9_highbd_10_variance8x16_c; -const vp9_variance_fn_t highbd_10_variance16x8_c = vp9_highbd_10_variance16x8_c; -const vp9_variance_fn_t highbd_10_variance16x16_c = - vp9_highbd_10_variance16x16_c; -const vp9_variance_fn_t highbd_10_variance16x32_c = - vp9_highbd_10_variance16x32_c; -const vp9_variance_fn_t highbd_10_variance32x16_c = - vp9_highbd_10_variance32x16_c; -const vp9_variance_fn_t highbd_10_variance32x32_c = - vp9_highbd_10_variance32x32_c; -const vp9_variance_fn_t highbd_10_variance32x64_c = - vp9_highbd_10_variance32x64_c; -const vp9_variance_fn_t highbd_10_variance64x32_c = - vp9_highbd_10_variance64x32_c; -const vp9_variance_fn_t highbd_10_variance64x64_c = - vp9_highbd_10_variance64x64_c; -const vp9_variance_fn_t highbd_12_variance4x4_c = vp9_highbd_12_variance4x4_c; -const vp9_variance_fn_t highbd_12_variance4x8_c = vp9_highbd_12_variance4x8_c; -const vp9_variance_fn_t highbd_12_variance8x4_c = vp9_highbd_12_variance8x4_c; -const vp9_variance_fn_t highbd_12_variance8x8_c = vp9_highbd_12_variance8x8_c; -const vp9_variance_fn_t highbd_12_variance8x16_c = vp9_highbd_12_variance8x16_c; -const vp9_variance_fn_t highbd_12_variance16x8_c = vp9_highbd_12_variance16x8_c; -const vp9_variance_fn_t highbd_12_variance16x16_c = - vp9_highbd_12_variance16x16_c; -const vp9_variance_fn_t highbd_12_variance16x32_c = - vp9_highbd_12_variance16x32_c; -const vp9_variance_fn_t highbd_12_variance32x16_c = - vp9_highbd_12_variance32x16_c; -const vp9_variance_fn_t highbd_12_variance32x32_c = - vp9_highbd_12_variance32x32_c; -const vp9_variance_fn_t highbd_12_variance32x64_c = - vp9_highbd_12_variance32x64_c; -const vp9_variance_fn_t highbd_12_variance64x32_c = - vp9_highbd_12_variance64x32_c; -const vp9_variance_fn_t highbd_12_variance64x64_c = - vp9_highbd_12_variance64x64_c; -const vp9_variance_fn_t highbd_variance4x4_c = vp9_highbd_variance4x4_c; -const vp9_variance_fn_t highbd_variance4x8_c = vp9_highbd_variance4x8_c; -const vp9_variance_fn_t highbd_variance8x4_c = vp9_highbd_variance8x4_c; -const vp9_variance_fn_t highbd_variance8x8_c = vp9_highbd_variance8x8_c; -const vp9_variance_fn_t highbd_variance8x16_c = vp9_highbd_variance8x16_c; -const vp9_variance_fn_t highbd_variance16x8_c = vp9_highbd_variance16x8_c; -const vp9_variance_fn_t highbd_variance16x16_c = vp9_highbd_variance16x16_c; -const vp9_variance_fn_t highbd_variance16x32_c = vp9_highbd_variance16x32_c; -const vp9_variance_fn_t highbd_variance32x16_c = vp9_highbd_variance32x16_c; -const vp9_variance_fn_t highbd_variance32x32_c = vp9_highbd_variance32x32_c; -const vp9_variance_fn_t highbd_variance32x64_c = vp9_highbd_variance32x64_c; -const vp9_variance_fn_t highbd_variance64x32_c = vp9_highbd_variance64x32_c; -const vp9_variance_fn_t highbd_variance64x64_c = vp9_highbd_variance64x64_c; -INSTANTIATE_TEST_CASE_P( - C, VP9VarianceHighTest, - ::testing::Values(make_tuple(2, 2, highbd_10_variance4x4_c, 10), - make_tuple(2, 3, highbd_10_variance4x8_c, 10), - make_tuple(3, 2, highbd_10_variance8x4_c, 10), - make_tuple(3, 3, highbd_10_variance8x8_c, 10), - make_tuple(3, 4, highbd_10_variance8x16_c, 10), - make_tuple(4, 3, highbd_10_variance16x8_c, 10), - make_tuple(4, 4, highbd_10_variance16x16_c, 10), - make_tuple(4, 5, highbd_10_variance16x32_c, 10), - make_tuple(5, 4, highbd_10_variance32x16_c, 10), - make_tuple(5, 5, highbd_10_variance32x32_c, 10), - make_tuple(5, 6, highbd_10_variance32x64_c, 10), - make_tuple(6, 5, highbd_10_variance64x32_c, 10), - make_tuple(6, 6, highbd_10_variance64x64_c, 10), - make_tuple(2, 2, highbd_12_variance4x4_c, 12), - make_tuple(2, 3, highbd_12_variance4x8_c, 12), - make_tuple(3, 2, highbd_12_variance8x4_c, 12), - make_tuple(3, 3, highbd_12_variance8x8_c, 12), - make_tuple(3, 4, highbd_12_variance8x16_c, 12), - make_tuple(4, 3, highbd_12_variance16x8_c, 12), - make_tuple(4, 4, highbd_12_variance16x16_c, 12), - make_tuple(4, 5, highbd_12_variance16x32_c, 12), - make_tuple(5, 4, highbd_12_variance32x16_c, 12), - make_tuple(5, 5, highbd_12_variance32x32_c, 12), - make_tuple(5, 6, highbd_12_variance32x64_c, 12), - make_tuple(6, 5, highbd_12_variance64x32_c, 12), - make_tuple(6, 6, highbd_12_variance64x64_c, 12), - make_tuple(2, 2, highbd_variance4x4_c, 8), - make_tuple(2, 3, highbd_variance4x8_c, 8), - make_tuple(3, 2, highbd_variance8x4_c, 8), - make_tuple(3, 3, highbd_variance8x8_c, 8), - make_tuple(3, 4, highbd_variance8x16_c, 8), - make_tuple(4, 3, highbd_variance16x8_c, 8), - make_tuple(4, 4, highbd_variance16x16_c, 8), - make_tuple(4, 5, highbd_variance16x32_c, 8), - make_tuple(5, 4, highbd_variance32x16_c, 8), - make_tuple(5, 5, highbd_variance32x32_c, 8), - make_tuple(5, 6, highbd_variance32x64_c, 8), - make_tuple(6, 5, highbd_variance64x32_c, 8), - make_tuple(6, 6, highbd_variance64x64_c, 8))); -#endif // CONFIG_VP9_HIGHBITDEPTH const vp9_subpixvariance_fn_t subpel_variance4x4_c = vp9_sub_pixel_variance4x4_c; const vp9_subpixvariance_fn_t subpel_variance4x8_c = @@ -1377,40 +1423,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8), make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_VP9_ENCODER +#if CONFIG_VP9_ENCODER #if HAVE_SSE2 #if CONFIG_USE_X86INC -INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, - ::testing::Values(vp9_get_mb_ss_sse2)); - -const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; -const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; -const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; -const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2; -const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2; -const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2; -const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2; -const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2; -const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2; -const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2; -const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2; -const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2; -const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2; -INSTANTIATE_TEST_CASE_P( - SSE2, VP9VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_sse2, 0), - make_tuple(2, 3, variance4x8_sse2, 0), - make_tuple(3, 2, variance8x4_sse2, 0), - make_tuple(3, 3, variance8x8_sse2, 0), - make_tuple(3, 4, variance8x16_sse2, 0), - make_tuple(4, 3, variance16x8_sse2, 0), - make_tuple(4, 4, variance16x16_sse2, 0), - make_tuple(4, 5, variance16x32_sse2, 0), - make_tuple(5, 4, variance32x16_sse2, 0), - make_tuple(5, 5, variance32x32_sse2, 0), - make_tuple(5, 6, variance32x64_sse2, 0), - make_tuple(6, 5, variance64x32_sse2, 0), - make_tuple(6, 6, variance64x64_sse2, 0))); const vp9_subpixvariance_fn_t subpel_variance4x4_sse = vp9_sub_pixel_variance4x4_sse; const vp9_subpixvariance_fn_t subpel_variance4x8_sse = @@ -1494,96 +1511,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0), make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0))); #if CONFIG_VP9_HIGHBITDEPTH -const vp9_variance_fn_t highbd_variance8x8_sse2 = vp9_highbd_variance8x8_sse2; -const vp9_variance_fn_t highbd_10_variance8x8_sse2 = - vp9_highbd_10_variance8x8_sse2; -const vp9_variance_fn_t highbd_12_variance8x8_sse2 = - vp9_highbd_12_variance8x8_sse2; -const vp9_variance_fn_t highbd_variance8x16_sse2 = vp9_highbd_variance8x16_sse2; -const vp9_variance_fn_t highbd_10_variance8x16_sse2 = - vp9_highbd_10_variance8x16_sse2; -const vp9_variance_fn_t highbd_12_variance8x16_sse2 = - vp9_highbd_12_variance8x16_sse2; -const vp9_variance_fn_t highbd_variance16x8_sse2 = - vp9_highbd_variance16x8_sse2; -const vp9_variance_fn_t highbd_10_variance16x8_sse2 = - vp9_highbd_10_variance16x8_sse2; -const vp9_variance_fn_t highbd_12_variance16x8_sse2 = - vp9_highbd_12_variance16x8_sse2; -const vp9_variance_fn_t highbd_variance16x16_sse2 = - vp9_highbd_variance16x16_sse2; -const vp9_variance_fn_t highbd_10_variance16x16_sse2 = - vp9_highbd_10_variance16x16_sse2; -const vp9_variance_fn_t highbd_12_variance16x16_sse2 = - vp9_highbd_12_variance16x16_sse2; -const vp9_variance_fn_t highbd_variance16x32_sse2 = - vp9_highbd_variance16x32_sse2; -const vp9_variance_fn_t highbd_10_variance16x32_sse2 = - vp9_highbd_10_variance16x32_sse2; -const vp9_variance_fn_t highbd_12_variance16x32_sse2 = - vp9_highbd_12_variance16x32_sse2; -const vp9_variance_fn_t highbd_variance32x16_sse2 = - vp9_highbd_variance32x16_sse2; -const vp9_variance_fn_t highbd_10_variance32x16_sse2 = - vp9_highbd_10_variance32x16_sse2; -const vp9_variance_fn_t highbd_12_variance32x16_sse2 = - vp9_highbd_12_variance32x16_sse2; -const vp9_variance_fn_t highbd_variance32x32_sse2 = - vp9_highbd_variance32x32_sse2; -const vp9_variance_fn_t highbd_10_variance32x32_sse2 = - vp9_highbd_10_variance32x32_sse2; -const vp9_variance_fn_t highbd_12_variance32x32_sse2 = - vp9_highbd_12_variance32x32_sse2; -const vp9_variance_fn_t highbd_variance32x64_sse2 = - vp9_highbd_variance32x64_sse2; -const vp9_variance_fn_t highbd_10_variance32x64_sse2 = - vp9_highbd_10_variance32x64_sse2; -const vp9_variance_fn_t highbd_12_variance32x64_sse2 = - vp9_highbd_12_variance32x64_sse2; -const vp9_variance_fn_t highbd_variance64x32_sse2 = - vp9_highbd_variance64x32_sse2; -const vp9_variance_fn_t highbd_10_variance64x32_sse2 = - vp9_highbd_10_variance64x32_sse2; -const vp9_variance_fn_t highbd_12_variance64x32_sse2 = - vp9_highbd_12_variance64x32_sse2; -const vp9_variance_fn_t highbd_variance64x64_sse2 = - vp9_highbd_variance64x64_sse2; -const vp9_variance_fn_t highbd_10_variance64x64_sse2 = - vp9_highbd_10_variance64x64_sse2; -const vp9_variance_fn_t highbd_12_variance64x64_sse2 = - vp9_highbd_12_variance64x64_sse2; -INSTANTIATE_TEST_CASE_P( - SSE2, VP9VarianceHighTest, - ::testing::Values(make_tuple(3, 3, highbd_10_variance8x8_sse2, 10), - make_tuple(3, 4, highbd_10_variance8x16_sse2, 10), - make_tuple(4, 3, highbd_10_variance16x8_sse2, 10), - make_tuple(4, 4, highbd_10_variance16x16_sse2, 10), - make_tuple(4, 5, highbd_10_variance16x32_sse2, 10), - make_tuple(5, 4, highbd_10_variance32x16_sse2, 10), - make_tuple(5, 5, highbd_10_variance32x32_sse2, 10), - make_tuple(5, 6, highbd_10_variance32x64_sse2, 10), - make_tuple(6, 5, highbd_10_variance64x32_sse2, 10), - make_tuple(6, 6, highbd_10_variance64x64_sse2, 10), - make_tuple(3, 3, highbd_12_variance8x8_sse2, 12), - make_tuple(3, 4, highbd_12_variance8x16_sse2, 12), - make_tuple(4, 3, highbd_12_variance16x8_sse2, 12), - make_tuple(4, 4, highbd_12_variance16x16_sse2, 12), - make_tuple(4, 5, highbd_12_variance16x32_sse2, 12), - make_tuple(5, 4, highbd_12_variance32x16_sse2, 12), - make_tuple(5, 5, highbd_12_variance32x32_sse2, 12), - make_tuple(5, 6, highbd_12_variance32x64_sse2, 12), - make_tuple(6, 5, highbd_12_variance64x32_sse2, 12), - make_tuple(6, 6, highbd_12_variance64x64_sse2, 12), - make_tuple(3, 3, highbd_variance8x8_sse2, 8), - make_tuple(3, 4, highbd_variance8x16_sse2, 8), - make_tuple(4, 3, highbd_variance16x8_sse2, 8), - make_tuple(4, 4, highbd_variance16x16_sse2, 8), - make_tuple(4, 5, highbd_variance16x32_sse2, 8), - make_tuple(5, 4, highbd_variance32x16_sse2, 8), - make_tuple(5, 5, highbd_variance32x32_sse2, 8), - make_tuple(5, 6, highbd_variance32x64_sse2, 8), - make_tuple(6, 5, highbd_variance64x32_sse2, 8), - make_tuple(6, 6, highbd_variance64x64_sse2, 8))); const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_sse2 = vp9_highbd_sub_pixel_variance8x4_sse2; const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_sse2 = @@ -1790,6 +1717,9 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_USE_X86INC #endif // HAVE_SSE2 +#endif // CONFIG_VP9_ENCODER + +#if CONFIG_VP9_ENCODER #if HAVE_SSSE3 #if CONFIG_USE_X86INC @@ -1877,22 +1807,27 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0))); #endif // CONFIG_USE_X86INC #endif // HAVE_SSSE3 +#endif // CONFIG_VP9_ENCODER #if HAVE_AVX2 - -const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2; -const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2; -const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2; -const vp9_variance_fn_t variance64x32_avx2 = vp9_variance64x32_avx2; -const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2; +const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2; +INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_avx2))); + +const VarianceMxNFunc variance64x64_avx2 = vpx_variance64x64_avx2; +const VarianceMxNFunc variance64x32_avx2 = vpx_variance64x32_avx2; +const VarianceMxNFunc variance32x32_avx2 = vpx_variance32x32_avx2; +const VarianceMxNFunc variance32x16_avx2 = vpx_variance32x16_avx2; +const VarianceMxNFunc variance16x16_avx2 = vpx_variance16x16_avx2; INSTANTIATE_TEST_CASE_P( - AVX2, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance16x16_avx2, 0), - make_tuple(5, 4, variance32x16_avx2, 0), - make_tuple(5, 5, variance32x32_avx2, 0), + AVX2, VpxVarianceTest, + ::testing::Values(make_tuple(6, 6, variance64x64_avx2, 0), make_tuple(6, 5, variance64x32_avx2, 0), - make_tuple(6, 6, variance64x64_avx2, 0))); + make_tuple(5, 5, variance32x32_avx2, 0), + make_tuple(5, 4, variance32x16_avx2, 0), + make_tuple(4, 4, variance16x16_avx2, 0))); +#if CONFIG_VP9_ENCODER const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 = vp9_sub_pixel_variance32x32_avx2; const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 = @@ -1910,23 +1845,38 @@ INSTANTIATE_TEST_CASE_P( AVX2, VP9SubpelAvgVarianceTest, ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0), make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0))); +#endif // CONFIG_VP9_ENCODER #endif // HAVE_AVX2 + #if HAVE_NEON -const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon; -const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; -const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon; -const vp9_variance_fn_t variance32x64_neon = vp9_variance32x64_neon; -const vp9_variance_fn_t variance64x32_neon = vp9_variance64x32_neon; -const vp9_variance_fn_t variance64x64_neon = vp9_variance64x64_neon; +const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon; +INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest, + ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon))); + +const VarianceMxNFunc mse16x16_neon = vpx_mse16x16_neon; +INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_neon))); + +const VarianceMxNFunc variance64x64_neon = vpx_variance64x64_neon; +const VarianceMxNFunc variance64x32_neon = vpx_variance64x32_neon; +const VarianceMxNFunc variance32x64_neon = vpx_variance32x64_neon; +const VarianceMxNFunc variance32x32_neon = vpx_variance32x32_neon; +const VarianceMxNFunc variance16x16_neon = vpx_variance16x16_neon; +const VarianceMxNFunc variance16x8_neon = vpx_variance16x8_neon; +const VarianceMxNFunc variance8x16_neon = vpx_variance8x16_neon; +const VarianceMxNFunc variance8x8_neon = vpx_variance8x8_neon; INSTANTIATE_TEST_CASE_P( - NEON, VP9VarianceTest, - ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0), - make_tuple(4, 4, variance16x16_neon, 0), - make_tuple(5, 5, variance32x32_neon, 0), - make_tuple(5, 6, variance32x64_neon, 0), + NEON, VpxVarianceTest, + ::testing::Values(make_tuple(6, 6, variance64x64_neon, 0), make_tuple(6, 5, variance64x32_neon, 0), - make_tuple(6, 6, variance64x64_neon, 0))); + make_tuple(5, 6, variance32x64_neon, 0), + make_tuple(5, 5, variance32x32_neon, 0), + make_tuple(4, 4, variance16x16_neon, 0), + make_tuple(4, 3, variance16x8_neon, 0), + make_tuple(3, 4, variance8x16_neon, 0), + make_tuple(3, 3, variance8x8_neon, 0))); +#if CONFIG_VP9_ENCODER const vp9_subpixvariance_fn_t subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon; const vp9_subpixvariance_fn_t subpel_variance16x16_neon = @@ -1941,8 +1891,19 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, subpel_variance16x16_neon, 0), make_tuple(5, 5, subpel_variance32x32_neon, 0), make_tuple(6, 6, subpel_variance64x64_neon, 0))); -#endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER +#endif // HAVE_NEON -} // namespace vp9 +#if HAVE_MEDIA +const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media; +INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_media))); + +const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media; +const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media; +INSTANTIATE_TEST_CASE_P( + MEDIA, VpxVarianceTest, + ::testing::Values(make_tuple(4, 4, variance16x16_media, 0), + make_tuple(3, 3, variance8x8_media, 0))); +#endif // HAVE_MEDIA } // namespace diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc index d7ba1b024..ac19c2e3d 100644 --- a/test/vp9_error_block_test.cc +++ b/test/vp9_error_block_test.cc @@ -21,6 +21,7 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_entropy.h" +#include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 2d910466d..943c00b87 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -21,6 +21,8 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" +#include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; diff --git a/tools_common.h b/tools_common.h index a87e814c1..aa7f02599 100644 --- a/tools_common.h +++ b/tools_common.h @@ -16,6 +16,7 @@ #include "vpx/vpx_codec.h" #include "vpx/vpx_image.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/msvc.h" #if CONFIG_ENCODERS #include "./y4minput.h" @@ -34,7 +35,6 @@ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) #include <io.h> /* NOLINT */ -#define snprintf _snprintf #define isatty _isatty #define fileno _fileno #else diff --git a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm deleted file mode 100644 index 39919579f..000000000 --- a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm +++ /dev/null @@ -1,154 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_variance16x16_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance16x16_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r5, [r2, #0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r5, [r2, #4] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r5, [r2, #8] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r5, [r2, #12] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - - END - diff --git a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm deleted file mode 100644 index 915ee4993..000000000 --- a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm +++ /dev/null @@ -1,101 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_variance8x8_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance8x8_armv6| PROC - - push {r4-r10, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #8 ; set loop counter to 8 (=block height) - mov r4, #0 ; initialize sum = 0 - mov r5, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r6, [r0, #0x0] ; load 4 src pixels - ldr r7, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r6, r7 ; calculate difference - pld [r0, r1, lsl #1] - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r0, #0x4] ; load 4 src pixels - ldr r7, [r2, #0x4] ; load 4 ref pixels - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r6, r7 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 ; next row - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r8, [sp, #32] ; get address of sse - mul r1, r4, r4 ; sum * sum - str r5, [r8] ; store sse - sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) - - pop {r4-r10, pc} - - ENDP - - END diff --git a/vp8/common/arm/neon/variance_neon.c b/vp8/common/arm/neon/variance_neon.c deleted file mode 100644 index 1b1979073..000000000 --- a/vp8/common/arm/neon/variance_neon.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> -#include "vpx_ports/mem.h" - -unsigned int vp8_variance16x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance16x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { // variance16x8_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance8x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // variance8x16_neon_loop - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance8x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 2; i++) { // variance8x8_neon_loop - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c index 467a50942..0f293f03d 100644 --- a/vp8/common/arm/variance_arm.c +++ b/vp8/common/arm/variance_arm.c @@ -9,10 +9,14 @@ */ #include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vp8/common/variance.h" #include "vp8/common/filter.h" +// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder +#if CONFIG_VP8_ENCODER + #if HAVE_MEDIA #include "vp8/common/arm/bilinearfilter_arm.h" @@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6 vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, 8, 8, 8, VFilter); - return vp8_variance8x8_armv6(second_pass, 8, dst_ptr, - dst_pixels_per_line, sse); + return vpx_variance8x8_media(second_pass, 8, dst_ptr, + dst_pixels_per_line, sse); } unsigned int vp8_sub_pixel_variance16x16_armv6 @@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6 vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, 16, 16, 16, VFilter); - var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr, - dst_pixels_per_line, sse); + var = vpx_variance16x16_media(second_pass, 16, dst_ptr, + dst_pixels_per_line, sse); } return var; } -#endif /* HAVE_MEDIA */ +#endif // HAVE_MEDIA #if HAVE_NEON @@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); } -#endif +#endif // HAVE_NEON +#endif // CONFIG_VP8_ENCODER diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c index d12dea193..5c0680f42 100644 --- a/vp8/common/mfqe.c +++ b/vp8/common/mfqe.c @@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block if (blksize == 16) { - actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; - act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; + actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; + act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; #ifdef USE_SSD - vp8_variance16x16(y, y_stride, yd, yd_stride, &sse); + vpx_variance16x16(y, y_stride, yd, yd_stride, &sse); sad = (sse + 128)>>8; - vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse); + vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 32)>>6; - vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse); + vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 32)>>6; #else sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; @@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block } else /* if (blksize == 8) */ { - actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; - act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; + actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; + act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; #ifdef USE_SSD - vp8_variance8x8(y, y_stride, yd, yd_stride, &sse); + vpx_variance8x8(y, y_stride, yd, yd_stride, &sse); sad = (sse + 32)>>6; - vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse); + vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 8)>>4; - vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse); + vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 8)>>4; #else sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6; diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index c9f14d58a..4b820338e 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -237,31 +237,6 @@ specialize qw/vp8_bilinear_predict4x4 mmx media neon/; $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6; # -# Whole-pixel Variance -# -add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance4x4 mmx sse2/; -$vp8_variance4x4_sse2=vp8_variance4x4_wmt; - -add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance8x8 mmx sse2 media neon/; -$vp8_variance8x8_sse2=vp8_variance8x8_wmt; -$vp8_variance8x8_media=vp8_variance8x8_armv6; - -add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance8x16 mmx sse2 neon/; -$vp8_variance8x16_sse2=vp8_variance8x16_wmt; - -add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance16x8 mmx sse2 neon/; -$vp8_variance16x8_sse2=vp8_variance16x8_wmt; - -add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance16x16 mmx sse2 media neon/; -$vp8_variance16x16_sse2=vp8_variance16x16_wmt; -$vp8_variance16x16_media=vp8_variance16x16_armv6; - -# # Sub-pixel Variance # add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; @@ -309,26 +284,12 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6; if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") { # -# Sum of squares (vector) -# -add_proto qw/unsigned int vp8_get_mb_ss/, "const short *"; -specialize qw/vp8_get_mb_ss mmx sse2/; - -# # SSE (Sum Squared Error) # add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/; $vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt; -add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_mse16x16 mmx sse2 media neon/; -$vp8_mse16x16_sse2=vp8_mse16x16_wmt; -$vp8_mse16x16_media=vp8_mse16x16_armv6; - -add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; -specialize qw/vp8_get4x4sse_cs mmx neon/; - # # Block copy # diff --git a/vp8/common/variance.h b/vp8/common/variance.h index b62cc6136..c6c9f41bf 100644 --- a/vp8/common/variance.h +++ b/vp8/common/variance.h @@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)( const unsigned char *ref_array, int ref_stride, unsigned int *sad_array); + typedef void (*vpx_sad_multi_d_fn_t) ( const unsigned char *src_ptr, @@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t) unsigned int *sad_array ); -typedef unsigned int (*vp8_variance_fn_t) +typedef unsigned int (*vpx_variance_fn_t) ( const unsigned char *src_ptr, int source_stride, @@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t) unsigned int *sse ); -typedef void (*vp8_ssimpf_fn_t) - ( - unsigned char *s, - int sp, - unsigned char *r, - int rp, - unsigned long *sum_s, - unsigned long *sum_r, - unsigned long *sum_sq_s, - unsigned long *sum_sq_r, - unsigned long *sum_sxr - ); - -typedef unsigned int (*vp8_getmbss_fn_t)(const short *); - -typedef unsigned int (*vp8_get16x16prederror_fn_t) - ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int ref_stride - ); - typedef struct variance_vtable { vpx_sad_fn_t sdf; - vp8_variance_fn_t vf; + vpx_variance_fn_t vf; vp8_subpixvariance_fn_t svf; - vp8_variance_fn_t svf_halfpix_h; - vp8_variance_fn_t svf_halfpix_v; - vp8_variance_fn_t svf_halfpix_hv; + vpx_variance_fn_t svf_halfpix_h; + vpx_variance_fn_t svf_halfpix_v; + vpx_variance_fn_t svf_halfpix_hv; vpx_sad_multi_fn_t sdx3f; vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; diff --git a/vp8/common/variance_c.c b/vp8/common/variance_c.c index dc95bfeb3..79d1ca00c 100644 --- a/vp8/common/variance_c.c +++ b/vp8/common/variance_c.c @@ -8,44 +8,34 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "./vp8_rtcd.h" #include "filter.h" #include "variance.h" - -unsigned int vp8_get_mb_ss_c -( - const short *src_ptr -) -{ - unsigned int i = 0, sum = 0; - - do - { - sum += (src_ptr[i] * src_ptr[i]); - i++; - } - while (i < 256); - - return sum; +/* This is a bad idea. + * ctz = count trailing zeros */ +static int ctz(int a) { + int b = 0; + while (a != 1) { + a >>= 1; + b++; + } + return b; } - -static void variance( +static unsigned int variance( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, int w, int h, - unsigned int *sse, - int *sum) + unsigned int *sse) { int i, j; - int diff; + int diff, sum; - *sum = 0; + sum = 0; *sse = 0; for (i = 0; i < h; i++) @@ -53,114 +43,17 @@ static void variance( for (j = 0; j < w; j++) { diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; + sum += diff; *sse += diff * diff; } src_ptr += source_stride; ref_ptr += recon_stride; } -} - -unsigned int vp8_variance16x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp8_variance8x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); + return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h))))); } -unsigned int vp8_variance16x8_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); -} - - -unsigned int vp8_variance8x8_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); -} - -unsigned int vp8_variance4x4_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); -} - - -unsigned int vp8_mse16x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return var; -} - - /**************************************************************************** * * ROUTINE : filter_block2d_bil_first_pass @@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c /* Now filter Verticaly */ var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter); - return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse); } @@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); - return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse); } unsigned int vp8_sub_pixel_variance16x16_c @@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); - return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse); } @@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); - return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse); } unsigned int vp8_sub_pixel_variance8x16_c @@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter); - return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse); } diff --git a/vp8/common/x86/variance_impl_mmx.asm b/vp8/common/x86/variance_impl_mmx.asm index 7d5e6810b..97f25275d 100644 --- a/vp8/common/x86/variance_impl_mmx.asm +++ b/vp8/common/x86/variance_impl_mmx.asm @@ -11,504 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -;unsigned int vp8_get_mb_ss_mmx( short *src_ptr ) -global sym(vp8_get_mb_ss_mmx) PRIVATE -sym(vp8_get_mb_ss_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 8 - ; end prolog - - mov rax, arg(0) ;src_ptr - mov rcx, 16 - pxor mm4, mm4 - -.NEXTROW: - movq mm0, [rax] - movq mm1, [rax+8] - movq mm2, [rax+16] - movq mm3, [rax+24] - pmaddwd mm0, mm0 - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - pmaddwd mm3, mm3 - - paddd mm4, mm0 - paddd mm4, mm1 - paddd mm4, mm2 - paddd mm4, mm3 - - add rax, 32 - dec rcx - ja .NEXTROW - movq QWORD PTR [rsp], mm4 - - ;return sum[0]+sum[1]; - movsxd rax, dword ptr [rsp] - movsxd rcx, dword ptr [rsp+4] - add rax, rcx - - - ; begin epilog - add rsp, 8 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_get8x8var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp8_get8x8var_mmx) PRIVATE -sym(vp8_get8x8var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm1, [rbx] ; Copy eight bytes to mm1 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - - ; Row 2 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 3 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 4 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 5 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - ; movq mm4, [rbx + rdx] - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 6 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 7 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 8 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Now accumulate the final results. - movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp8_get4x4var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp8_get4x4var_mmx) PRIVATE -sym(vp8_get4x4var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movd mm0, [rax] ; Copy four bytes to mm0 - movd mm1, [rbx] ; Copy four bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy four bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - - ; Row 2 - movd mm0, [rax] ; Copy four bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy four bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy four bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher precision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy four bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy four bytes to mm0 - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - - ; Now accumulate the final results. - movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp8_get4x4sse_cs_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride -;) -global sym(vp8_get4x4sse_cs_mmx) PRIVATE -sym(vp8_get4x4sse_cs_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - push rbx - ; end prolog - - - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - ; Row 1 - movd mm0, [rax] ; Copy eight bytes to mm0 - movd mm1, [rbx] ; Copy eight bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 2 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm1, mm6 - punpcklbw mm0, mm6 ; unpack to higher prrcision - psubsw mm0, mm1 ; A-B (low order) to MM0 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - movq mm0, mm7 ; - psrlq mm7, 32 - - paddd mm0, mm7 - movq rax, mm0 - - - ; begin epilog - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - %define mmx_filter_shift 7 ;void vp8_filter_block2d_bil4x4_var_mmx diff --git a/vp8/common/x86/variance_impl_sse2.asm b/vp8/common/x86/variance_impl_sse2.asm index 761433c11..26de5e860 100644 --- a/vp8/common/x86/variance_impl_sse2.asm +++ b/vp8/common/x86/variance_impl_sse2.asm @@ -13,393 +13,6 @@ %define xmm_filter_shift 7 -;unsigned int vp8_get_mb_ss_sse2 -;( -; short *src_ptr -;) -global sym(vp8_get_mb_ss_sse2) PRIVATE -sym(vp8_get_mb_ss_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 1 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - mov rax, arg(0) ;[src_ptr] - mov rcx, 8 - pxor xmm4, xmm4 - -.NEXTROW: - movdqa xmm0, [rax] - movdqa xmm1, [rax+16] - movdqa xmm2, [rax+32] - movdqa xmm3, [rax+48] - pmaddwd xmm0, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - - paddd xmm0, xmm1 - paddd xmm2, xmm3 - paddd xmm4, xmm0 - paddd xmm4, xmm2 - - add rax, 0x40 - dec rcx - ja .NEXTROW - - movdqa xmm3,xmm4 - psrldq xmm4,8 - paddd xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,4 - paddd xmm4,xmm3 - movq rax,xmm4 - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_get16x16var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp8_get16x16var_sse2) PRIVATE -sym(vp8_get16x16var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - ; Prefetch data - lea rcx, [rax+rax*2] - prefetcht0 [rsi] - prefetcht0 [rsi+rax] - prefetcht0 [rsi+rax*2] - prefetcht0 [rsi+rcx] - lea rbx, [rsi+rax*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rax] - prefetcht0 [rbx+rax*2] - prefetcht0 [rbx+rcx] - - lea rcx, [rdx+rdx*2] - prefetcht0 [rdi] - prefetcht0 [rdi+rdx] - prefetcht0 [rdi+rdx*2] - prefetcht0 [rdi+rcx] - lea rbx, [rdi+rdx*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rdx] - prefetcht0 [rbx+rdx*2] - prefetcht0 [rbx+rcx] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - pxor xmm6, xmm6 ; clear xmm6 for accumulating sse - mov rcx, 16 - -.var16loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rdi] - - prefetcht0 [rsi+rax*8] - prefetcht0 [rdi+rdx*8] - - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - - - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - - punpcklbw xmm2, xmm0 - punpckhbw xmm4, xmm0 - - - psubw xmm1, xmm2 - psubw xmm3, xmm4 - - paddw xmm7, xmm1 - pmaddwd xmm1, xmm1 - - paddw xmm7, xmm3 - pmaddwd xmm3, xmm3 - - paddd xmm6, xmm1 - paddd xmm6, xmm3 - - add rsi, rax - add rdi, rdx - - sub rcx, 1 - jnz .var16loop - - - movdqa xmm1, xmm6 - pxor xmm6, xmm6 - - pxor xmm5, xmm5 - punpcklwd xmm6, xmm7 - - punpckhwd xmm5, xmm7 - psrad xmm5, 16 - - psrad xmm6, 16 - paddd xmm6, xmm5 - - movdqa xmm2, xmm1 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddd xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddd xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movd DWORD PTR [rax], xmm7 - movd DWORD PTR [rdi], xmm1 - - - ; begin epilog - pop rdi - pop rsi - pop rbx - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - - -;unsigned int vp8_get8x8var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp8_get8x8var_sse2) PRIVATE -sym(vp8_get8x8var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - movq xmm1, QWORD PTR [rsi] - movq xmm2, QWORD PTR [rdi] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - psubsw xmm1, xmm2 - paddw xmm7, xmm1 - - pmaddwd xmm1, xmm1 - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movq xmm2, QWORD PTR[rsi + rax * 2] - movq xmm3, QWORD PTR[rdi + rdx * 2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movdqa xmm6, xmm7 - punpcklwd xmm6, xmm0 - - punpckhwd xmm7, xmm0 - movdqa xmm2, xmm1 - - paddw xmm6, xmm7 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddw xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddw xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movq rdx, xmm7 - movsx rcx, dx - - mov dword ptr [rax], ecx - movd DWORD PTR [rdi], xmm1 - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;void vp8_filter_block2d_bil_var_sse2 ;( ; unsigned char *ref_ptr, diff --git a/vp8/common/x86/variance_mmx.c b/vp8/common/x86/variance_mmx.c index 10a58b822..25ae5767f 100644 --- a/vp8/common/x86/variance_mmx.c +++ b/vp8/common/x86/variance_mmx.c @@ -35,25 +35,6 @@ extern void filter_block1d_v6_mmx short *filter ); -extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr); -extern unsigned int vp8_get8x8var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -extern unsigned int vp8_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); extern void vp8_filter_block2d_bil4x4_var_mmx ( const unsigned char *ref_ptr, @@ -78,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx unsigned int *sumsquared ); - -unsigned int vp8_variance4x4_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); - -} - -unsigned int vp8_variance8x8_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - - return (var - (((unsigned int)avg * avg) >> 6)); - -} - -unsigned int vp8_mse16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - *sse = var; - return var; -} - - -unsigned int vp8_variance16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp8_variance16x8_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - - -unsigned int vp8_variance8x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - - return (var - (((unsigned int)avg * avg) >> 7)); - -} - - unsigned int vp8_sub_pixel_variance4x4_mmx ( const unsigned char *src_ptr, diff --git a/vp8/common/x86/variance_sse2.c b/vp8/common/x86/variance_sse2.c index 6c6539d8e..f6dfb2787 100644 --- a/vp8/common/x86/variance_sse2.c +++ b/vp8/common/x86/variance_sse2.c @@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx unsigned int *sumsquared ); -extern unsigned int vp8_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -unsigned int vp8_get_mb_ss_sse2 -( - const short *src_ptr -); -unsigned int vp8_get16x16var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -unsigned int vp8_get8x8var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); void vp8_filter_block2d_bil_var_sse2 ( const unsigned char *ref_ptr, @@ -136,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2 unsigned int *sumsquared ); -unsigned int vp8_variance4x4_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); - -} - -unsigned int vp8_variance8x8_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); - -} - - -unsigned int vp8_variance16x16_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0; - int sum0; - - - vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - *sse = sse0; - return (sse0 - (((unsigned int)sum0 * sum0) >> 8)); -} -unsigned int vp8_mse16x16_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - - unsigned int sse0; - int sum0; - vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - *sse = sse0; - return sse0; - -} - - -unsigned int vp8_variance16x8_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - -unsigned int vp8_variance8x16_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - unsigned int vp8_sub_pixel_variance4x4_wmt ( const unsigned char *src_ptr, diff --git a/vp8/common/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c index d8c8da540..2a0df640a 100644 --- a/vp8/common/x86/variance_ssse3.c +++ b/vp8/common/x86/variance_ssse3.c @@ -13,15 +13,6 @@ #include "vp8/common/variance.h" #include "vpx_ports/mem.h" -extern unsigned int vp8_get16x16var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); extern void vp8_half_horiz_vert_variance16x_h_sse2 ( const unsigned char *ref_ptr, diff --git a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm deleted file mode 100644 index 000805d4f..000000000 --- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm +++ /dev/null @@ -1,138 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_mse16x16_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -; -;note: Based on vp8_variance16x16_armv6. In this function, sum is never used. -; So, we can remove this part of calculation. - -|vp8_mse16x16_armv6| PROC - - push {r4-r9, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #16 ; set loop counter to 16 (=block height) - mov r4, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r5, [r0, #0x0] ; load 4 src pixels - ldr r6, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r5, r6 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0x4] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r2, #0x4] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - ldr r5, [r0, #0x8] ; load 4 src pixels - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r6, [r2, #0x8] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0xc] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r6, [r2, #0xc] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - subs r12, r12, #1 ; next row - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r1, [sp, #28] ; get address of sse - mov r0, r4 ; return sse - str r4, [r1] ; store sse - - pop {r4-r9, pc} - - ENDP - - END diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c b/vp8/encoder/arm/neon/vp8_mse16x16_neon.c deleted file mode 100644 index f806809df..000000000 --- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -unsigned int vp8_mse16x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64x1_t d0s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - q7s32 = vdupq_n_s32(0); - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // mse16x16_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q7s32 = vmlal_s16(q7s32, d22s16, d22s16); - q8s32 = vmlal_s16(q8s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q7s32 = vmlal_s16(q7s32, d26s16, d26s16); - q8s32 = vmlal_s16(q8s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q10s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q10s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} - -unsigned int vp8_get4x4sse_cs_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride) { - int16x4_t d22s16, d24s16, d26s16, d28s16; - int64x1_t d0s64; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); - d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); - d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); - d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); - - q7s32 = vmull_s16(d22s16, d22s16); - q8s32 = vmull_s16(d24s16, d24s16); - q9s32 = vmull_s16(d26s16, d26s16); - q10s32 = vmull_s16(d28s16, d28s16); - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q9s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q9s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 378e902c6..d381d8ddf 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -11,6 +11,7 @@ #include "vpx_config.h" #include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "encodemb.h" #include "encodemv.h" #include "vp8/common/common.h" @@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x ) * lambda using a non-linear combination (e.g., the smallest, or second * smallest, etc.). */ - act = vp8_variance16x16(x->src.y_buffer, + act = vpx_variance16x16(x->src.y_buffer, x->src.y_stride, VP8_VAR_OFFS, 0, &sse); act = act<<4; diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index cfa4cb927..e2de5eecb 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -11,6 +11,7 @@ #include "vpx_config.h" #include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "quantize.h" #include "vp8/common/reconintra4x4.h" #include "encodemb.h" @@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) } } - intra_pred_var = vp8_get_mb_ss(x->src_diff); + intra_pred_var = vpx_get_mb_ss(x->src_diff); return intra_pred_var; } diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index a6ff0e7a0..3deb4abb3 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -12,6 +12,7 @@ #include <limits.h> #include <stdio.h> +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "block.h" #include "onyx_int.h" @@ -422,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, /* Set up pointers for this macro block raw buffer */ raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset + d->offset); - vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride, - (unsigned int *)(raw_motion_err)); + vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride, + (unsigned int *)(raw_motion_err)); /* Set up pointers for this macro block recon buffer */ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset ); - vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride, - (unsigned int *)(best_motion_err)); + vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride, + (unsigned int *)(best_motion_err)); } static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, @@ -453,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int new_mv_mode_penalty = 256; /* override the default variance function to use MSE */ - v_fn_ptr.vf = vp8_mse16x16; + v_fn_ptr.vf = vpx_mse16x16; /* Set up pointers for this macro block recon buffer */ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c2bb23295..40e29e191 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2131,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) #endif cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16; - cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16; + cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h; cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v; @@ -2141,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8; - cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8; + cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL; @@ -2151,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d; cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16; - cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16; + cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL; @@ -2161,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d; cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8; - cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8; + cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL; @@ -2171,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d; cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4; - cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4; + cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL; @@ -2558,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride, { unsigned int sse; - vp8_mse16x16(orig + col, orig_stride, + vpx_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse); total_sse += sse; @@ -3384,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source, int index = block_index_row + (j >> 4); if (cpi->consec_zero_last[index] >= min_consec_zero_last) { unsigned int sse; - Total += vp8_mse16x16(src + j, + Total += vpx_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); @@ -3448,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) { int index = block_index_row + (j >> 4); if (cpi->consec_zero_last[index] >= min_consec_zero_last) { unsigned int sse; - const unsigned int var = vp8_variance16x16(src + j, + const unsigned int var = vpx_variance16x16(src + j, ystride, dst + j, ystride, @@ -3458,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) { // is small (to avoid effects from lighting change). if ((sse - var) < 128) { unsigned int sse2; - const unsigned int act = vp8_variance16x16(src + j, + const unsigned int act = vpx_variance16x16(src + j, ystride, const_source, 0, @@ -5993,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) for (j = 0; j < source->y_width; j += 16) { unsigned int sse; - Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); + Total += vpx_mse16x16(src + j, source->y_stride, + dst + j, dest->y_stride, &sse); } src += 16 * source->y_stride; diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 98ea5a040..053bf119a 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -11,6 +11,7 @@ #include <limits.h> #include "vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "onyx_int.h" #include "modecosts.h" #include "encodeintra.h" @@ -215,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb, } - -unsigned int vp8_get4x4sse_cs_c -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride -) -{ - int distortion = 0; - int r, c; - - for (r = 0; r < 4; r++) - { - for (c = 0; c < 4; c++) - { - int diff = src_ptr[c] - ref_ptr[c]; - distortion += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - return distortion; -} - static int get_prediction_error(BLOCK *be, BLOCKD *b) { unsigned char *sptr; @@ -249,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b) sptr = (*(be->base_src) + be->src); dptr = b->predictor; - return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16); + return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16); } @@ -1037,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, else { rate2 += rate; - distortion2 = vp8_variance16x16( + distortion2 = vpx_variance16x16( *(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); @@ -1066,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, xd->dst.y_stride, xd->predictor, 16); - distortion2 = vp8_variance16x16 + distortion2 = vpx_variance16x16 (*(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode]; @@ -1547,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) xd->dst.y_stride, xd->predictor, 16); - distortion = vp8_variance16x16 + distortion = vpx_variance16x16 (*(b->base_src), b->src_stride, xd->predictor, 16, &sse); rate = x->mbmode_cost[xd->frame_type][mode]; this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c index 890053dcf..875b37f68 100644 --- a/vp8/encoder/picklpf.c +++ b/vp8/encoder/picklpf.c @@ -9,6 +9,7 @@ */ +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp8/common/onyxc_int.h" #include "onyx_int.h" @@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, for (j = 0; j < source->y_width; j += 16) { unsigned int sse; - Total += vp8_mse16x16(src + j, source->y_stride, + Total += vpx_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); } diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 10d340880..e8796a1fc 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -1587,7 +1587,7 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { // Threshold for the average (over all macroblocks) of the pixel-sum // residual error over 16x16 block. Should add QP dependence on threshold? int thresh_pred_err_mb = (256 << 4); - int pred_err_mb = cpi->mb.prediction_error / cpi->common.MBs; + int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs); if (Q < thresh_qp && cpi->projected_frame_size > thresh_rate && pred_err_mb > thresh_pred_err_mb) { @@ -1601,7 +1601,9 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { cpi->force_maxqp = 0; return 0; } + cpi->force_maxqp = 0; return 0; } + cpi->force_maxqp = 0; return 0; } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 9ccd85eb9..17194f0d4 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -15,6 +15,7 @@ #include <assert.h> #include "vpx_config.h" #include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "tokenize.h" #include "treewriter.h" #include "onyx_int.h" @@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x) } else { - vp8_variance8x8(uptr, pre_stride, + vpx_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2); - vp8_variance8x8(vptr, pre_stride, + vpx_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1); sse2 += sse1; } @@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], if(threshold < x->encode_breakout) threshold = x->encode_breakout; - var = vp8_variance16x16 + var = vpx_variance16x16 (*(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index b4c814075..c71d592f5 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) @@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index 050030179..0b0f6a70a 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c #File list for media # encoder VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM) -VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM) #File list for neon @@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c -VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c index d0beaa720..66cf6600e 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.c +++ b/vp9/common/arm/neon/vp9_reconintra_neon.c @@ -11,463 +11,415 @@ #include <stddef.h> #include <arm_neon.h> -void vp9_v_predictor_4x4_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int i; - uint32x2_t d0u32 = vdup_n_u32(0); - (void)left; +void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint32x2_t d0u32 = vdup_n_u32(0); + (void)left; - d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += y_stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); - return; + d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); + for (i = 0; i < 4; i++, dst += y_stride) + vst1_lane_u32((uint32_t *)dst, d0u32, 0); } -void vp9_v_predictor_8x8_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int i; - uint8x8_t d0u8 = vdup_n_u8(0); - (void)left; +void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x8_t d0u8 = vdup_n_u8(0); + (void)left; - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += y_stride) - vst1_u8(dst, d0u8); - return; + d0u8 = vld1_u8(above); + for (i = 0; i < 8; i++, dst += y_stride) + vst1_u8(dst, d0u8); } -void vp9_v_predictor_16x16_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - (void)left; +void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + (void)left; - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += y_stride) - vst1q_u8(dst, q0u8); - return; + q0u8 = vld1q_u8(above); + for (i = 0; i < 16; i++, dst += y_stride) + vst1q_u8(dst, q0u8); } -void vp9_v_predictor_32x32_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)left; +void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)left; - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += y_stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - } - return; + q0u8 = vld1q_u8(above); + q1u8 = vld1q_u8(above + 16); + for (i = 0; i < 32; i++, dst += y_stride) { + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + } } -void vp9_h_predictor_4x4_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); - (void)above; +void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d1u32 = vdup_n_u32(0); + (void)above; - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); + d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - return; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); } -void vp9_h_predictor_8x8_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); - (void)above; +void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint64x1_t d1u64 = vdup_n_u64(0); + (void)above; - d1u64 = vld1_u64((const uint64_t *)left); + d1u64 = vld1_u64((const uint64_t *)left); - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); + vst1_u8(dst, d0u8); + dst += y_stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); + vst1_u8(dst, d0u8); +} + +void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, d0u8); + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); - vst1_u8(dst, d0u8); - return; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + dst += y_stride; + } } -void vp9_h_predictor_16x16_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)above; +void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + for (k = 0; k < 2; k++, left += 16) { q1u8 = vld1q_u8(left); d2u8 = vget_low_u8(q1u8); for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += y_stride; } - return; + } } -void vp9_h_predictor_32x32_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)above; +void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint16x8_t q1u16, q3u16; + int16x8_t q1s16; + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d2u32 = vdup_n_u32(0); - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - } - } - return; + d0u8 = vdup_n_u8(above[-1]); + d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); + q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); + for (i = 0; i < 4; i++, dst += y_stride) { + q1u16 = vdupq_n_u16((uint16_t)left[i]); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), + vreinterpretq_s16_u16(q3u16)); + d0u8 = vqmovun_s16(q1s16); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + } } -void vp9_tm_predictor_4x4_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); +void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint16x8_t q0u16, q3u16, q10u16; + int16x8_t q0s16; + uint16x4_t d20u16; + uint8x8_t d0u8, d2u8, d30u8; - d0u8 = vdup_n_u8(above[-1]); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += y_stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), - vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } - return; + d0u8 = vdup_n_u8(above[-1]); + d30u8 = vld1_u8(left); + d2u8 = vld1_u8(above); + q10u16 = vmovl_u8(d30u8); + q3u16 = vsubl_u8(d2u8, d0u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 1); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 2); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 3); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += y_stride; + } } -void vp9_tm_predictor_8x8_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; +void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; + uint8x16_t q0u8, q1u8; + int16x8_t q0s16, q1s16, q8s16, q11s16; + uint16x4_t d20u16; + uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - d0u8 = vdup_n_u8(above[-1]); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); + q0u8 = vdupq_n_u8(above[-1]); + q1u8 = vld1q_u8(above); + q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + for (k = 0; k < 2; k++, left += 8) { + d18u8 = vld1_u8(left); + q10u16 = vmovl_u8(d18u8); d20u16 = vget_low_u16(q10u16); for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += y_stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += y_stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += y_stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += y_stride; - } - return; -} - -void vp9_tm_predictor_16x16_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - - q0u8 = vdupq_n_u8(above[-1]); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += y_stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += y_stride; + q0u16 = vdupq_lane_u16(d20u16, 0); + q8u16 = vdupq_lane_u16(d20u16, 1); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += y_stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += y_stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += y_stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += y_stride; - } + q0u16 = vdupq_lane_u16(d20u16, 2); + q8u16 = vdupq_lane_u16(d20u16, 3); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += y_stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += y_stride; } - return; + } } -void vp9_tm_predictor_32x32_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; - uint8x16_t q0u8, q1u8, q2u8; - int16x8_t q12s16, q13s16, q14s16, q15s16; - uint16x4_t d6u16; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; +void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; + uint8x16_t q0u8, q1u8, q2u8; + int16x8_t q12s16, q13s16, q14s16, q15s16; + uint16x4_t d6u16; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; - q0u8 = vdupq_n_u8(above[-1]); - q1u8 = vld1q_u8(above); - q2u8 = vld1q_u8(above + 16); - q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); - for (k = 0; k < 4; k++, left += 8) { - d26u8 = vld1_u8(left); - q3u16 = vmovl_u8(d26u8); - d6u16 = vget_low_u16(q3u16); - for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { - q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += y_stride; + q0u8 = vdupq_n_u8(above[-1]); + q1u8 = vld1q_u8(above); + q2u8 = vld1q_u8(above + 16); + q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); + q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); + for (k = 0; k < 4; k++, left += 8) { + d26u8 = vld1_u8(left); + q3u16 = vmovl_u8(d26u8); + d6u16 = vget_low_u16(q3u16); + for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { + q0u16 = vdupq_lane_u16(d6u16, 0); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; - q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += y_stride; + q0u16 = vdupq_lane_u16(d6u16, 1); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; - q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += y_stride; + q0u16 = vdupq_lane_u16(d6u16, 2); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; - q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += y_stride; - } + q0u16 = vdupq_lane_u16(d6u16, 3); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += y_stride; } - return; + } } diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 600cb13d8..8eda491de 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -57,6 +57,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { if (cm->seg_map_array[i] == NULL) return 1; } + cm->seg_map_alloc_size = seg_map_size; // Init the index. cm->seg_map_idx = 0; @@ -118,25 +119,36 @@ void vp9_free_context_buffers(VP9_COMMON *cm) { } int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { - vp9_free_context_buffers(cm); + int new_mi_size; vp9_set_mb_mi(cm, width, height); - if (cm->alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows))) - goto fail; - - // Create the segmentation map structure and set to 0. - free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) - goto fail; + new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); + if (cm->mi_alloc_size < new_mi_size) { + cm->free_mi(cm); + if (cm->alloc_mi(cm, new_mi_size)) + goto fail; + } - cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( - 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, - sizeof(*cm->above_context)); - if (!cm->above_context) goto fail; + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) + goto fail; + } - cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc( - mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context)); - if (!cm->above_seg_context) goto fail; + if (cm->above_context_alloc_cols < cm->mi_cols) { + vpx_free(cm->above_context); + cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( + 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, + sizeof(*cm->above_context)); + if (!cm->above_context) goto fail; + + vpx_free(cm->above_seg_context); + cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc( + mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context)); + if (!cm->above_seg_context) goto fail; + cm->above_context_alloc_cols = cm->mi_cols; + } return 0; diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 097053a7d..319d34832 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -18,74 +18,28 @@ #include "vpx_scale/yv12config.h" #include "vp9/common/vp9_common_data.h" -#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_scale.h" +#include "vp9/common/vp9_seg_common.h" #ifdef __cplusplus extern "C" { #endif -#define BLOCK_SIZE_GROUPS 4 -#define SKIP_CONTEXTS 3 -#define INTER_MODE_CONTEXTS 7 - -/* Segment Feature Masks */ -#define MAX_MV_REF_CANDIDATES 2 - -#define INTRA_INTER_CONTEXTS 4 -#define COMP_INTER_CONTEXTS 5 -#define REF_CONTEXTS 5 - -typedef enum { - PLANE_TYPE_Y = 0, - PLANE_TYPE_UV = 1, - PLANE_TYPES -} PLANE_TYPE; - #define MAX_MB_PLANE 3 -typedef char ENTROPY_CONTEXT; - -static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, - ENTROPY_CONTEXT b) { - return (a != 0) + (b != 0); -} - typedef enum { KEY_FRAME = 0, INTER_FRAME = 1, FRAME_TYPES, } FRAME_TYPE; -typedef enum { - DC_PRED, // Average of above and left pixels - V_PRED, // Vertical - H_PRED, // Horizontal - D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi) - D135_PRED, // Directional 135 deg = 180 - 45 - D117_PRED, // Directional 117 deg = 180 - 63 - D153_PRED, // Directional 153 deg = 180 - 27 - D207_PRED, // Directional 207 deg = 180 + 27 - D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) - TM_PRED, // True-motion - NEARESTMV, - NEARMV, - ZEROMV, - NEWMV, - MB_MODE_COUNT -} PREDICTION_MODE; - static INLINE int is_inter_mode(PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } -#define INTRA_MODES (TM_PRED + 1) - -#define INTER_MODES (1 + NEWMV - NEARESTMV) - -#define INTER_OFFSET(mode) ((mode) - NEARESTMV) - /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ @@ -281,6 +235,27 @@ static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; } +static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + memset(pd->above_context, 0, + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]); + memset(pd->left_context, 0, + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]); + } +} + +static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi, + const MODE_INFO *above_mi, + const MODE_INFO *left_mi, + int block) { + const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); + const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); + return vp9_kf_y_mode_prob[above][left]; +} + typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index 5a9007b54..4e02630e6 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -14,8 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_prob.h" -#include "vp9/common/vp9_scan.h" #ifdef __cplusplus extern "C" { @@ -137,18 +137,6 @@ struct VP9Common; void vp9_default_coef_probs(struct VP9Common *cm); void vp9_adapt_coef_probs(struct VP9Common *cm); -static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) { - struct macroblockd_plane *const pd = &xd->plane[i]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - memset(pd->above_context, 0, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]); - memset(pd->left_context, 0, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]); - } -} - // This is the index in the scan order beyond which all coefficients for // 8x8 transform and above are in the top band. // This macro is currently unused but may be used by certain implementations @@ -185,6 +173,13 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); +typedef char ENTROPY_CONTEXT; + +static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, + ENTROPY_CONTEXT b) { + return (a != 0) + (b != 0); +} + static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; @@ -214,18 +209,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, return combine_entropy_contexts(above_ec, left_ec); } -static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx) { - const MODE_INFO *const mi = xd->mi[0]; - - if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) { - return &vp9_default_scan_orders[tx_size]; - } else { - const PREDICTION_MODE mode = get_y_mode(mi, block_idx); - return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]]; - } -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index f4e20e1af..a0619ec6f 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -11,7 +11,7 @@ #ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ #define VP9_COMMON_VP9_ENTROPYMODE_H_ -#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymv.h" @@ -19,8 +19,12 @@ extern "C" { #endif +#define BLOCK_SIZE_GROUPS 4 + #define TX_SIZE_CONTEXTS 2 +#define INTER_OFFSET(mode) ((mode) - NEARESTMV) + struct VP9Common; struct tx_probs { @@ -97,15 +101,6 @@ void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]); -static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi, - const MODE_INFO *above_mi, - const MODE_INFO *left_mi, - int block) { - const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); - const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); - return vp9_kf_y_mode_prob[above][left]; -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 7938fc10a..048202593 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -104,6 +104,44 @@ typedef enum { VP9_ALT_FLAG = 1 << 2, } VP9_REFFRAME; +typedef enum { + PLANE_TYPE_Y = 0, + PLANE_TYPE_UV = 1, + PLANE_TYPES +} PLANE_TYPE; + +typedef enum { + DC_PRED, // Average of above and left pixels + V_PRED, // Vertical + H_PRED, // Horizontal + D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi) + D135_PRED, // Directional 135 deg = 180 - 45 + D117_PRED, // Directional 117 deg = 180 - 63 + D153_PRED, // Directional 153 deg = 180 - 27 + D207_PRED, // Directional 207 deg = 180 + 27 + D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) + TM_PRED, // True-motion + NEARESTMV, + NEARMV, + ZEROMV, + NEWMV, + MB_MODE_COUNT +} PREDICTION_MODE; + +#define INTRA_MODES (TM_PRED + 1) + +#define INTER_MODES (1 + NEWMV - NEARESTMV) + +#define SKIP_CONTEXTS 3 +#define INTER_MODE_CONTEXTS 7 + +/* Segment Feature Masks */ +#define MAX_MV_REF_CANDIDATES 2 + +#define INTRA_INTER_CONTEXTS 4 +#define COMP_INTER_CONTEXTS 5 +#define REF_CONTEXTS 5 + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index 57189df16..bebb37eda 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u, get_thr(bs, qdiff, &sad_thr, &vdiff_thr); if (bs == BLOCK_16X16) { - vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8; + vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8; sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; } else if (bs == BLOCK_32X32) { - vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10; + vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10; sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10; } else /* if (bs == BLOCK_64X64) */ { - vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12; + vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12; sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12; } diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index 51e147e00..ce6952752 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -223,6 +223,6 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, break; } default: - assert("Invalid block index."); + assert(0 && "Invalid block index."); } } diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 5179c6906..188b03d41 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vp9_rtcd.h" +#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -220,6 +221,7 @@ typedef struct VP9Common { uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS]; uint8_t *last_frame_seg_map; uint8_t *current_frame_seg_map; + int seg_map_alloc_size; INTERP_FILTER interp_filter; @@ -276,6 +278,7 @@ typedef struct VP9Common { PARTITION_CONTEXT *above_seg_context; ENTROPY_CONTEXT *above_context; + int above_context_alloc_cols; } VP9_COMMON; // TODO(hkuang): Don't need to lock the whole pool after implementing atomic @@ -305,8 +308,13 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { if (frame_bufs[i].ref_count == 0) break; - assert(i < FRAME_BUFFERS); - frame_bufs[i].ref_count = 1; + if (i != FRAME_BUFFERS) { + frame_bufs[i].ref_count = 1; + } else { + // Reset i to be INVALID_IDX to indicate no free buffer found. + i = INVALID_IDX; + } + unlock_buffer_pool(cm->buffer_pool); return i; } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 2a9736b40..30710ba00 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { # variance -add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x32/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x16/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc"; - -add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get8x8var neon/, "$sse2_x86inc"; - -add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x4/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance4x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance4x4/, "$sse2_x86inc"; - add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -922,21 +877,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; -add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse8x16/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse8x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; -specialize qw/vp9_get_mb_ss/, "$sse2_x86inc"; - add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p"; specialize qw/vp9_avg_8x8 sse2 neon/; @@ -1141,142 +1081,6 @@ specialize qw/vp9_temporal_filter_apply sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - # variance - add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x4/; - - add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance4x4/; - - add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x4/; - - add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance4x4/; - - add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x4/; - - add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance4x4/; - - add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc"; - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc"; @@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/; - add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse8x16/; - - add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse16x8/; - - add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse8x16/; - - add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse16x8/; - - add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse8x16/; - - add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse16x8/; - - add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc"; # ENCODEMB INVOKE diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index 65e2aa69a..1d86b5cfe 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -38,6 +38,18 @@ static INLINE int get_coef_context(const int16_t *neighbors, token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } +static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { + const MODE_INFO *const mi = xd->mi[0]; + + if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) { + return &vp9_default_scan_orders[tx_size]; + } else { + const PREDICTION_MODE mode = get_y_mode(mi, block_idx); + return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]]; + } +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h index e97115823..fc77762de 100644 --- a/vp9/common/vp9_systemdependent.h +++ b/vp9/common/vp9_systemdependent.h @@ -11,13 +11,14 @@ #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ +#include "vpx_ports/msvc.h" + #ifdef _MSC_VER # include <math.h> // the ceil() definition must precede intrin.h # if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86)) # include <intrin.h> -# define USE_MSC_INTRIN +# define USE_MSC_INTRINSICS # endif -# define snprintf _snprintf #endif #ifdef __cplusplus @@ -48,7 +49,7 @@ static INLINE int round(double x) { static INLINE int get_msb(unsigned int n) { return 31 ^ __builtin_clz(n); } -#elif defined(USE_MSC_INTRIN) +#elif defined(USE_MSC_INTRINSICS) #pragma intrinsic(_BitScanReverse) static INLINE int get_msb(unsigned int n) { @@ -56,7 +57,7 @@ static INLINE int get_msb(unsigned int n) { _BitScanReverse(&first_set_bit, n); return first_set_bit; } -#undef USE_MSC_INTRIN +#undef USE_MSC_INTRINSICS #else // Returns (int)floor(log2(n)). n must be > 0. static INLINE int get_msb(unsigned int n) { diff --git a/vp9/common/x86/convolve.h b/vp9/common/x86/convolve.h new file mode 100644 index 000000000..de2df47e5 --- /dev/null +++ b/vp9/common/x86/convolve.h @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VP9_COMMON_X86_CONVOLVE_H_ +#define VP9_COMMON_X86_CONVOLVE_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +typedef void filter8_1dfunction ( + const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter +); + +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h); \ + } \ +} + +#define FUN_CONV_2D(avg, opt) \ +void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7); \ + vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ + vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1); \ + vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } else { \ + vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ + } \ +} + +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void highbd_filter8_1dfunction ( + const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, + int bd +); + +#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ + ptrdiff_t src_stride, \ + uint8_t *dst8, \ + ptrdiff_t dst_stride, \ + const int16_t *filter_x, \ + int x_step_q4, \ + const int16_t *filter_y, \ + int y_step_q4, \ + int w, int h, int bd) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ +} + +#define HIGH_FUN_CONV_2D(avg, opt) \ +void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h, int bd) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 7, bd); \ + vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ + 64, dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vp9_highbd_convolve8_horiz_##opt(src, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 1, bd); \ + vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ + dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } \ + } else { \ + vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h, bd); \ + } \ +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VP9_COMMON_X86_CONVOLVE_H_ diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 963023c53..fd55fb8c6 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -8,421 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <assert.h> - -#include "./vpx_config.h" #include "./vp9_rtcd.h" -#include "vpx_ports/mem.h" - -typedef void filter8_1dfunction ( - const unsigned char *src_ptr, - const ptrdiff_t src_pitch, - unsigned char *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const short *filter -); - -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] || filter[1] || filter[2]) { \ - while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vp9_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h); \ - } \ -} - -#define FUN_CONV_2D(avg, opt) \ -void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \ - vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 7); \ - vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \ - vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 1); \ - vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ - } else { \ - vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ - } \ -} - -#if CONFIG_VP9_HIGHBITDEPTH - -typedef void highbd_filter8_1dfunction ( - const uint16_t *src_ptr, - const ptrdiff_t src_pitch, - uint16_t *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const int16_t *filter, - int bd -); - -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ - ptrdiff_t src_stride, \ - uint8_t *dst8, \ - ptrdiff_t dst_stride, \ - const int16_t *filter_x, \ - int x_step_q4, \ - const int16_t *filter_y, \ - int y_step_q4, \ - int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] || filter[1] || filter[2]) { \ - while (w >= 16) { \ - vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ -} - -#define HIGH_FUN_CONV_2D(avg, opt) \ -void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 7, bd); \ - vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ - 64, dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vp9_highbd_convolve8_horiz_##opt(src, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 1, bd); \ - vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ - dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } else { \ - vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h, bd); \ - } \ -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vp9_filter_block1d16_v8_avx2; -filter8_1dfunction vp9_filter_block1d16_h8_avx2; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; -#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 -#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 -#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; -#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 -#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 -#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 / ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v2_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_ssse3; -#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 -#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 -#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 -#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 -#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 -#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 -#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 -// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); - -// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, avx2); -#endif // HAVE_AX2 && HAVE_SSSE3 -#if HAVE_SSSE3 -#if ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; -#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 -#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 -#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 -#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 -#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v8_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; -#endif // ARCH_X86_64 / ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; - -filter8_1dfunction vp9_filter_block1d16_v2_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; - -// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_ , ssse3); -#endif // HAVE_SSSE3 +#include "./vpx_config.h" +#include "vp9/common/x86/convolve.h" #if HAVE_SSE2 filter8_1dfunction vp9_filter_block1d16_v8_sse2; diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c index 3bc7d3918..cee8d1e76 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -8,7 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +// Due to a header conflict between math.h and intrinsics includes with ceil() +// in certain configurations under vs9 this include needs to precede +// immintrin.h. +#include "./vp9_rtcd.h" + #include <immintrin.h> + +#include "vp9/common/x86/convolve.h" #include "vpx_ports/mem.h" // filters for 16_h8 and 16_v8 @@ -53,23 +60,23 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ -void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; - unsigned int src_stride, dst_stride; + ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -104,9 +111,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, for (i = output_height; i > 1; i-=2) { // load the 2 strides of source srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr-3))); + _mm_loadu_si128((const __m128i *)(src_ptr - 3))); srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm_loadu_si128((__m128i *) + _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line-3)), 1); // filter the source buffer @@ -135,9 +142,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+5))); + _mm_loadu_si128((const __m128i *)(src_ptr + 5))); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm_loadu_si128((__m128i *) + _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line+5)), 1); // add and saturate the results together @@ -202,7 +209,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, @@ -237,7 +244,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, @@ -297,12 +304,12 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, } } -void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; @@ -310,11 +317,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; - unsigned int src_stride, dst_stride; + ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -344,19 +351,19 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr))); + _mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, @@ -393,11 +400,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); @@ -476,7 +483,7 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8( @@ -542,3 +549,54 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); } } + +#if HAVE_AVX2 && HAVE_SSSE3 +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif // ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c index 4ab49e772..5fd2857e1 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -8,9 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +// Due to a header conflict between math.h and intrinsics includes with ceil() +// in certain configurations under vs9 this include needs to precede +// tmmintrin.h. +#include "./vp9_rtcd.h" + #include <tmmintrin.h> -#include "./vp9_rtcd.h" +#include "vp9/common/x86/convolve.h" #include "vpx_ports/mem.h" #include "vpx_ports/emmintrin_compat.h" @@ -40,12 +45,17 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +// These are reused by the avx2 intrinsics. +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; + +void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, srcReg, minReg; @@ -53,7 +63,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -74,7 +84,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); @@ -111,12 +121,12 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; @@ -125,7 +135,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -149,7 +159,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); @@ -191,12 +201,12 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; @@ -205,7 +215,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -229,7 +239,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); @@ -256,7 +266,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // reading the next 16 bytes. // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, @@ -308,12 +318,12 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { +void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i addFilterReg64, filtersReg, minReg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; @@ -323,7 +333,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -338,17 +348,17 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); - srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); - srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); - srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); - srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); - srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); - srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); + srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); for (i = 0; i < output_height; i++) { // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); + srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); @@ -396,12 +406,12 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; @@ -411,7 +421,7 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -426,17 +436,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); // load the first 7 rows of 16 bytes - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); for (i = 0; i < output_height; i++) { // load the last 16 bytes - srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); + srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); @@ -510,3 +520,82 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, output_ptr+=out_pitch; } } + +#if ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction vp9_filter_block1d16_v8_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif // ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; + +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; + +// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + ssse3); + +// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_ , ssse3); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index fcf480b86..0e9b1c523 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -699,7 +699,8 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { #if CONFIG_SIZE_LIMIT if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Width and height beyond allowed size."); + "Dimensions of %dx%d beyond allowed size of %dx%d.", + width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); #endif if (cm->width != width || cm->height != height) { const int new_mi_rows = diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 288d8690c..7991a39e6 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -50,7 +50,6 @@ static void initialize_dec(void) { static void vp9_dec_setup_mi(VP9_COMMON *cm) { cm->mi = cm->mip + cm->mi_stride + 1; - memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; memset(cm->mi_grid_base, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); @@ -212,6 +211,9 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, // Find an empty frame buffer. const int free_fb = get_free_fb(cm); + if (cm->new_fb_idx == INVALID_IDX) + return VPX_CODEC_MEM_ERROR; + // Decrease ref_count since it will be increased again in // ref_cnt_fb() below. --frame_bufs[free_fb].ref_count; @@ -299,7 +301,10 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, && frame_bufs[cm->new_fb_idx].ref_count == 0) pool->release_fb_cb(pool->cb_priv, &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); + if (cm->new_fb_idx == INVALID_IDX) + return VPX_CODEC_MEM_ERROR; // Assign a MV array to the frame buffer. cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index bb8c66fc0..63269844c 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -17,6 +17,7 @@ #if CONFIG_COEFFICIENT_RANGE_CHECKING #include "vp9/common/vp9_idct.h" #endif +#include "vp9/common/vp9_scan.h" #include "vp9/decoder/vp9_detokenize.h" diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c index cf82dd75d..166156af7 100644 --- a/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -10,6 +10,7 @@ #include <arm_neon.h> #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx_ports/mem.h" @@ -20,82 +21,6 @@ #include "vp9/encoder/vp9_variance.h" -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -// w * h must be less than 2048 or local variable v_sum may overflow. -static void variance_neon_w8(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, uint32_t *sse, int *sum) { - int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = vmlal_s16(v_sse_lo, - vget_low_s16(sv_diff), - vget_low_s16(sv_diff)); - v_sse_hi = vmlal_s16(v_sse_hi, - vget_high_s16(sv_diff), - vget_high_s16(sv_diff)); - } - a += a_stride; - b += b_stride; - } - - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); -} - -void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8, - 8, sse, sum); -} - -unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 -} - -void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16, - 16, sse, sum); -} - -unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 -} - static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, @@ -162,7 +87,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, BILINEAR_FILTERS_2TAP(xoffset)); var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse); + return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); } unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, @@ -180,77 +105,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, BILINEAR_FILTERS_2TAP(xoffset)); var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse); -} - -void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32, - 32, sse, sum); -} - -unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 -} - -unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + (32 * a_stride), a_stride, - b + (32 * b_stride), b_stride, 32, 32, - &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 -} - -unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 -} - -unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, - b + (16 * 2 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, - b + (16 * 3 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 + return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); } unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, @@ -268,7 +123,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, BILINEAR_FILTERS_2TAP(xoffset)); var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse); + return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); } unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, @@ -286,5 +141,5 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, BILINEAR_FILTERS_2TAP(xoffset)); var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse); + return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); } diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c index 55c964903..9e5d9ee6a 100644 --- a/vp9/encoder/vp9_aq_variance.c +++ b/vp9/encoder/vp9_aq_variance.c @@ -98,9 +98,9 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, int avg; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride, - CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, - &sse, &avg); + highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, + &sse, &avg); sse >>= 2 * (xd->bd - 8); avg >>= (xd->bd - 8); } else { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3d310f955..a8adca9ec 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -13,6 +13,7 @@ #include <stdio.h> #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx_ports/mem.h" @@ -463,46 +464,55 @@ static int set_vt_partitioning(VP9_COMP *cpi, return 0; } -void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) { +// Set the variance split thresholds for following the block sizes: +// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16, +// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to 4x4 partition) is +// currently only used on key frame. +static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { + VP9_COMMON *const cm = &cpi->common; + const int is_key_frame = (cm->frame_type == KEY_FRAME); + const int threshold_multiplier = is_key_frame ? 20 : 1; + const int64_t threshold_base = (int64_t)(threshold_multiplier * + cpi->y_dequant[q][1]); + if (is_key_frame) { + thresholds[0] = threshold_base; + thresholds[1] = threshold_base >> 2; + thresholds[2] = threshold_base >> 2; + thresholds[3] = threshold_base << 2; + } else { + thresholds[1] = threshold_base; + if (cm->width <= 352 && cm->height <= 288) { + thresholds[0] = threshold_base >> 2; + thresholds[2] = threshold_base << 3; + } else { + thresholds[0] = threshold_base; + thresholds[1] = (5 * threshold_base) >> 2; + if (cm->width >= 1920 && cm->height >= 1080) + thresholds[1] = (7 * threshold_base) >> 2; + thresholds[2] = threshold_base << cpi->oxcf.speed; + } + } +} + +void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { + VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; + const int is_key_frame = (cm->frame_type == KEY_FRAME); if (sf->partition_search_type != VAR_BASED_PARTITION && sf->partition_search_type != REFERENCE_PARTITION) { return; } else { - VP9_COMMON *const cm = &cpi->common; - const int is_key_frame = (cm->frame_type == KEY_FRAME); - const int threshold_multiplier = is_key_frame ? 20 : 1; - const int64_t threshold_base = (int64_t)(threshold_multiplier * - cpi->y_dequant[q][1]); - - // TODO(marpan): Allow 4x4 partitions for inter-frames. - // use_4x4_partition = (variance4x4downsample[i2 + j] == 1); - // If 4x4 partition is not used, then 8x8 partition will be selected - // if variance of 16x16 block is very high, so use larger threshold - // for 16x16 (threshold_bsize_min) in that case. - - // Array index: 0 - threshold_64x64; 1 - threshold_32x32; - // 2 - threshold_16x16; 3 - vbp_threshold_8x8; + set_vbp_thresholds(cpi, cpi->vbp_thresholds, q); + // The thresholds below are not changed locally. if (is_key_frame) { - cpi->vbp_thresholds[0] = threshold_base; - cpi->vbp_thresholds[1] = threshold_base >> 2; - cpi->vbp_thresholds[2] = threshold_base >> 2; - cpi->vbp_thresholds[3] = threshold_base << 2; cpi->vbp_threshold_sad = 0; cpi->vbp_bsize_min = BLOCK_8X8; } else { - cpi->vbp_thresholds[1] = threshold_base; - if (cm->width <= 352 && cm->height <= 288) { - cpi->vbp_thresholds[0] = threshold_base >> 2; - cpi->vbp_thresholds[2] = threshold_base << 3; + if (cm->width <= 352 && cm->height <= 288) cpi->vbp_threshold_sad = 100; - } else { - cpi->vbp_thresholds[0] = threshold_base; - cpi->vbp_thresholds[1] = (5 * threshold_base) >> 2; - cpi->vbp_thresholds[2] = threshold_base << cpi->oxcf.speed; + else cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ? (cpi->y_dequant[q][1] << 1) : 1000; - } cpi->vbp_bsize_min = BLOCK_16X16; } cpi->vbp_threshold_minmax = 15 + (q >> 3); @@ -551,23 +561,6 @@ static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d, return (minmax_max - minmax_min); } -static void modify_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { - VP9_COMMON *const cm = &cpi->common; - const int64_t threshold_base = (int64_t)(cpi->y_dequant[q][1]); - - // Array index: 0 - threshold_64x64; 1 - threshold_32x32; - // 2 - threshold_16x16; 3 - vbp_threshold_8x8; - thresholds[1] = threshold_base; - if (cm->width <= 352 && cm->height <= 288) { - thresholds[0] = threshold_base >> 2; - thresholds[2] = threshold_base << 3; - } else { - thresholds[0] = threshold_base; - thresholds[1] = (5 * threshold_base) >> 2; - thresholds[2] = threshold_base << cpi->oxcf.speed; - } -} - static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d, int dp, int x8_idx, int y8_idx, v8x8 *vst, #if CONFIG_VP9_HIGHBITDEPTH @@ -680,7 +673,7 @@ static int choose_partitioning(VP9_COMP *cpi, if (cyclic_refresh_segment_id_boosted(segment_id)) { int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - modify_vbp_thresholds(cpi, thresholds, q); + set_vbp_thresholds(cpi, thresholds, q); } } @@ -724,7 +717,7 @@ static int choose_partitioning(VP9_COMP *cpi, mbmi->mv[0].as_int = 0; mbmi->interp_filter = BILINEAR; - y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize); + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); if (y_sad_g < y_sad) { vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, &cm->frame_refs[GOLDEN_FRAME - 1].sf); @@ -3672,15 +3665,15 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - vp9_highbd_get16x16var(src, src_stride, last_src, last_stride, + vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); break; case VPX_BITS_10: - vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride, + vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); break; case VPX_BITS_12: - vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride, + vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); break; default: @@ -3689,11 +3682,11 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { return -1; } } else { - vp9_get16x16var(src, src_stride, last_src, last_stride, + vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); } #else - vp9_get16x16var(src, src_stride, last_src, last_stride, + vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); #endif // CONFIG_VP9_HIGHBITDEPTH var16->var = var16->sse - diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h index 1acde0283..6aaa56463 100644 --- a/vp9/encoder/vp9_encodeframe.h +++ b/vp9/encoder/vp9_encodeframe.h @@ -40,7 +40,7 @@ void vp9_init_tile_data(struct VP9_COMP *cpi); void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); -void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q); +void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index b115e0ef9..2829365e5 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -18,6 +18,7 @@ #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_encodemb.h" diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index df7094949..2fdf4082a 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1000,7 +1000,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, vpx_highbd_sad32x16_avg_bits8, - vp9_highbd_variance32x16, + vpx_highbd_8_variance32x16, vp9_highbd_sub_pixel_variance32x16, vp9_highbd_sub_pixel_avg_variance32x16, NULL, @@ -1010,7 +1010,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, vpx_highbd_sad16x32_avg_bits8, - vp9_highbd_variance16x32, + vpx_highbd_8_variance16x32, vp9_highbd_sub_pixel_variance16x32, vp9_highbd_sub_pixel_avg_variance16x32, NULL, @@ -1020,7 +1020,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, vpx_highbd_sad64x32_avg_bits8, - vp9_highbd_variance64x32, + vpx_highbd_8_variance64x32, vp9_highbd_sub_pixel_variance64x32, vp9_highbd_sub_pixel_avg_variance64x32, NULL, @@ -1030,7 +1030,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, vpx_highbd_sad32x64_avg_bits8, - vp9_highbd_variance32x64, + vpx_highbd_8_variance32x64, vp9_highbd_sub_pixel_variance32x64, vp9_highbd_sub_pixel_avg_variance32x64, NULL, @@ -1040,7 +1040,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, vpx_highbd_sad32x32_avg_bits8, - vp9_highbd_variance32x32, + vpx_highbd_8_variance32x32, vp9_highbd_sub_pixel_variance32x32, vp9_highbd_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits8, @@ -1050,7 +1050,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, vpx_highbd_sad64x64_avg_bits8, - vp9_highbd_variance64x64, + vpx_highbd_8_variance64x64, vp9_highbd_sub_pixel_variance64x64, vp9_highbd_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits8, @@ -1060,7 +1060,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, vpx_highbd_sad16x16_avg_bits8, - vp9_highbd_variance16x16, + vpx_highbd_8_variance16x16, vp9_highbd_sub_pixel_variance16x16, vp9_highbd_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits8, @@ -1070,7 +1070,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8, - vp9_highbd_variance16x8, + vpx_highbd_8_variance16x8, vp9_highbd_sub_pixel_variance16x8, vp9_highbd_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8, @@ -1080,7 +1080,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8, - vp9_highbd_variance8x16, + vpx_highbd_8_variance8x16, vp9_highbd_sub_pixel_variance8x16, vp9_highbd_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8, @@ -1090,7 +1090,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vp9_highbd_variance8x8, + vpx_highbd_8_variance8x8, vp9_highbd_sub_pixel_variance8x8, vp9_highbd_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8, @@ -1100,7 +1100,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, - vp9_highbd_variance8x4, + vpx_highbd_8_variance8x4, vp9_highbd_sub_pixel_variance8x4, vp9_highbd_sub_pixel_avg_variance8x4, NULL, @@ -1110,7 +1110,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, - vp9_highbd_variance4x8, + vpx_highbd_8_variance4x8, vp9_highbd_sub_pixel_variance4x8, vp9_highbd_sub_pixel_avg_variance4x8, NULL, @@ -1120,7 +1120,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vp9_highbd_variance4x4, + vpx_highbd_8_variance4x4, vp9_highbd_sub_pixel_variance4x4, vp9_highbd_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8, @@ -1132,7 +1132,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, vpx_highbd_sad32x16_avg_bits10, - vp9_highbd_10_variance32x16, + vpx_highbd_10_variance32x16, vp9_highbd_10_sub_pixel_variance32x16, vp9_highbd_10_sub_pixel_avg_variance32x16, NULL, @@ -1142,7 +1142,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, vpx_highbd_sad16x32_avg_bits10, - vp9_highbd_10_variance16x32, + vpx_highbd_10_variance16x32, vp9_highbd_10_sub_pixel_variance16x32, vp9_highbd_10_sub_pixel_avg_variance16x32, NULL, @@ -1152,7 +1152,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, vpx_highbd_sad64x32_avg_bits10, - vp9_highbd_10_variance64x32, + vpx_highbd_10_variance64x32, vp9_highbd_10_sub_pixel_variance64x32, vp9_highbd_10_sub_pixel_avg_variance64x32, NULL, @@ -1162,7 +1162,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, vpx_highbd_sad32x64_avg_bits10, - vp9_highbd_10_variance32x64, + vpx_highbd_10_variance32x64, vp9_highbd_10_sub_pixel_variance32x64, vp9_highbd_10_sub_pixel_avg_variance32x64, NULL, @@ -1172,7 +1172,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, vpx_highbd_sad32x32_avg_bits10, - vp9_highbd_10_variance32x32, + vpx_highbd_10_variance32x32, vp9_highbd_10_sub_pixel_variance32x32, vp9_highbd_10_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits10, @@ -1182,7 +1182,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, vpx_highbd_sad64x64_avg_bits10, - vp9_highbd_10_variance64x64, + vpx_highbd_10_variance64x64, vp9_highbd_10_sub_pixel_variance64x64, vp9_highbd_10_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits10, @@ -1192,7 +1192,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, vpx_highbd_sad16x16_avg_bits10, - vp9_highbd_10_variance16x16, + vpx_highbd_10_variance16x16, vp9_highbd_10_sub_pixel_variance16x16, vp9_highbd_10_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits10, @@ -1202,7 +1202,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, vpx_highbd_sad16x8_avg_bits10, - vp9_highbd_10_variance16x8, + vpx_highbd_10_variance16x8, vp9_highbd_10_sub_pixel_variance16x8, vp9_highbd_10_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits10, @@ -1212,7 +1212,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, vpx_highbd_sad8x16_avg_bits10, - vp9_highbd_10_variance8x16, + vpx_highbd_10_variance8x16, vp9_highbd_10_sub_pixel_variance8x16, vp9_highbd_10_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits10, @@ -1222,7 +1222,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10, - vp9_highbd_10_variance8x8, + vpx_highbd_10_variance8x8, vp9_highbd_10_sub_pixel_variance8x8, vp9_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10, @@ -1232,7 +1232,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad8x4_avg_bits10, - vp9_highbd_10_variance8x4, + vpx_highbd_10_variance8x4, vp9_highbd_10_sub_pixel_variance8x4, vp9_highbd_10_sub_pixel_avg_variance8x4, NULL, @@ -1242,7 +1242,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad4x8_avg_bits10, - vp9_highbd_10_variance4x8, + vpx_highbd_10_variance4x8, vp9_highbd_10_sub_pixel_variance4x8, vp9_highbd_10_sub_pixel_avg_variance4x8, NULL, @@ -1252,7 +1252,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10, - vp9_highbd_10_variance4x4, + vpx_highbd_10_variance4x4, vp9_highbd_10_sub_pixel_variance4x4, vp9_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10, @@ -1264,7 +1264,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, - vp9_highbd_12_variance32x16, + vpx_highbd_12_variance32x16, vp9_highbd_12_sub_pixel_variance32x16, vp9_highbd_12_sub_pixel_avg_variance32x16, NULL, @@ -1274,7 +1274,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, vpx_highbd_sad16x32_avg_bits12, - vp9_highbd_12_variance16x32, + vpx_highbd_12_variance16x32, vp9_highbd_12_sub_pixel_variance16x32, vp9_highbd_12_sub_pixel_avg_variance16x32, NULL, @@ -1284,7 +1284,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, vpx_highbd_sad64x32_avg_bits12, - vp9_highbd_12_variance64x32, + vpx_highbd_12_variance64x32, vp9_highbd_12_sub_pixel_variance64x32, vp9_highbd_12_sub_pixel_avg_variance64x32, NULL, @@ -1294,7 +1294,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, vpx_highbd_sad32x64_avg_bits12, - vp9_highbd_12_variance32x64, + vpx_highbd_12_variance32x64, vp9_highbd_12_sub_pixel_variance32x64, vp9_highbd_12_sub_pixel_avg_variance32x64, NULL, @@ -1304,7 +1304,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, vpx_highbd_sad32x32_avg_bits12, - vp9_highbd_12_variance32x32, + vpx_highbd_12_variance32x32, vp9_highbd_12_sub_pixel_variance32x32, vp9_highbd_12_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits12, @@ -1314,7 +1314,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, vpx_highbd_sad64x64_avg_bits12, - vp9_highbd_12_variance64x64, + vpx_highbd_12_variance64x64, vp9_highbd_12_sub_pixel_variance64x64, vp9_highbd_12_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits12, @@ -1324,7 +1324,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, vpx_highbd_sad16x16_avg_bits12, - vp9_highbd_12_variance16x16, + vpx_highbd_12_variance16x16, vp9_highbd_12_sub_pixel_variance16x16, vp9_highbd_12_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits12, @@ -1334,7 +1334,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, vpx_highbd_sad16x8_avg_bits12, - vp9_highbd_12_variance16x8, + vpx_highbd_12_variance16x8, vp9_highbd_12_sub_pixel_variance16x8, vp9_highbd_12_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits12, @@ -1344,7 +1344,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, vpx_highbd_sad8x16_avg_bits12, - vp9_highbd_12_variance8x16, + vpx_highbd_12_variance8x16, vp9_highbd_12_sub_pixel_variance8x16, vp9_highbd_12_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits12, @@ -1354,7 +1354,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12, - vp9_highbd_12_variance8x8, + vpx_highbd_12_variance8x8, vp9_highbd_12_sub_pixel_variance8x8, vp9_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12, @@ -1364,7 +1364,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad8x4_avg_bits12, - vp9_highbd_12_variance8x4, + vpx_highbd_12_variance8x4, vp9_highbd_12_sub_pixel_variance8x4, vp9_highbd_12_sub_pixel_avg_variance8x4, NULL, @@ -1374,7 +1374,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad4x8_avg_bits12, - vp9_highbd_12_variance4x8, + vpx_highbd_12_variance4x8, vp9_highbd_12_sub_pixel_variance4x8, vp9_highbd_12_sub_pixel_avg_variance4x8, NULL, @@ -1384,7 +1384,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12, - vp9_highbd_12_variance4x4, + vpx_highbd_12_variance4x4, vp9_highbd_12_sub_pixel_variance4x4, vp9_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12, @@ -1807,61 +1807,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, - vp9_variance32x16, vp9_sub_pixel_variance32x16, + vpx_variance32x16, vp9_sub_pixel_variance32x16, vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, - vp9_variance16x32, vp9_sub_pixel_variance16x32, + vpx_variance16x32, vp9_sub_pixel_variance16x32, vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, - vp9_variance64x32, vp9_sub_pixel_variance64x32, + vpx_variance64x32, vp9_sub_pixel_variance64x32, vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, - vp9_variance32x64, vp9_sub_pixel_variance32x64, + vpx_variance32x64, vp9_sub_pixel_variance32x64, vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, - vp9_variance32x32, vp9_sub_pixel_variance32x32, + vpx_variance32x32, vp9_sub_pixel_variance32x32, vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, - vp9_variance64x64, vp9_sub_pixel_variance64x64, + vpx_variance64x64, vp9_sub_pixel_variance64x64, vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, - vp9_variance16x16, vp9_sub_pixel_variance16x16, + vpx_variance16x16, vp9_sub_pixel_variance16x16, vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, - vp9_variance16x8, vp9_sub_pixel_variance16x8, + vpx_variance16x8, vp9_sub_pixel_variance16x8, vp9_sub_pixel_avg_variance16x8, vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, - vp9_variance8x16, vp9_sub_pixel_variance8x16, + vpx_variance8x16, vp9_sub_pixel_variance8x16, vp9_sub_pixel_avg_variance8x16, vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, - vp9_variance8x8, vp9_sub_pixel_variance8x8, + vpx_variance8x8, vp9_sub_pixel_variance8x8, vp9_sub_pixel_avg_variance8x8, vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, - vp9_variance8x4, vp9_sub_pixel_variance8x4, + vpx_variance8x4, vp9_sub_pixel_variance8x4, vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, - vp9_variance4x8, vp9_sub_pixel_variance4x8, + vpx_variance4x8, vp9_sub_pixel_variance4x8, vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, - vp9_variance4x4, vp9_sub_pixel_variance4x4, + vpx_variance4x4, vp9_sub_pixel_variance4x4, vp9_sub_pixel_avg_variance4x4, vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d) @@ -2081,7 +2081,7 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *pa = a; const uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { - vp9_mse16x16(pa, a_stride, pb, b_stride, &sse); + vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; pa += 16; @@ -2126,21 +2126,21 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, unsigned int sse = 0; int sum = 0; if (dw > 0) { - highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, - dw, height, &sse, &sum); + highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height, &sse, &sum); total_sse += sse; } if (dh > 0) { - highbd_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); + highbd_8_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); total_sse += sse; } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { - vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse); + vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; pa += 16; pb += 16; @@ -2716,7 +2716,10 @@ void vp9_scale_references(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { const int new_fb = get_free_fb(cm); - RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb]; + RefCntBuffer *new_fb_ptr = NULL; + if (cm->new_fb_idx == INVALID_IDX) + return; + new_fb_ptr = &pool->frame_bufs[new_fb]; cm->cur_frame = &pool->frame_bufs[new_fb]; vp9_realloc_frame_buffer(&pool->frame_bufs[new_fb].buf, cm->width, cm->height, @@ -2728,7 +2731,10 @@ void vp9_scale_references(VP9_COMP *cpi) { #else if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { const int new_fb = get_free_fb(cm); - RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb]; + RefCntBuffer *new_fb_ptr = NULL; + if (cm->new_fb_idx == INVALID_IDX) + return; + new_fb_ptr = &pool->frame_bufs[new_fb]; vp9_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -2797,7 +2803,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10u %dx%d %10d %10d %10d %10d" + fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" "%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d " "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" "%6d %6d %5d %5d %5d " @@ -2805,6 +2811,8 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { "%10lf %8u %10"PRId64" %10d %10d\n", cpi->common.current_video_frame, cm->width, cm->height, + cpi->rc.source_alt_ref_pending, + cpi->rc.source_alt_ref_active, cpi->rc.this_frame_target, cpi->rc.projected_frame_size, cpi->rc.projected_frame_size / cpi->common.MBs, @@ -3031,7 +3039,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi) { set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); vp9_set_quantizer(cm, q); - vp9_set_vbp_thresholds(cpi, q); + vp9_set_variance_partition_thresholds(cpi, q); setup_frame(cpi); @@ -3476,34 +3484,41 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } } if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) { - // Use the last frame context for the empty frame. + // Use context 0 for intra only empty frame, but the last frame context + // for other empty frames. + if (cpi->svc.encode_empty_frame_state == ENCODING) { + if (cpi->svc.encode_intra_empty_frame != 0) + cm->frame_context_idx = 0; + else + cm->frame_context_idx = FRAME_CONTEXTS - 1; + } else { cm->frame_context_idx = - (cpi->svc.encode_empty_frame_state == ENCODING) ? FRAME_CONTEXTS - 1 : cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id; + } + + cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode; // The probs will be updated based on the frame type of its previous // frame if frame_parallel_decoding_mode is 0. The type may vary for // the frame after a key frame in base layer since we may drop enhancement // layers. So set frame_parallel_decoding_mode to 1 in this case. - if (cpi->svc.number_temporal_layers == 1) { - if (cpi->svc.spatial_layer_id == 0 && - cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) - cm->frame_parallel_decoding_mode = 1; - else - cm->frame_parallel_decoding_mode = 0; - } else if (cpi->svc.spatial_layer_id == 0) { - // Find the 2nd frame in temporal base layer and 1st frame in temporal - // enhancement layers from the key frame. - int i; - for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { - if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { + if (cm->frame_parallel_decoding_mode == 0) { + if (cpi->svc.number_temporal_layers == 1) { + if (cpi->svc.spatial_layer_id == 0 && + cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) cm->frame_parallel_decoding_mode = 1; - break; + } else if (cpi->svc.spatial_layer_id == 0) { + // Find the 2nd frame in temporal base layer and 1st frame in temporal + // enhancement layers from the key frame. + int i; + for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { + if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { + cm->frame_parallel_decoding_mode = 1; + break; + } } } - if (i == cpi->svc.number_temporal_layers) - cm->frame_parallel_decoding_mode = 0; } } @@ -3968,6 +3983,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } cm->show_frame = 0; + cm->intra_only = 0; cpi->refresh_alt_ref_frame = 1; cpi->refresh_golden_frame = 0; cpi->refresh_last_frame = 0; @@ -4310,8 +4326,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif if (is_two_pass_svc(cpi)) { - if (cpi->svc.encode_empty_frame_state == ENCODING) + if (cpi->svc.encode_empty_frame_state == ENCODING) { cpi->svc.encode_empty_frame_state = ENCODED; + cpi->svc.encode_intra_empty_frame = 0; + } if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 88b10307d..942eac911 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -12,6 +12,7 @@ #include <math.h> #include <stdio.h> +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" @@ -267,13 +268,13 @@ void vp9_end_first_pass(VP9_COMP *cpi) { static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_8X8: - return vp9_mse8x8; + return vpx_mse8x8; case BLOCK_16X8: - return vp9_mse16x8; + return vpx_mse16x8; case BLOCK_8X16: - return vp9_mse8x16; + return vpx_mse8x16; default: - return vp9_mse16x16; + return vpx_mse16x16; } } @@ -293,37 +294,37 @@ static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, default: switch (bsize) { case BLOCK_8X8: - return vp9_highbd_mse8x8; + return vpx_highbd_8_mse8x8; case BLOCK_16X8: - return vp9_highbd_mse16x8; + return vpx_highbd_8_mse16x8; case BLOCK_8X16: - return vp9_highbd_mse8x16; + return vpx_highbd_8_mse8x16; default: - return vp9_highbd_mse16x16; + return vpx_highbd_8_mse16x16; } break; case 10: switch (bsize) { case BLOCK_8X8: - return vp9_highbd_10_mse8x8; + return vpx_highbd_10_mse8x8; case BLOCK_16X8: - return vp9_highbd_10_mse16x8; + return vpx_highbd_10_mse16x8; case BLOCK_8X16: - return vp9_highbd_10_mse8x16; + return vpx_highbd_10_mse8x16; default: - return vp9_highbd_10_mse16x16; + return vpx_highbd_10_mse16x16; } break; case 12: switch (bsize) { case BLOCK_8X8: - return vp9_highbd_12_mse8x8; + return vpx_highbd_12_mse8x8; case BLOCK_16X8: - return vp9_highbd_12_mse16x8; + return vpx_highbd_12_mse16x8; case BLOCK_8X16: - return vp9_highbd_12_mse8x16; + return vpx_highbd_12_mse8x16; default: - return vp9_highbd_12_mse16x16; + return vpx_highbd_12_mse16x16; } break; } @@ -634,7 +635,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { xd->mi[0]->mbmi.tx_size = use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; vp9_encode_intra_block_plane(x, bsize, 0); - this_error = vp9_get_mb_ss(x->plane[0].src_diff); + this_error = vpx_get_mb_ss(x->plane[0].src_diff); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { switch (cm->bit_depth) { @@ -1696,7 +1697,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; // Allocate bits to the other frames in the group. - for (i = 0; i < rc->baseline_gf_interval - 1; ++i) { + for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) { int arf_idx = 0; if (EOF == input_stats(twopass, &frame_stats)) break; @@ -1934,8 +1935,26 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + // Should we use the alternate reference frame. + if (allow_alt_ref && + (i < cpi->oxcf.lag_in_frames) && + (i >= rc->min_gf_interval)) { + // Calculate the boost for alt ref. + rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, + &b_boost); + rc->source_alt_ref_pending = 1; + + // Test to see if multi arf is appropriate. + cpi->multi_arf_enabled = + (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && + (zero_motion_accumulator < 0.995)) ? 1 : 0; + } else { + rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST); + rc->source_alt_ref_pending = 0; + } + // Set the interval until the next gf. - if (is_key_frame || rc->source_alt_ref_active) + if (is_key_frame || rc->source_alt_ref_pending) rc->baseline_gf_interval = i - 1; else rc->baseline_gf_interval = i; @@ -1960,24 +1979,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->frames_till_gf_update_due = rc->baseline_gf_interval; - // Should we use the alternate reference frame. - if (allow_alt_ref && - (i < cpi->oxcf.lag_in_frames) && - (i >= rc->min_gf_interval)) { - // Calculate the boost for alt ref. - rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, - &b_boost); - rc->source_alt_ref_pending = 1; - - // Test to see if multi arf is appropriate. - cpi->multi_arf_enabled = - (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && - (zero_motion_accumulator < 0.995)) ? 1 : 0; - } else { - rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST); - rc->source_alt_ref_pending = 0; - } - // Reset the file position. reset_fpf_position(twopass, start_pos); @@ -2581,9 +2582,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); lc->frames_from_key_frame = 0; - // Reset the empty frame resolution since we have a key frame. - cpi->svc.empty_frame_width = cm->width; - cpi->svc.empty_frame_height = cm->height; + // Encode an intra only empty frame since we have a key frame. + cpi->svc.encode_intra_empty_frame = 1; } } else { cm->frame_type = INTER_FRAME; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 8bdd4286a..15f95829f 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -13,11 +13,13 @@ #include <stdio.h> #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_mcomp.h" @@ -303,13 +305,13 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd, if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); - vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); - vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } } else { @@ -321,7 +323,7 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd, (void) xd; if (second_pred != NULL) { DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); - vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); @@ -1789,8 +1791,11 @@ static const MV search_pos[4] = { }; unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize) { + BLOCK_SIZE bsize, + int mi_row, int mi_col) { MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}}; DECLARE_ALIGNED(16, int16_t, hbuf[128]); DECLARE_ALIGNED(16, int16_t, vbuf[128]); DECLARE_ALIGNED(16, int16_t, src_hbuf[64]); @@ -1807,12 +1812,34 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, unsigned int best_sad, tmp_sad, this_sad[4]; MV this_mv; const int norm_factor = 3 + (bw >> 5); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]); + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[0]; + vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } #if CONFIG_VP9_HIGHBITDEPTH - tmp_mv->row = 0; - tmp_mv->col = 0; - return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); + { + unsigned int this_sad; + tmp_mv->row = 0; + tmp_mv->col = 0; + this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + return this_sad; + } #endif // Set up prediction 1-D reference set @@ -1890,6 +1917,12 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, tmp_mv->row *= 8; tmp_mv->col *= 8; + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + return best_sad; } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index dd8a46079..99c1afa28 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -83,7 +83,8 @@ int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x, // Perform integral projection based motion estimation. unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize); + BLOCK_SIZE bsize, + int mi_row, int mi_col); typedef int (integer_mv_pattern_search_fn) ( const MACROBLOCK *x, diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 1e917159f..b0e255de8 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -14,6 +14,7 @@ #include <stdio.h> #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -24,6 +25,7 @@ #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_cost.h" #include "vp9/encoder/vp9_encoder.h" @@ -215,7 +217,7 @@ static void block_variance(const uint8_t *src, int src_stride, for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { - vp9_get8x8var(src + src_stride * i + j, src_stride, + vpx_get8x8var(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, ref_stride, &sse8x8[k], &sum8x8[k]); *sse += sse8x8[k]; @@ -1248,7 +1250,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (bsize < BLOCK_16X16) continue; - tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize); + tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index fabe36296..7211e9992 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1207,11 +1207,9 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { // this frame refreshes means next frames don't unless specified by user rc->frames_since_golden = 0; - if (cpi->oxcf.pass == 2) { - if (!rc->source_alt_ref_pending && - cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD) - rc->source_alt_ref_active = 0; - } else if (!rc->source_alt_ref_pending) { + // If we are not using alt ref in the up and coming group clear the arf + // active flag. + if (!rc->source_alt_ref_pending) { rc->source_alt_ref_active = 0; } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 6eb8f6cb5..9fa258c61 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -25,6 +25,7 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_systemdependent.h" diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index b3491a27a..2d2e95d9c 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -15,6 +15,8 @@ #include "vp9/encoder/vp9_extend.h" #define SMALL_FRAME_FB_IDX 7 +#define SMALL_FRAME_WIDTH 16 +#define SMALL_FRAME_HEIGHT 16 void vp9_init_layer_context(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; @@ -33,7 +35,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (vp9_realloc_frame_buffer(&cpi->svc.empty_frame.img, - cpi->common.width, cpi->common.height, + SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT, cpi->common.subsampling_x, cpi->common.subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH @@ -48,8 +50,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80, cpi->svc.empty_frame.img.buffer_alloc_sz); - cpi->svc.empty_frame_width = cpi->common.width; - cpi->svc.empty_frame_height = cpi->common.height; } } @@ -362,20 +362,11 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) { cpi->lst_fb_idx = cpi->gld_fb_idx = cpi->alt_fb_idx = SMALL_FRAME_FB_IDX; - // Gradually make the empty frame smaller to save bits. Make it half of - // its previous size because of the scaling factor restriction. - cpi->svc.empty_frame_width >>= 1; - cpi->svc.empty_frame_width = (cpi->svc.empty_frame_width + 1) & ~1; - if (cpi->svc.empty_frame_width < 16) - cpi->svc.empty_frame_width = 16; + if (cpi->svc.encode_intra_empty_frame != 0) + cpi->common.intra_only = 1; - cpi->svc.empty_frame_height >>= 1; - cpi->svc.empty_frame_height = (cpi->svc.empty_frame_height + 1) & ~1; - if (cpi->svc.empty_frame_height < 16) - cpi->svc.empty_frame_height = 16; - - width = cpi->svc.empty_frame_width; - height = cpi->svc.empty_frame_height; + width = SMALL_FRAME_WIDTH; + height = SMALL_FRAME_HEIGHT; } } } diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index e9645ce9f..5063d521f 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -57,8 +57,7 @@ typedef struct { NEED_TO_ENCODE }encode_empty_frame_state; struct lookahead_entry empty_frame; - int empty_frame_width; - int empty_frame_height; + int encode_intra_empty_frame; // Store scaled source frames to be used for temporal filter to generate // a alt ref frame. diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 862be4d38..35920313a 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -17,6 +17,7 @@ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_cost.h" diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c index f38f96d6c..1f6b083c4 100644 --- a/vp9/encoder/vp9_variance.c +++ b/vp9/encoder/vp9_variance.c @@ -9,6 +9,7 @@ */ #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" @@ -18,26 +19,6 @@ #include "vp9/encoder/vp9_variance.h" -void variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, unsigned int *sse, int *sum) { - int i, j; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - - a += a_stride; - b += b_stride; - } -} - // Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal // or vertical direction to produce the filtered output block. Used to implement // first-pass of 2-D separable filter. @@ -100,25 +81,6 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, } } -unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { - unsigned int i, sum = 0; - - for (i = 0; i < 256; ++i) { - sum += src_ptr[i] * src_ptr[i]; - } - - return sum; -} - -#define VAR(W, H) \ -unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} - #define SUBPIX_VAR(W, H) \ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, \ @@ -133,7 +95,7 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ + return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ } #define SUBPIX_AVG_VAR(W, H) \ @@ -152,178 +114,51 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ \ - return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ -} - -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum); + return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ } -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum); -} - -unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum); - return *sse; -} - -unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum); - return *sse; -} - -unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum); - return *sse; -} - -unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum); - return *sse; -} - -VAR(4, 4) SUBPIX_VAR(4, 4) SUBPIX_AVG_VAR(4, 4) -VAR(4, 8) SUBPIX_VAR(4, 8) SUBPIX_AVG_VAR(4, 8) -VAR(8, 4) SUBPIX_VAR(8, 4) SUBPIX_AVG_VAR(8, 4) -VAR(8, 8) SUBPIX_VAR(8, 8) SUBPIX_AVG_VAR(8, 8) -VAR(8, 16) SUBPIX_VAR(8, 16) SUBPIX_AVG_VAR(8, 16) -VAR(16, 8) SUBPIX_VAR(16, 8) SUBPIX_AVG_VAR(16, 8) -VAR(16, 16) SUBPIX_VAR(16, 16) SUBPIX_AVG_VAR(16, 16) -VAR(16, 32) SUBPIX_VAR(16, 32) SUBPIX_AVG_VAR(16, 32) -VAR(32, 16) SUBPIX_VAR(32, 16) SUBPIX_AVG_VAR(32, 16) -VAR(32, 32) SUBPIX_VAR(32, 32) SUBPIX_AVG_VAR(32, 32) -VAR(32, 64) SUBPIX_VAR(32, 64) SUBPIX_AVG_VAR(32, 64) -VAR(64, 32) SUBPIX_VAR(64, 32) SUBPIX_AVG_VAR(64, 32) -VAR(64, 64) SUBPIX_VAR(64, 64) SUBPIX_AVG_VAR(64, 64) -void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride) { - int i, j; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - const int tmp = pred[j] + ref[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} - #if CONFIG_VP9_HIGHBITDEPTH -void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint64_t *sse, - uint64_t *sum) { - int i, j; - - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - a += a_stride; - b += b_stride; - } -} - -void highbd_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, - int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; -} - -void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, - int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4); -} - -void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, - int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8); -} - static void highbd_var_filter_block2d_bil_first_pass( const uint8_t *src_ptr8, uint16_t *output_ptr, @@ -374,35 +209,6 @@ static void highbd_var_filter_block2d_bil_second_pass( } } -#define HIGHBD_VAR(W, H) \ -unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} - #define HIGHBD_SUBPIX_VAR(W, H) \ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, \ @@ -417,7 +223,7 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ dst_stride, sse); \ } \ \ @@ -434,7 +240,7 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, dst, dst_stride, sse); \ } \ \ @@ -451,7 +257,7 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, dst, dst_stride, sse); \ } @@ -471,10 +277,10 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ - return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ dst_stride, sse); \ } \ \ @@ -493,10 +299,10 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ - return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ } \ \ @@ -515,137 +321,49 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ - return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ } -#define HIGHBD_GET_VAR(S) \ -void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ - highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} - -#define HIGHBD_MSE(W, H) \ -unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} - -HIGHBD_GET_VAR(8) -HIGHBD_GET_VAR(16) - -HIGHBD_MSE(16, 16) -HIGHBD_MSE(16, 8) -HIGHBD_MSE(8, 16) -HIGHBD_MSE(8, 8) - -HIGHBD_VAR(4, 4) HIGHBD_SUBPIX_VAR(4, 4) HIGHBD_SUBPIX_AVG_VAR(4, 4) -HIGHBD_VAR(4, 8) HIGHBD_SUBPIX_VAR(4, 8) HIGHBD_SUBPIX_AVG_VAR(4, 8) -HIGHBD_VAR(8, 4) HIGHBD_SUBPIX_VAR(8, 4) HIGHBD_SUBPIX_AVG_VAR(8, 4) -HIGHBD_VAR(8, 8) HIGHBD_SUBPIX_VAR(8, 8) HIGHBD_SUBPIX_AVG_VAR(8, 8) -HIGHBD_VAR(8, 16) HIGHBD_SUBPIX_VAR(8, 16) HIGHBD_SUBPIX_AVG_VAR(8, 16) -HIGHBD_VAR(16, 8) HIGHBD_SUBPIX_VAR(16, 8) HIGHBD_SUBPIX_AVG_VAR(16, 8) -HIGHBD_VAR(16, 16) HIGHBD_SUBPIX_VAR(16, 16) HIGHBD_SUBPIX_AVG_VAR(16, 16) -HIGHBD_VAR(16, 32) HIGHBD_SUBPIX_VAR(16, 32) HIGHBD_SUBPIX_AVG_VAR(16, 32) -HIGHBD_VAR(32, 16) HIGHBD_SUBPIX_VAR(32, 16) HIGHBD_SUBPIX_AVG_VAR(32, 16) -HIGHBD_VAR(32, 32) HIGHBD_SUBPIX_VAR(32, 32) HIGHBD_SUBPIX_AVG_VAR(32, 32) -HIGHBD_VAR(32, 64) HIGHBD_SUBPIX_VAR(32, 64) HIGHBD_SUBPIX_AVG_VAR(32, 64) -HIGHBD_VAR(64, 32) HIGHBD_SUBPIX_VAR(64, 32) HIGHBD_SUBPIX_AVG_VAR(64, 32) -HIGHBD_VAR(64, 64) HIGHBD_SUBPIX_VAR(64, 64) HIGHBD_SUBPIX_AVG_VAR(64, 64) - -void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride) { - int i, j; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - const int tmp = pred[j] + ref[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index 53148f23c..8fc47a850 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -12,31 +12,64 @@ #define VP9_ENCODER_VP9_VARIANCE_H_ #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" #ifdef __cplusplus extern "C" { #endif -void variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, - unsigned int *sse, int *sum); +// TODO(johannkoenig): All functions which depend on +// [highbd_][8|10|12_]variance should be refactored or moved to vpx_dsp. +static void variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} #if CONFIG_VP9_HIGHBITDEPTH -void highbd_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum); - -void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum); - -void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum); +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, uint64_t *sse, uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} #endif typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, @@ -95,15 +128,6 @@ typedef struct vp9_variance_vtable { vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; -void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride); - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred, - int width, int height, - const uint8_t *ref, int ref_stride); -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/vp9/encoder/x86/vp9_highbd_variance_sse2.c index 4bc3e7e2d..29b7b2782 100644 --- a/vp9/encoder/x86/vp9_highbd_variance_sse2.c +++ b/vp9/encoder/x86/vp9_highbd_variance_sse2.c @@ -13,237 +13,6 @@ #include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -static void highbd_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - -static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - uint64_t sse_long = 0; - int64_t sum_long = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - sse_long += sse0; - sum_long += sum0; - } - } - *sum = ROUND_POWER_OF_TWO(sum_long, 2); - *sse = ROUND_POWER_OF_TWO(sse_long, 4); -} - -static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - uint64_t sse_long = 0; - int64_t sum_long = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - sse_long += sse0; - sum_long += sum0; - } - } - *sum = ROUND_POWER_OF_TWO(sum_long, 4); - *sse = ROUND_POWER_OF_TWO(sse_long, 8); -} - - -#define HIGH_GET_VAR(S) \ -void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ -} \ -\ -void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 2); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ -} \ -\ -void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 4); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ -} - -HIGH_GET_VAR(16); -HIGH_GET_VAR(8); - -#undef HIGH_GET_VAR - -#define VAR_FN(w, h, block_size, shift) \ -uint32_t vp9_highbd_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vp9_highbd_calc##block_size##x##block_size##var_sse2, \ - block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} \ -\ -uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} \ -\ -uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_12_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} - -VAR_FN(64, 64, 16, 12); -VAR_FN(64, 32, 16, 11); -VAR_FN(32, 64, 16, 11); -VAR_FN(32, 32, 16, 10); -VAR_FN(32, 16, 16, 9); -VAR_FN(16, 32, 16, 9); -VAR_FN(16, 16, 16, 8); -VAR_FN(16, 8, 8, 7); -VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); - -#undef VAR_FN - -unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_highbd_calc8x8var_sse2, 8); - return *sse; -} - -unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_highbd_calc8x8var_sse2, 8); - return *sse; -} - -unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_highbd_calc8x8var_sse2, 8); - return *sse; -} - #define DECL(w, opt) \ int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ ptrdiff_t src_stride, \ diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c index ff9f7cca3..8cd071de5 100644 --- a/vp9/encoder/x86/vp9_variance_avx2.c +++ b/vp9/encoder/x86/vp9_variance_avx2.c @@ -13,18 +13,6 @@ #include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - -void vp9_get16x16var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - -void vp9_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, @@ -42,81 +30,6 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int height, unsigned int *sseptr); -static void variance_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, - get_var_avx2 var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += 16) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(&src[src_stride * i + j], src_stride, - &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - - -unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_get16x16var_avx2, 16); - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse; -} - -unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 9); -} - -unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 10); -} - -unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 12); -} - -unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 11); -} - unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, int src_stride, int x_offset, diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index cacee7442..961efe34e 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -16,299 +16,6 @@ #include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef void (*variance_fn_t)(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); - -unsigned int vp9_get_mb_ss_sse2(const int16_t *src) { - __m128i vsum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 32; ++i) { - const __m128i v = _mm_loadu_si128((const __m128i *)src); - vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); - src += 8; - } - - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - return _mm_cvtsi128_si32(vsum); -} - -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) - -static void get4x4var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); - const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); - const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); - const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - // sum - __m128i vsum = _mm_add_epi16(diff0, diff1); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), - _mm_madd_epi16(diff1, diff1)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - *sse = _mm_cvtsi128_si32(vsum); -} - -void vp9_get8x8var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; - - for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + i * ref_stride)), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + (i + 1) * ref_stride)), zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - } - - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); -} - -void vp9_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; - - for (i = 0; i < 16; ++i) { - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); - - const __m128i src0 = _mm_unpacklo_epi8(s, zero); - const __m128i ref0 = _mm_unpacklo_epi8(r, zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpackhi_epi8(s, zero); - const __m128i ref1 = _mm_unpackhi_epi8(r, zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - - src += src_stride; - ref += ref_stride; - } - - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0) + - (int16_t)_mm_extract_epi16(vsum, 1); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); -} - - -static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, - variance_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - -unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 4); -} - -unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, - sse, &sum, get4x4var_sse2, 4); - return *sse - (((unsigned int)sum * sum) >> 5); -} - -unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, - sse, &sum, get4x4var_sse2, 4); - return *sse - (((unsigned int)sum * sum) >> 5); -} - -unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 6); -} - -unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, - sse, &sum, vp9_get8x8var_sse2, 8); - return *sse - (((unsigned int)sum * sum) >> 7); -} - -unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, - sse, &sum, vp9_get8x8var_sse2, 8); - return *sse - (((unsigned int)sum * sum) >> 7); -} - -unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 10); -} - -unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 9); -} - -unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 9); -} - -unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 12); -} - -unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 11); -} - -unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 11); -} - -unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - // The 2 unused parameters are place holders for PIC enabled build. #define DECL(w, opt) \ int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index f629d98b8..cbc04888b 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -69,6 +69,7 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-yes += common/vp9_scan.c VP9_COMMON_SRCS-yes += common/vp9_scan.h +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index bd0d18c17..5415215c2 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -102,13 +102,11 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c endif diff --git a/vpx_dsp/arm/variance_media.asm b/vpx_dsp/arm/variance_media.asm new file mode 100644 index 000000000..f7f9e14b0 --- /dev/null +++ b/vpx_dsp/arm/variance_media.asm @@ -0,0 +1,358 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vpx_variance16x16_media| + EXPORT |vpx_variance8x8_media| + EXPORT |vpx_mse16x16_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vpx_variance16x16_media| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + +loop16x16 + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r5, [r2, #0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r5, [r2, #4] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r5, [r2, #8] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r5, [r2, #12] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop16x16 + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vpx_variance8x8_media| PROC + + push {r4-r10, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #8 ; set loop counter to 8 (=block height) + mov r4, #0 ; initialize sum = 0 + mov r5, #0 ; initialize sse = 0 + +loop8x8 + ; 1st 4 pixels + ldr r6, [r0, #0x0] ; load 4 src pixels + ldr r7, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r6, r7 ; calculate difference + pld [r0, r1, lsl #1] + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r0, #0x4] ; load 4 src pixels + ldr r7, [r2, #0x4] ; load 4 ref pixels + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r6, r7 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 ; next row + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + bne loop8x8 + + ; return stuff + ldr r8, [sp, #32] ; get address of sse + mul r1, r4, r4 ; sum * sum + str r5, [r8] ; store sse + sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) + + pop {r4-r10, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +; +;note: Based on vpx_variance16x16_media. In this function, sum is never used. +; So, we can remove this part of calculation. + +|vpx_mse16x16_media| PROC + + push {r4-r9, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #16 ; set loop counter to 16 (=block height) + mov r4, #0 ; initialize sse = 0 + +loopmse + ; 1st 4 pixels + ldr r5, [r0, #0x0] ; load 4 src pixels + ldr r6, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r5, r6 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0x4] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r2, #0x4] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + ldr r5, [r0, #0x8] ; load 4 src pixels + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r6, [r2, #0x8] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0xc] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r6, [r2, #0xc] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + subs r12, r12, #1 ; next row + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + bne loopmse + + ; return stuff + ldr r1, [sp, #28] ; get address of sse + mov r0, r4 ; return sse + str r4, [r1] ; store sse + + pop {r4-r9, pc} + + ENDP + + END diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c new file mode 100644 index 000000000..1a9792e6c --- /dev/null +++ b/vpx_dsp/arm/variance_neon.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +// w * h must be less than 2048 or local variable v_sum may overflow. +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, uint32_t *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vpx_get8x8var_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); +} + +void vpx_get16x16var_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); +} + +unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 +} + +unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 +} + +unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 +} + +unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w8(a + (32 * a_stride), a_stride, + b + (32 * b_stride), b_stride, 32, 32, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, + b + (16 * b_stride), b_stride, 64, 16, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, + b + (16 * b_stride), b_stride, 64, 16, + &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, + b + (16 * 2 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 3 * a_stride), a_stride, + b + (16 * 3 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 +} + +unsigned int vpx_variance16x8_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vpx_variance8x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vpx_mse16x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q10s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} + +unsigned int vpx_get4x4sse_cs_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 9783e4363..c0c3ff996 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -33,6 +33,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, return sad; } +// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up. /* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred. * The function averages every corresponding element of the buffers and stores * the value in a third buffer, comp_pred. diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c new file mode 100644 index 000000000..084dd7b7e --- /dev/null +++ b/vpx_dsp/variance.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride) { + int distortion = 0; + int r, c; + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) { + int diff = a[c] - b[c]; + distortion += diff * diff; + } + + a += a_stride; + b += b_stride; + } + + return distortion; +} + +unsigned int vpx_get_mb_ss_c(const int16_t *a) { + unsigned int i, sum = 0; + + for (i = 0; i < 256; ++i) { + sum += a[i] * a[i]; + } + + return sum; +} + +static void variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#define VAR(W, H) \ +unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ +void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse, int *sum) { \ + variance(a, a_stride, b, b_stride, W, H, sse, sum); \ +} + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * variable. + */ +#define MSE(W, H) \ +unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ +} + +VAR(64, 64) +VAR(64, 32) +VAR(32, 64) +VAR(32, 32) +VAR(32, 16) +VAR(16, 32) +VAR(16, 16) +VAR(16, 8) +VAR(8, 16) +VAR(8, 8) +VAR(8, 4) +VAR(4, 8) +VAR(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, uint64_t *sse, uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} + +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ +unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +#define HIGHBD_GET_VAR(S) \ +void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +} \ +\ +void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +} \ +\ +void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +} + +#define HIGHBD_MSE(W, H) \ +unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} \ +\ +unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} \ +\ +unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) + +HIGHBD_VAR(64, 64) +HIGHBD_VAR(64, 32) +HIGHBD_VAR(32, 64) +HIGHBD_VAR(32, 32) +HIGHBD_VAR(32, 16) +HIGHBD_VAR(16, 32) +HIGHBD_VAR(16, 16) +HIGHBD_VAR(16, 8) +HIGHBD_VAR(8, 16) +HIGHBD_VAR(8, 8) +HIGHBD_VAR(8, 4) +HIGHBD_VAR(4, 8) +HIGHBD_VAR(4, 4) + +void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 606515d2c..f23534adc 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -17,6 +17,7 @@ DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c + DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm @@ -29,9 +30,28 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm + endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS +ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) +DSP_SRCS-yes += variance.c + +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c + +DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c +DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm +DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c + +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm +endif # CONFIG_VP9_HIGHBITDEPTH +endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC + DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) DSP_SRCS-yes += vpx_dsp_rtcd.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ebec9ec06..55271cf9c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -392,4 +392,212 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_ENCODERS +if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { + +add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance64x64 sse2 avx2 neon/; + +add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance64x32 sse2 avx2 neon/; + +add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x64 sse2 neon/; + +add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x32 sse2 avx2 neon/; + +add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x16 sse2 avx2/; + +add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x32 sse2/; + +add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/; + +add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x8 mmx sse2 neon/; + +add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x16 mmx sse2 neon/; + +add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x8 mmx sse2 media neon/; + +add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x4 sse2/; + +add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance4x8 sse2/; + +add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance4x4 mmx sse2/; + + +add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_get16x16var sse2 avx2 neon/; + +add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_get8x8var mmx sse2 neon/; + +add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/; + +add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse16x8 sse2/; + +add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse8x16 sse2/; + +add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse8x8 sse2/; + +add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; + specialize qw/vpx_get_mb_ss mmx sse2/; + +add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; + specialize qw/vpx_get4x4sse_cs neon/; + +add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance64x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance64x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance32x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance32x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance32x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance16x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance16x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance64x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance64x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance32x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance32x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance32x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance16x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance16x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance64x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance64x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance32x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance32x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance32x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance16x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance16x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse8x8 sse2/; + + add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; +} # CONFIG_VP9_HIGHBITDEPTH +} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC + 1; diff --git a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_variance_impl_sse2.asm index 821dd0660..923418a99 100644 --- a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm +++ b/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -11,7 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" -;unsigned int vp9_highbd_calc16x16var_sse2 +;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, ; int source_stride, @@ -20,8 +20,8 @@ ; unsigned int * SSE, ; int * Sum ;) -global sym(vp9_highbd_calc16x16var_sse2) PRIVATE -sym(vp9_highbd_calc16x16var_sse2): +global sym(vpx_highbd_calc16x16var_sse2) PRIVATE +sym(vpx_highbd_calc16x16var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -164,7 +164,7 @@ sym(vp9_highbd_calc16x16var_sse2): ret -;unsigned int vp9_highbd_calc8x8var_sse2 +;unsigned int vpx_highbd_calc8x8var_sse2 ;( ; unsigned char * src_ptr, ; int source_stride, @@ -173,8 +173,8 @@ sym(vp9_highbd_calc16x16var_sse2): ; unsigned int * SSE, ; int * Sum ;) -global sym(vp9_highbd_calc8x8var_sse2) PRIVATE -sym(vp9_highbd_calc8x8var_sse2): +global sym(vpx_highbd_calc8x8var_sse2) PRIVATE +sym(vpx_highbd_calc8x8var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 000000000..343c0478b --- /dev/null +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +#include "vp9/encoder/vp9_variance.h" +#include "vpx_ports/mem.h" + +typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = ROUND_POWER_OF_TWO(sse_long, 8); +} + + +#define HIGH_GET_VAR(S) \ +void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ +} \ +\ +void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ +} \ +\ +void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ +} + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ +uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, \ + block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} + +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); + +#undef VAR_FN + +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c new file mode 100644 index 000000000..82cef4af0 --- /dev/null +++ b/vpx_dsp/x86/variance_avx2.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_dsp_rtcd.h" + +typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +static void variance_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + get_var_avx2 var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += 16) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(&src[src_stride * i + j], src_stride, + &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + + +unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_get16x16var_avx2, 16); + return *sse - (((unsigned int)sum * sum) >> 8); +} + +unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse; +} + +unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 11); +} diff --git a/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c b/vpx_dsp/x86/variance_impl_avx2.c index ee76a315f..0e40959aa 100644 --- a/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c +++ b/vpx_dsp/x86/variance_impl_avx2.c @@ -10,9 +10,9 @@ #include <immintrin.h> // AVX2 -#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" -void vp9_get16x16var_avx2(const unsigned char *src_ptr, +void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, @@ -123,7 +123,7 @@ void vp9_get16x16var_avx2(const unsigned char *src_ptr, } } -void vp9_get32x32var_avx2(const unsigned char *src_ptr, +void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm new file mode 100644 index 000000000..a8d7d99db --- /dev/null +++ b/vpx_dsp/x86/variance_impl_mmx.asm @@ -0,0 +1,424 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;unsigned int vpx_get_mb_ss_mmx( short *src_ptr ) +global sym(vpx_get_mb_ss_mmx) PRIVATE +sym(vpx_get_mb_ss_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + sub rsp, 8 + ; end prolog + + mov rax, arg(0) ;src_ptr + mov rcx, 16 + pxor mm4, mm4 + +.NEXTROW: + movq mm0, [rax] + movq mm1, [rax+8] + movq mm2, [rax+16] + movq mm3, [rax+24] + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + pmaddwd mm3, mm3 + + paddd mm4, mm0 + paddd mm4, mm1 + paddd mm4, mm2 + paddd mm4, mm3 + + add rax, 32 + dec rcx + ja .NEXTROW + movq QWORD PTR [rsp], mm4 + + ;return sum[0]+sum[1]; + movsxd rax, dword ptr [rsp] + movsxd rcx, dword ptr [rsp+4] + add rax, rcx + + + ; begin epilog + add rsp, 8 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_get8x8var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vpx_get8x8var_mmx) PRIVATE +sym(vpx_get8x8var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm1, [rbx] ; Copy eight bytes to mm1 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + + ; Row 2 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 3 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 4 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 5 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + ; movq mm4, [rbx + rdx] + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 6 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 7 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 8 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Now accumulate the final results. + movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory + movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory + movsx rdx, WORD PTR [rsp+8] + movsx rcx, WORD PTR [rsp+10] + movsx rbx, WORD PTR [rsp+12] + movsx rax, WORD PTR [rsp+14] + add rdx, rcx + add rbx, rax + add rdx, rbx ;XSum + movsxd rax, DWORD PTR [rsp] + movsxd rcx, DWORD PTR [rsp+4] + add rax, rcx ;XXSum + mov rsi, arg(4) ;SSE + mov rdi, arg(5) ;Sum + mov dword ptr [rsi], eax + mov dword ptr [rdi], edx + xor rax, rax ; return 0 + + + ; begin epilog + add rsp, 16 + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + +;void +;vpx_get4x4var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vpx_get4x4var_mmx) PRIVATE +sym(vpx_get4x4var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movd mm0, [rax] ; Copy four bytes to mm0 + movd mm1, [rbx] ; Copy four bytes to mm1 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy four bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + + ; Row 2 + movd mm0, [rax] ; Copy four bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy four bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 3 + movd mm0, [rax] ; Copy four bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher precision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy four bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 4 + movd mm0, [rax] ; Copy four bytes to mm0 + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + paddd mm7, mm0 ; accumulate in mm7 + + + ; Now accumulate the final results. + movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory + movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory + movsx rdx, WORD PTR [rsp+8] + movsx rcx, WORD PTR [rsp+10] + movsx rbx, WORD PTR [rsp+12] + movsx rax, WORD PTR [rsp+14] + add rdx, rcx + add rbx, rax + add rdx, rbx ;XSum + movsxd rax, DWORD PTR [rsp] + movsxd rcx, DWORD PTR [rsp+4] + add rax, rcx ;XXSum + mov rsi, arg(4) ;SSE + mov rdi, arg(5) ;Sum + mov dword ptr [rsi], eax + mov dword ptr [rdi], edx + xor rax, rax ; return 0 + + + ; begin epilog + add rsp, 16 + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c new file mode 100644 index 000000000..99dd741bc --- /dev/null +++ b/vpx_dsp/x86/variance_mmx.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" + +extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg); + *sse = var; + return (var - (((unsigned int)avg * avg) >> 4)); +} + +unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg); + *sse = var; + + return (var - (((unsigned int)avg * avg) >> 6)); +} + +unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse2, &sum2); + vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride, + b + 8 * b_stride + 8, b_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + *sse = var; + return var; +} + +unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse2, &sum2); + vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride, + b + 8 * b_stride + 8, b_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + *sse = var; + return (var - (((unsigned int)avg * avg) >> 8)); +} + +unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + return (var - (((unsigned int)avg * avg) >> 7)); +} + +unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + + return (var - (((unsigned int)avg * avg) >> 7)); +} diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c new file mode 100644 index 000000000..6256bc536 --- /dev/null +++ b/vpx_dsp/x86/variance_sse2.c @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" + +typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = _mm_loadu_si128((const __m128i *)src); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src += 8; + } + + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return _mm_cvtsi128_si32(vsum); +} + +#define READ64(p, stride, i) \ + _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) + +static void get4x4var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); + const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); + const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); + const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + // sum + __m128i vsum = _mm_add_epi16(diff0, diff1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), + _mm_madd_epi16(diff1, diff1)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + *sse = _mm_cvtsi128_si32(vsum); +} + +void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; i += 2) { + const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + i * src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + i * ref_stride)), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + +void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 16; ++i) { + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + + src += src_stride; + ref += ref_stride; + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0) + + (int16_t)_mm_extract_epi16(vsum, 1); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + + +static void variance_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + getNxMvar_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 4); +} + +unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, + sse, &sum, get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); +} + +unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, + sse, &sum, get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 6); +} + +unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, + sse, &sum, vpx_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); +} + +unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, + sse, &sum, vpx_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); +} + +unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 8); +} + +unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} diff --git a/vpx_ports/msvc.h b/vpx_ports/msvc.h new file mode 100644 index 000000000..43a36e761 --- /dev/null +++ b/vpx_ports/msvc.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_PORTS_MSVC_H_ +#define VPX_PORTS_MSVC_H_ +#ifdef _MSC_VER + +#include "./vpx_config.h" + +# if _MSC_VER < 1900 // VS2015 provides snprintf +# define snprintf _snprintf +# endif // _MSC_VER < 1900 + +#endif // _MSC_VER +#endif // VPX_PORTS_MSVC_H_ diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk index dfc75ab6f..ab7fc4ac7 100644 --- a/vpx_ports/vpx_ports.mk +++ b/vpx_ports/vpx_ports.mk @@ -12,6 +12,7 @@ PORTS_SRCS-yes += vpx_ports.mk PORTS_SRCS-yes += mem.h +PORTS_SRCS-yes += msvc.h PORTS_SRCS-yes += vpx_timer.h ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) |