From 8fb6c58191251792765c2910af3f9d6da22d6c11 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Thu, 20 Jun 2013 09:34:25 -0700
Subject: Implement sse2 and ssse3 versions for all sub_pixel_variance sizes.

Overall speedup around 5% (bus @ 1500kbps first 50 frames 4min10 -> 3min58).

Specific changes to timings for each function compared to original
assembly-optimized versions (or just new version timings if no previous
assembly-optimized version was available):

sse2 4x4: 99 -> 82 cycles
sse2 4x8: 128 cycles
sse2 8x4: 121 cycles
sse2 8x8: 149 -> 129 cycles
sse2 8x16: 235 -> 245 cycles (?)
sse2 16x8: 269 -> 203 cycles
sse2 16x16: 441 -> 349 cycles
sse2 16x32: 641 cycles
sse2 32x16: 643 cycles
sse2 32x32: 1733 -> 1154 cycles
sse2 32x64: 2247 cycles
sse2 64x32: 2323 cycles
sse2 64x64: 6984 -> 4442 cycles
ssse3 4x4: 100 cycles (?)
ssse3 4x8: 103 cycles
ssse3 8x4: 71 cycles
ssse3 8x8: 147 cycles
ssse3 8x16: 158 cycles
ssse3 16x8: 188 -> 162 cycles
ssse3 16x16: 316 -> 273 cycles
ssse3 16x32: 535 cycles
ssse3 32x16: 564 cycles
ssse3 32x32: 973 cycles
ssse3 32x64: 1930 cycles
ssse3 64x32: 1922 cycles
ssse3 64x64: 3760 cycles

Change-Id: I81ff6fe51daf35a40d19785167004664d7e0c59d
---
 test/variance_test.cc | 367 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 327 insertions(+), 40 deletions(-)

(limited to 'test/variance_test.cc')

diff --git a/test/variance_test.cc b/test/variance_test.cc
index dfa1a07c7..e7037d9d6 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -26,12 +26,55 @@ extern "C" { # include "vp9_rtcd.h" #endif } +#include "test/acm_random.h" namespace { using ::std::tr1::get; using ::std::tr1::make_tuple; using ::std::tr1::tuple; +using libvpx_test::ACMRandom; + +static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + int 
diff = ref[w * y + x] - src[w * y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} + +static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, int xoff, int yoff, + unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + int diff = r - src[w * y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} template class VarianceTest : @@ -39,10 +82,13 @@ class VarianceTest : public: virtual void SetUp() { const tuple& params = this->GetParam(); - width_ = get<0>(params); - height_ = get<1>(params); + log2width_ = get<0>(params); + width_ = 1 << log2width_; + log2height_ = get<1>(params); + height_ = 1 << log2height_; variance_ = get<2>(params); + rnd(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; src_ = new uint8_t[block_size_]; ref_ = new uint8_t[block_size_]; @@ -58,15 +104,16 @@ class VarianceTest : protected: void ZeroTest(); + void RefTest(); void OneQuarterTest(); + ACMRandom rnd; uint8_t* src_; uint8_t* ref_; - int width_; - int height_; + int width_, log2width_; + int height_, log2height_; int block_size_; VarianceFunctionType variance_; - }; template @@ -82,6 +129,22 @@ void VarianceTest::ZeroTest() { } } +template +void VarianceTest::RefTest() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + ref_[j] = 
rnd.Rand8(); + } + unsigned int sse1, sse2; + const unsigned int var1 = variance_(src_, width_, ref_, width_, &sse1); + const unsigned int var2 = variance_ref(src_, ref_, log2width_, + log2height_, &sse2); + EXPECT_EQ(sse1, sse2); + EXPECT_EQ(var1, var2); + } +} + template void VarianceTest::OneQuarterTest() { memset(src_, 255, block_size_); @@ -94,6 +157,66 @@ void VarianceTest::OneQuarterTest() { EXPECT_EQ(expected, var); } +template +class SubpelVarianceTest : + public ::testing::TestWithParam > { + public: + virtual void SetUp() { + const tuple& params = + this->GetParam(); + log2width_ = get<0>(params); + width_ = 1 << log2width_; + log2height_ = get<1>(params); + height_ = 1 << log2height_; + subpel_variance_ = get<2>(params); + + rnd(ACMRandom::DeterministicSeed()); + block_size_ = width_ * height_; + src_ = new uint8_t[block_size_]; + ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(ref_ != NULL); + } + + virtual void TearDown() { + delete[] src_; + delete[] ref_; + } + + protected: + void RefTest(); + + ACMRandom rnd; + uint8_t* src_; + uint8_t* ref_; + int width_, log2width_; + int height_, log2height_; + int block_size_; + SubpelVarianceFunctionType subpel_variance_; +}; + +template +void SubpelVarianceTest::RefTest() { + for (int x = 0; x < 16; ++x) { + for (int y = 0; y < 16; ++y) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y, + src_, width_, &sse1); + const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_, + log2height_, x, y, &sse2); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + // ----------------------------------------------------------------------------- // VP8 test cases. 
@@ -103,6 +226,7 @@ namespace vp8 { typedef VarianceTest VP8VarianceTest; TEST_P(VP8VarianceTest, Zero) { ZeroTest(); } +TEST_P(VP8VarianceTest, Ref) { RefTest(); } TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); } const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c; @@ -112,11 +236,11 @@ const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c; const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c; INSTANTIATE_TEST_CASE_P( C, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_c), - make_tuple(8, 8, variance8x8_c), - make_tuple(8, 16, variance8x16_c), - make_tuple(16, 8, variance16x8_c), - make_tuple(16, 16, variance16x16_c))); + ::testing::Values(make_tuple(2, 2, variance4x4_c), + make_tuple(3, 3, variance8x8_c), + make_tuple(3, 4, variance8x16_c), + make_tuple(4, 3, variance16x8_c), + make_tuple(4, 4, variance16x16_c))); #if HAVE_MMX const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx; @@ -126,11 +250,11 @@ const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx; const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx; INSTANTIATE_TEST_CASE_P( MMX, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_mmx), - make_tuple(8, 8, variance8x8_mmx), - make_tuple(8, 16, variance8x16_mmx), - make_tuple(16, 8, variance16x8_mmx), - make_tuple(16, 16, variance16x16_mmx))); + ::testing::Values(make_tuple(2, 2, variance4x4_mmx), + make_tuple(3, 3, variance8x8_mmx), + make_tuple(3, 4, variance8x16_mmx), + make_tuple(4, 3, variance16x8_mmx), + make_tuple(4, 4, variance16x16_mmx))); #endif #if HAVE_SSE2 @@ -141,11 +265,11 @@ const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt; const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt; INSTANTIATE_TEST_CASE_P( SSE2, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_wmt), - make_tuple(8, 8, variance8x8_wmt), - make_tuple(8, 16, variance8x16_wmt), - make_tuple(16, 8, variance16x8_wmt), - make_tuple(16, 16, 
variance16x16_wmt))); + ::testing::Values(make_tuple(2, 2, variance4x4_wmt), + make_tuple(3, 3, variance8x8_wmt), + make_tuple(3, 4, variance8x16_wmt), + make_tuple(4, 3, variance16x8_wmt), + make_tuple(4, 4, variance16x16_wmt))); #endif #endif // CONFIG_VP8_ENCODER @@ -158,22 +282,83 @@ namespace vp9 { #if CONFIG_VP9_ENCODER typedef VarianceTest VP9VarianceTest; +typedef SubpelVarianceTest VP9SubpelVarianceTest; TEST_P(VP9VarianceTest, Zero) { ZeroTest(); } +TEST_P(VP9VarianceTest, Ref) { RefTest(); } +TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); } TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); } const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c; +const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c; +const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c; const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c; const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c; const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c; const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c; +const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c; +const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c; +const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c; +const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c; +const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c; +const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c; INSTANTIATE_TEST_CASE_P( C, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_c), - make_tuple(8, 8, variance8x8_c), - make_tuple(8, 16, variance8x16_c), - make_tuple(16, 8, variance16x8_c), - make_tuple(16, 16, variance16x16_c))); + ::testing::Values(make_tuple(2, 2, variance4x4_c), + make_tuple(2, 3, variance4x8_c), + make_tuple(3, 2, variance8x4_c), + make_tuple(3, 3, variance8x8_c), + make_tuple(3, 4, variance8x16_c), + make_tuple(4, 3, variance16x8_c), + make_tuple(4, 4, variance16x16_c), + make_tuple(4, 5, variance16x32_c), + make_tuple(5, 4, 
variance32x16_c), + make_tuple(5, 5, variance32x32_c), + make_tuple(5, 6, variance32x64_c), + make_tuple(6, 5, variance64x32_c), + make_tuple(6, 6, variance64x64_c))); + +const vp9_subpixvariance_fn_t subpel_variance4x4_c = + vp9_sub_pixel_variance4x4_c; +const vp9_subpixvariance_fn_t subpel_variance4x8_c = + vp9_sub_pixel_variance4x8_c; +const vp9_subpixvariance_fn_t subpel_variance8x4_c = + vp9_sub_pixel_variance8x4_c; +const vp9_subpixvariance_fn_t subpel_variance8x8_c = + vp9_sub_pixel_variance8x8_c; +const vp9_subpixvariance_fn_t subpel_variance8x16_c = + vp9_sub_pixel_variance8x16_c; +const vp9_subpixvariance_fn_t subpel_variance16x8_c = + vp9_sub_pixel_variance16x8_c; +const vp9_subpixvariance_fn_t subpel_variance16x16_c = + vp9_sub_pixel_variance16x16_c; +const vp9_subpixvariance_fn_t subpel_variance16x32_c = + vp9_sub_pixel_variance16x32_c; +const vp9_subpixvariance_fn_t subpel_variance32x16_c = + vp9_sub_pixel_variance32x16_c; +const vp9_subpixvariance_fn_t subpel_variance32x32_c = + vp9_sub_pixel_variance32x32_c; +const vp9_subpixvariance_fn_t subpel_variance32x64_c = + vp9_sub_pixel_variance32x64_c; +const vp9_subpixvariance_fn_t subpel_variance64x32_c = + vp9_sub_pixel_variance64x32_c; +const vp9_subpixvariance_fn_t subpel_variance64x64_c = + vp9_sub_pixel_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c), + make_tuple(2, 3, subpel_variance4x8_c), + make_tuple(3, 2, subpel_variance8x4_c), + make_tuple(3, 3, subpel_variance8x8_c), + make_tuple(3, 4, subpel_variance8x16_c), + make_tuple(4, 3, subpel_variance16x8_c), + make_tuple(4, 4, subpel_variance16x16_c), + make_tuple(4, 5, subpel_variance16x32_c), + make_tuple(5, 4, subpel_variance32x16_c), + make_tuple(5, 5, subpel_variance32x32_c), + make_tuple(5, 6, subpel_variance32x64_c), + make_tuple(6, 5, subpel_variance64x32_c), + make_tuple(6, 6, subpel_variance64x64_c))); #if HAVE_MMX const vp9_variance_fn_t variance4x4_mmx = 
vp9_variance4x4_mmx; @@ -183,26 +368,128 @@ const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx; const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx; INSTANTIATE_TEST_CASE_P( MMX, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_mmx), - make_tuple(8, 8, variance8x8_mmx), - make_tuple(8, 16, variance8x16_mmx), - make_tuple(16, 8, variance16x8_mmx), - make_tuple(16, 16, variance16x16_mmx))); + ::testing::Values(make_tuple(2, 2, variance4x4_mmx), + make_tuple(3, 3, variance8x8_mmx), + make_tuple(3, 4, variance8x16_mmx), + make_tuple(4, 3, variance16x8_mmx), + make_tuple(4, 4, variance16x16_mmx))); #endif #if HAVE_SSE2 -const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2; -const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2; -const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2; -const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_sse2; -const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2; +const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; +const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; +const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; +const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2; +const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2; +const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2; +const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2; +const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2; +const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2; +const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2; +const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2; +const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2; +const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2; INSTANTIATE_TEST_CASE_P( SSE2, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_wmt), - 
make_tuple(8, 8, variance8x8_wmt), - make_tuple(8, 16, variance8x16_wmt), - make_tuple(16, 8, variance16x8_wmt), - make_tuple(16, 16, variance16x16_wmt))); + ::testing::Values(make_tuple(2, 2, variance4x4_sse2), + make_tuple(2, 3, variance4x8_sse2), + make_tuple(3, 2, variance8x4_sse2), + make_tuple(3, 3, variance8x8_sse2), + make_tuple(3, 4, variance8x16_sse2), + make_tuple(4, 3, variance16x8_sse2), + make_tuple(4, 4, variance16x16_sse2), + make_tuple(4, 5, variance16x32_sse2), + make_tuple(5, 4, variance32x16_sse2), + make_tuple(5, 5, variance32x32_sse2), + make_tuple(5, 6, variance32x64_sse2), + make_tuple(6, 5, variance64x32_sse2), + make_tuple(6, 6, variance64x64_sse2))); + +const vp9_subpixvariance_fn_t subpel_variance4x4_sse = + vp9_sub_pixel_variance4x4_sse; +const vp9_subpixvariance_fn_t subpel_variance4x8_sse = + vp9_sub_pixel_variance4x8_sse; +const vp9_subpixvariance_fn_t subpel_variance8x4_sse2 = + vp9_sub_pixel_variance8x4_sse2; +const vp9_subpixvariance_fn_t subpel_variance8x8_sse2 = + vp9_sub_pixel_variance8x8_sse2; +const vp9_subpixvariance_fn_t subpel_variance8x16_sse2 = + vp9_sub_pixel_variance8x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x8_sse2 = + vp9_sub_pixel_variance16x8_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x16_sse2 = + vp9_sub_pixel_variance16x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x32_sse2 = + vp9_sub_pixel_variance16x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x16_sse2 = + vp9_sub_pixel_variance32x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x32_sse2 = + vp9_sub_pixel_variance32x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x64_sse2 = + vp9_sub_pixel_variance32x64_sse2; +const vp9_subpixvariance_fn_t subpel_variance64x32_sse2 = + vp9_sub_pixel_variance64x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 = + vp9_sub_pixel_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, 
subpel_variance4x4_sse), + make_tuple(2, 3, subpel_variance4x8_sse), + make_tuple(3, 2, subpel_variance8x4_sse2), + make_tuple(3, 3, subpel_variance8x8_sse2), + make_tuple(3, 4, subpel_variance8x16_sse2), + make_tuple(4, 3, subpel_variance16x8_sse2), + make_tuple(4, 4, subpel_variance16x16_sse2), + make_tuple(4, 5, subpel_variance16x32_sse2), + make_tuple(5, 4, subpel_variance32x16_sse2), + make_tuple(5, 5, subpel_variance32x32_sse2), + make_tuple(5, 6, subpel_variance32x64_sse2), + make_tuple(6, 5, subpel_variance64x32_sse2), + make_tuple(6, 6, subpel_variance64x64_sse2))); +#endif + +#if HAVE_SSSE3 +const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 = + vp9_sub_pixel_variance4x4_ssse3; +const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 = + vp9_sub_pixel_variance4x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x4_ssse3 = + vp9_sub_pixel_variance8x4_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x8_ssse3 = + vp9_sub_pixel_variance8x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x16_ssse3 = + vp9_sub_pixel_variance8x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x8_ssse3 = + vp9_sub_pixel_variance16x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x16_ssse3 = + vp9_sub_pixel_variance16x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x32_ssse3 = + vp9_sub_pixel_variance16x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x16_ssse3 = + vp9_sub_pixel_variance32x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x32_ssse3 = + vp9_sub_pixel_variance32x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x64_ssse3 = + vp9_sub_pixel_variance32x64_ssse3; +const vp9_subpixvariance_fn_t subpel_variance64x32_ssse3 = + vp9_sub_pixel_variance64x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 = + vp9_sub_pixel_variance64x64_ssse3; +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3), + make_tuple(2, 3, 
subpel_variance4x8_ssse3), + make_tuple(3, 2, subpel_variance8x4_ssse3), + make_tuple(3, 3, subpel_variance8x8_ssse3), + make_tuple(3, 4, subpel_variance8x16_ssse3), + make_tuple(4, 3, subpel_variance16x8_ssse3), + make_tuple(4, 4, subpel_variance16x16_ssse3), + make_tuple(4, 5, subpel_variance16x32_ssse3), + make_tuple(5, 4, subpel_variance32x16_ssse3), + make_tuple(5, 5, subpel_variance32x32_ssse3), + make_tuple(5, 6, subpel_variance32x64_ssse3), + make_tuple(6, 5, subpel_variance64x32_ssse3), + make_tuple(6, 6, subpel_variance64x64_ssse3))); #endif #endif // CONFIG_VP9_ENCODER -- cgit v1.2.3