diff options
55 files changed, 1515 insertions, 2302 deletions
diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c index 29b8be941..e69e2a9f9 100644 --- a/examples/decode_with_drops.c +++ b/examples/decode_with_drops.c @@ -92,8 +92,8 @@ int main(int argc, char **argv) { if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); - n = strtol(argv[3], &nptr, 0); - m = strtol(nptr + 1, NULL, 0); + n = (int)strtol(argv[3], &nptr, 0); + m = (int)strtol(nptr + 1, NULL, 0); is_range = (*nptr == '-'); if (!n || !m || (*nptr != '-' && *nptr != '/')) die("Couldn't parse pattern %s.\n", argv[3]); diff --git a/examples/set_maps.c b/examples/set_maps.c index d128e7d9a..c0c7d10e7 100644 --- a/examples/set_maps.c +++ b/examples/set_maps.c @@ -174,8 +174,8 @@ int main(int argc, char **argv) { } assert(encoder != NULL); info.codec_fourcc = encoder->fourcc; - info.frame_width = strtol(argv[2], NULL, 0); - info.frame_height = strtol(argv[3], NULL, 0); + info.frame_width = (int)strtol(argv[2], NULL, 0); + info.frame_height = (int)strtol(argv[3], NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c index 8632f179b..dde6344f8 100644 --- a/examples/simple_encoder.c +++ b/examples/simple_encoder.c @@ -175,14 +175,14 @@ int main(int argc, char **argv) { infile_arg = argv[4]; outfile_arg = argv[5]; keyframe_interval_arg = argv[6]; - max_frames = strtol(argv[8], NULL, 0); + max_frames = (int)strtol(argv[8], NULL, 0); encoder = get_vpx_encoder_by_name(codec_arg); if (!encoder) die("Unsupported codec."); info.codec_fourcc = encoder->fourcc; - info.frame_width = strtol(width_arg, NULL, 0); - info.frame_height = strtol(height_arg, NULL, 0); + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; @@ -196,7 +196,7 @@ int main(int argc, char **argv) { die("Failed to allocate image."); } - keyframe_interval = strtol(keyframe_interval_arg, NULL, 0); + keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); if (keyframe_interval < 0) die("Invalid keyframe interval value."); printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); @@ -209,7 +209,7 @@ int main(int argc, char **argv) { cfg.g_timebase.num = info.time_base.numerator; cfg.g_timebase.den = info.time_base.denominator; cfg.rc_target_bitrate = bitrate; - cfg.g_error_resilient = strtol(argv[7], NULL, 0); + cfg.g_error_resilient = (vpx_codec_er_flags_t)strtoul(argv[7], NULL, 0); writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info); if (!writer) die("Failed to open %s for writing.", outfile_arg); diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c index 4c130ec18..4e63a7a6c 100644 --- a/examples/twopass_encoder.c +++ b/examples/twopass_encoder.c @@ -209,13 +209,13 @@ int main(int argc, char **argv) { if (argc != 7) die("Invalid number of arguments."); - max_frames = strtol(argv[6], NULL, 0); + max_frames = (int)strtol(argv[6], NULL, 0); encoder = get_vpx_encoder_by_name(codec_arg); if (!encoder) die("Unsupported codec."); - w = strtol(width_arg, NULL, 0); - h = strtol(height_arg, NULL, 0); + w = (int)strtol(width_arg, NULL, 0); + h = (int)strtol(height_arg, NULL, 0); if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) die("Invalid frame size: %dx%d", w, h); diff --git a/examples/vp8cx_set_ref.c b/examples/vp8cx_set_ref.c index fc7bdab39..846477c61 100644 --- a/examples/vp8cx_set_ref.c +++ b/examples/vp8cx_set_ref.c @@ -122,8 +122,8 @@ int main(int argc, char **argv) { if (!update_frame_num) die("Couldn't parse frame number '%s'\n", argv[5]); info.codec_fourcc = encoder->fourcc; - info.frame_width = strtol(argv[1], NULL, 0); - info.frame_height = strtol(argv[2], NULL, 0); + info.frame_width = (int)strtol(argv[1], NULL, 0); + info.frame_height = (int)strtol(argv[2], NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; diff --git a/examples/vp9_lossless_encoder.c b/examples/vp9_lossless_encoder.c index 5802186bf..cb5ca6bfe 100644 --- a/examples/vp9_lossless_encoder.c +++ b/examples/vp9_lossless_encoder.c @@ -78,8 +78,8 @@ int main(int argc, char **argv) { if (!encoder) die("Unsupported codec."); info.codec_fourcc = encoder->fourcc; - info.frame_width = strtol(argv[1], NULL, 0); - info.frame_height = strtol(argv[2], NULL, 0); + info.frame_width = (int)strtol(argv[1], NULL, 0); + info.frame_height = (int)strtol(argv[2], NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; diff --git a/examples/vp9cx_set_ref.c b/examples/vp9cx_set_ref.c index e0bb795f7..798d7e3f2 100644 --- a/examples/vp9cx_set_ref.c +++ b/examples/vp9cx_set_ref.c @@ -335,8 +335,8 @@ int main(int argc, char **argv) { } info.codec_fourcc = encoder->fourcc; - info.frame_width = strtol(width_arg, NULL, 0); - info.frame_height = strtol(height_arg, NULL, 0); + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index 4a3387787..309a2fe2e 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -547,13 +547,13 @@ int main(int argc, char **argv) { printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); - width = strtol(argv[4], NULL, 0); - height = strtol(argv[5], NULL, 0); + width = (unsigned int)strtoul(argv[4], NULL, 0); + height = (unsigned int)strtoul(argv[5], NULL, 0); if (width < 16 || width % 2 || height < 16 || height % 2) { die("Invalid resolution: %d x %d", width, height); } - layering_mode = strtol(argv[10], NULL, 0); + layering_mode = (int)strtol(argv[10], NULL, 0); if (layering_mode < 0 || layering_mode > 13) { die("Invalid layering mode (0..12) %s", argv[10]); } @@ -609,17 +609,17 @@ int main(int argc, char **argv) { #endif // CONFIG_VP9_HIGHBITDEPTH // Timebase format e.g. 30fps: numerator=1, demoninator = 30. - cfg.g_timebase.num = strtol(argv[6], NULL, 0); - cfg.g_timebase.den = strtol(argv[7], NULL, 0); + cfg.g_timebase.num = (int)strtol(argv[6], NULL, 0); + cfg.g_timebase.den = (int)strtol(argv[7], NULL, 0); - speed = strtol(argv[8], NULL, 0); + speed = (int)strtol(argv[8], NULL, 0); if (speed < 0) { die("Invalid speed setting: must be positive"); } for (i = min_args_base; (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) { - rc.layer_target_bitrate[i - 11] = strtol(argv[i], NULL, 0); + rc.layer_target_bitrate[i - 11] = (int)strtol(argv[i], NULL, 0); if (strncmp(encoder->name, "vp8", 3) == 0) cfg.ts_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11]; else if (strncmp(encoder->name, "vp9", 3) == 0) @@ -627,7 +627,7 @@ int main(int argc, char **argv) { } // Real time parameters. - cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0); + cfg.rc_dropframe_thresh = (unsigned int)strtoul(argv[9], NULL, 0); cfg.rc_end_usage = VPX_CBR; cfg.rc_min_quantizer = 2; cfg.rc_max_quantizer = 56; diff --git a/test/idct_test.cc b/test/idct_test.cc index f54f2c005..700da77e3 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -115,6 +115,10 @@ TEST_P(IDCTTest, TestWithData) { } INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c)); +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_neon)); +#endif #if HAVE_MMX INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_mmx)); diff --git a/test/predict_test.cc b/test/predict_test.cc new file mode 100644 index 000000000..f06e4dbb2 --- /dev/null +++ b/test/predict_test.cc @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <string.h> + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp8_rtcd.h" +#include "./vpx_config.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +namespace { + +using libvpx_test::ACMRandom; +using std::tr1::make_tuple; + +typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch); + +typedef std::tr1::tuple<int, int, PredictFunc> PredictParam; + +class PredictTestBase : public ::testing::TestWithParam<PredictParam> { + public: + PredictTestBase() + : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)), + src_(NULL), padded_dst_(NULL), dst_(NULL), dst_c_(NULL) {} + + virtual void SetUp() { + src_ = new uint8_t[kSrcSize]; + ASSERT_TRUE(src_ != NULL); + + // padded_dst_ provides a buffer of kBorderSize around the destination + // memory to facilitate detecting out of bounds writes. + dst_stride_ = kBorderSize + width_ + kBorderSize; + padded_dst_size_ = dst_stride_ * (kBorderSize + height_ + kBorderSize); + padded_dst_ = + reinterpret_cast<uint8_t *>(vpx_memalign(16, padded_dst_size_)); + ASSERT_TRUE(padded_dst_ != NULL); + dst_ = padded_dst_ + (kBorderSize * dst_stride_) + kBorderSize; + + dst_c_ = new uint8_t[16 * 16]; + ASSERT_TRUE(dst_c_ != NULL); + + memset(src_, 0, kSrcSize); + memset(padded_dst_, 128, padded_dst_size_); + memset(dst_c_, 0, 16 * 16); + } + + virtual void TearDown() { + delete[] src_; + src_ = NULL; + vpx_free(padded_dst_); + padded_dst_ = NULL; + dst_ = NULL; + delete[] dst_c_; + dst_c_ = NULL; + libvpx_test::ClearSystemState(); + } + + protected: + // Make reference arrays big enough for 16x16 functions. Six-tap filters need + // 5 extra pixels outside of the macroblock. + static const int kSrcStride = 21; + static const int kSrcSize = kSrcStride * kSrcStride; + static const int kBorderSize = 16; + + int width_; + int height_; + PredictFunc predict_; + uint8_t *src_; + uint8_t *padded_dst_; + uint8_t *dst_; + int padded_dst_size_; + uint8_t *dst_c_; + int dst_stride_; + + bool CompareBuffers(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride) const { + for (int height = 0; height < height_; ++height) { + EXPECT_EQ(0, memcmp(a + height * a_stride, b + height * b_stride, + sizeof(*a) * width_)) + << "Row " << height << " does not match."; + } + + return !HasFailure(); + } + + // Given a block of memory 'a' with size 'a_size', determine if all regions + // excepting block 'b' described by 'b_stride', 'b_height', and 'b_width' + // match pixel value 'c'. + bool CheckBorder(const uint8_t *a, int a_size, const uint8_t *b, int b_width, + int b_height, int b_stride, uint8_t c) const { + const uint8_t *a_end = a + a_size; + const int b_size = (b_stride * b_height) + b_width; + const uint8_t *b_end = b + b_size; + const int left_border = (b_stride - b_width) / 2; + const int right_border = left_border + ((b_stride - b_width) % 2); + + EXPECT_GE(b - left_border, a) << "'b' does not start within 'a'"; + EXPECT_LE(b_end + right_border, a_end) << "'b' does not end within 'a'"; + + // Top border. + for (int pixel = 0; pixel < b - a - left_border; ++pixel) { + EXPECT_EQ(c, a[pixel]) << "Mismatch at " << pixel << " in top border."; + } + + // Left border. + for (int height = 0; height < b_height; ++height) { + for (int width = left_border; width > 0; --width) { + EXPECT_EQ(c, b[height * b_stride - width]) + << "Mismatch at row " << height << " column " << left_border - width + << " in left border."; + } + } + + // Right border. + for (int height = 0; height < b_height; ++height) { + for (int width = b_width; width < b_width + right_border; ++width) { + EXPECT_EQ(c, b[height * b_stride + width]) + << "Mismatch at row " << height << " column " << width - b_width + << " in right border."; + } + } + + // Bottom border. + for (int pixel = static_cast<int>(b - a + b_size); pixel < a_size; + ++pixel) { + EXPECT_EQ(c, a[pixel]) << "Mismatch at " << pixel << " in bottom border."; + } + + return !HasFailure(); + } + + void TestWithRandomData(PredictFunc reference) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + // Run tests for almost all possible offsets. + for (int xoffset = 0; xoffset < 8; ++xoffset) { + for (int yoffset = 0; yoffset < 8; ++yoffset) { + if (xoffset == 0 && yoffset == 0) { + // This represents a copy which is not required to be handled by this + // module. + continue; + } + + for (int i = 0; i < kSrcSize; ++i) { + src_[i] = rnd.Rand8(); + } + reference(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, + dst_c_, 16); + + ASM_REGISTER_STATE_CHECK(predict_(&src_[kSrcStride * 2 + 2], kSrcStride, + xoffset, yoffset, dst_, dst_stride_)); + + ASSERT_TRUE(CompareBuffers(dst_c_, 16, dst_, dst_stride_)); + ASSERT_TRUE(CheckBorder(padded_dst_, padded_dst_size_, dst_, width_, + height_, dst_stride_, 128)); + } + } + } + + void TestWithUnalignedDst(PredictFunc reference) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + // Only the 4x4 need to be able to handle unaligned writes. + if (width_ == 4 && height_ == 4) { + for (int xoffset = 0; xoffset < 8; ++xoffset) { + for (int yoffset = 0; yoffset < 8; ++yoffset) { + if (xoffset == 0 && yoffset == 0) { + continue; + } + for (int i = 0; i < kSrcSize; ++i) { + src_[i] = rnd.Rand8(); + } + reference(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, + dst_c_, 16); + + for (int i = 1; i < 4; ++i) { + memset(padded_dst_, 128, padded_dst_size_); + + ASM_REGISTER_STATE_CHECK(predict_(&src_[kSrcStride * 2 + 2], + kSrcStride, xoffset, yoffset, + dst_ + i, dst_stride_ + i)); + + ASSERT_TRUE(CompareBuffers(dst_c_, 16, dst_ + i, dst_stride_ + i)); + ASSERT_TRUE(CheckBorder(padded_dst_, padded_dst_size_, dst_ + i, + width_, height_, dst_stride_ + i, 128)); + } + } + } + } + } +}; + +class SixtapPredictTest : public PredictTestBase {}; + +TEST_P(SixtapPredictTest, TestWithRandomData) { + TestWithRandomData(vp8_sixtap_predict16x16_c); +} +TEST_P(SixtapPredictTest, TestWithUnalignedDst) { + TestWithUnalignedDst(vp8_sixtap_predict16x16_c); +} + +TEST_P(SixtapPredictTest, TestWithPresetData) { + // Test input + static const uint8_t kTestData[kSrcSize] = { + 184, 4, 191, 82, 92, 41, 0, 1, 226, 236, 172, 20, 182, 42, 226, + 177, 79, 94, 77, 179, 203, 206, 198, 22, 192, 19, 75, 17, 192, 44, + 233, 120, 48, 168, 203, 141, 210, 203, 143, 180, 184, 59, 201, 110, 102, + 171, 32, 182, 10, 109, 105, 213, 60, 47, 236, 253, 67, 55, 14, 3, + 99, 247, 124, 148, 159, 71, 34, 114, 19, 177, 38, 203, 237, 239, 58, + 83, 155, 91, 10, 166, 201, 115, 124, 5, 163, 104, 2, 231, 160, 16, + 234, 4, 8, 103, 153, 167, 174, 187, 26, 193, 109, 64, 141, 90, 48, + 200, 174, 204, 36, 184, 114, 237, 43, 238, 242, 207, 86, 245, 182, 247, + 6, 161, 251, 14, 8, 148, 182, 182, 79, 208, 120, 188, 17, 6, 23, + 65, 206, 197, 13, 242, 126, 128, 224, 170, 110, 211, 121, 197, 200, 47, + 188, 207, 208, 184, 221, 216, 76, 148, 143, 156, 100, 8, 89, 117, 14, + 112, 183, 221, 54, 197, 208, 180, 69, 176, 94, 180, 131, 215, 121, 76, + 7, 54, 28, 216, 238, 249, 176, 58, 142, 64, 215, 242, 72, 49, 104, + 87, 161, 32, 52, 216, 230, 4, 141, 44, 181, 235, 224, 57, 195, 89, + 134, 203, 144, 162, 163, 126, 156, 84, 185, 42, 148, 145, 29, 221, 194, + 134, 52, 100, 166, 105, 60, 140, 110, 201, 184, 35, 181, 153, 93, 121, + 243, 227, 68, 131, 134, 232, 2, 35, 60, 187, 77, 209, 76, 106, 174, + 15, 241, 227, 115, 151, 77, 175, 36, 187, 121, 221, 223, 47, 118, 61, + 168, 105, 32, 237, 236, 167, 213, 238, 202, 17, 170, 24, 226, 247, 131, + 145, 6, 116, 117, 121, 11, 194, 41, 48, 126, 162, 13, 93, 209, 131, + 154, 122, 237, 187, 103, 217, 99, 60, 200, 45, 78, 115, 69, 49, 106, + 200, 194, 112, 60, 56, 234, 72, 251, 19, 120, 121, 182, 134, 215, 135, + 10, 114, 2, 247, 46, 105, 209, 145, 165, 153, 191, 243, 12, 5, 36, + 119, 206, 231, 231, 11, 32, 209, 83, 27, 229, 204, 149, 155, 83, 109, + 35, 93, 223, 37, 84, 14, 142, 37, 160, 52, 191, 96, 40, 204, 101, + 77, 67, 52, 53, 43, 63, 85, 253, 147, 113, 226, 96, 6, 125, 179, + 115, 161, 17, 83, 198, 101, 98, 85, 139, 3, 137, 75, 99, 178, 23, + 201, 255, 91, 253, 52, 134, 60, 138, 131, 208, 251, 101, 48, 2, 227, + 228, 118, 132, 245, 202, 75, 91, 44, 160, 231, 47, 41, 50, 147, 220, + 74, 92, 219, 165, 89, 16 + }; + + // Expected results for xoffset = 2 and yoffset = 2. + static const int kExpectedDstStride = 16; + static const uint8_t kExpectedDst[256] = { + 117, 102, 74, 135, 42, 98, 175, 206, 70, 73, 222, 197, 50, 24, 39, + 49, 38, 105, 90, 47, 169, 40, 171, 215, 200, 73, 109, 141, 53, 85, + 177, 164, 79, 208, 124, 89, 212, 18, 81, 145, 151, 164, 217, 153, 91, + 154, 102, 102, 159, 75, 164, 152, 136, 51, 213, 219, 186, 116, 193, 224, + 186, 36, 231, 208, 84, 211, 155, 167, 35, 59, 42, 76, 216, 149, 73, + 201, 78, 149, 184, 100, 96, 196, 189, 198, 188, 235, 195, 117, 129, 120, + 129, 49, 25, 133, 113, 69, 221, 114, 70, 143, 99, 157, 108, 189, 140, + 78, 6, 55, 65, 240, 255, 245, 184, 72, 90, 100, 116, 131, 39, 60, + 234, 167, 33, 160, 88, 185, 200, 157, 159, 176, 127, 151, 138, 102, 168, + 106, 170, 86, 82, 219, 189, 76, 33, 115, 197, 106, 96, 198, 136, 97, + 141, 237, 151, 98, 137, 191, 185, 2, 57, 95, 142, 91, 255, 185, 97, + 137, 76, 162, 94, 173, 131, 193, 161, 81, 106, 72, 135, 222, 234, 137, + 66, 137, 106, 243, 210, 147, 95, 15, 137, 110, 85, 66, 16, 96, 167, + 147, 150, 173, 203, 140, 118, 196, 84, 147, 160, 19, 95, 101, 123, 74, + 132, 202, 82, 166, 12, 131, 166, 189, 170, 159, 85, 79, 66, 57, 152, + 132, 203, 194, 0, 1, 56, 146, 180, 224, 156, 28, 83, 181, 79, 76, + 80, 46, 160, 175, 59, 106, 43, 87, 75, 136, 85, 189, 46, 71, 200, + 90 + }; + + ASM_REGISTER_STATE_CHECK( + predict_(const_cast<uint8_t *>(kTestData) + kSrcStride * 2 + 2, + kSrcStride, 2, 2, dst_, dst_stride_)); + + ASSERT_TRUE( + CompareBuffers(kExpectedDst, kExpectedDstStride, dst_, dst_stride_)); +} + +INSTANTIATE_TEST_CASE_P( + C, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c), + make_tuple(8, 8, &vp8_sixtap_predict8x8_c), + make_tuple(8, 4, &vp8_sixtap_predict8x4_c), + make_tuple(4, 4, &vp8_sixtap_predict4x4_c))); +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), + make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), + make_tuple(8, 4, &vp8_sixtap_predict8x4_neon), + make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); +#endif +#if HAVE_MMX +INSTANTIATE_TEST_CASE_P( + MMX, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx), + make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx), + make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx), + make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx))); +#endif +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2), + make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2), + make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2))); +#endif +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3), + make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3), + make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3), + make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3))); +#endif +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P( + MSA, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa), + make_tuple(8, 8, &vp8_sixtap_predict8x8_msa), + make_tuple(8, 4, &vp8_sixtap_predict8x4_msa), + make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); +#endif + +class BilinearPredictTest : public PredictTestBase {}; + +TEST_P(BilinearPredictTest, TestWithRandomData) { + TestWithRandomData(vp8_bilinear_predict16x16_c); +} +TEST_P(BilinearPredictTest, TestWithUnalignedDst) { + TestWithUnalignedDst(vp8_bilinear_predict16x16_c); +} + +INSTANTIATE_TEST_CASE_P( + C, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_c), + make_tuple(8, 8, &vp8_bilinear_predict8x8_c), + make_tuple(8, 4, &vp8_bilinear_predict8x4_c), + make_tuple(4, 4, &vp8_bilinear_predict4x4_c))); +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_neon), + make_tuple(8, 8, &vp8_bilinear_predict8x8_neon), + make_tuple(8, 4, &vp8_bilinear_predict8x4_neon), + make_tuple(4, 4, &vp8_bilinear_predict4x4_neon))); +#endif +#if HAVE_MMX +INSTANTIATE_TEST_CASE_P( + MMX, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_mmx), + make_tuple(8, 8, &vp8_bilinear_predict8x8_mmx), + make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx), + make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx))); +#endif +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2), + make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2))); +#endif +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_ssse3), + make_tuple(8, 8, &vp8_bilinear_predict8x8_ssse3))); +#endif +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P( + MSA, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_msa), + make_tuple(8, 8, &vp8_bilinear_predict8x8_msa), + make_tuple(8, 4, &vp8_bilinear_predict8x4_msa), + make_tuple(4, 4, &vp8_bilinear_predict4x4_msa))); +#endif +} // namespace diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc deleted file mode 100644 index 31a604417..000000000 --- a/test/sixtap_predict_test.cc +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <math.h> -#include <stdlib.h> -#include <string.h> - -#include "third_party/googletest/src/include/gtest/gtest.h" - -#include "./vpx_config.h" -#include "./vp8_rtcd.h" -#include "test/acm_random.h" -#include "test/clear_system_state.h" -#include "test/register_state_check.h" -#include "test/util.h" -#include "vpx/vpx_integer.h" -#include "vpx_mem/vpx_mem.h" - -namespace { - -typedef void (*SixtapPredictFunc)(uint8_t *src_ptr, int src_pixels_per_line, - int xoffset, int yoffset, uint8_t *dst_ptr, - int dst_pitch); - -typedef std::tr1::tuple<int, int, SixtapPredictFunc> SixtapPredictParam; - -class SixtapPredictTest : public ::testing::TestWithParam<SixtapPredictParam> { - public: - static void SetUpTestCase() { - src_ = reinterpret_cast<uint8_t *>(vpx_memalign(kDataAlignment, kSrcSize)); - dst_ = reinterpret_cast<uint8_t *>(vpx_memalign(kDataAlignment, kDstSize)); - dst_c_ = - reinterpret_cast<uint8_t *>(vpx_memalign(kDataAlignment, kDstSize)); - } - - static void TearDownTestCase() { - vpx_free(src_); - src_ = NULL; - vpx_free(dst_); - dst_ = NULL; - vpx_free(dst_c_); - dst_c_ = NULL; - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - // Make test arrays big enough for 16x16 functions. Six-tap filters - // need 5 extra pixels outside of the macroblock. - static const int kSrcStride = 21; - static const int kDstStride = 16; - static const int kDataAlignment = 16; - static const int kSrcSize = kSrcStride * kSrcStride + 1; - static const int kDstSize = kDstStride * kDstStride; - - virtual void SetUp() { - width_ = GET_PARAM(0); - height_ = GET_PARAM(1); - sixtap_predict_ = GET_PARAM(2); - memset(src_, 0, kSrcSize); - memset(dst_, 0, kDstSize); - memset(dst_c_, 0, kDstSize); - } - - int width_; - int height_; - SixtapPredictFunc sixtap_predict_; - // The src stores the macroblock we will filter on, and makes it 1 byte larger - // in order to test unaligned access. The result is stored in dst and dst_c(c - // reference code result). - static uint8_t *src_; - static uint8_t *dst_; - static uint8_t *dst_c_; -}; - -uint8_t *SixtapPredictTest::src_ = NULL; -uint8_t *SixtapPredictTest::dst_ = NULL; -uint8_t *SixtapPredictTest::dst_c_ = NULL; - -TEST_P(SixtapPredictTest, TestWithPresetData) { - // Test input - static const uint8_t test_data[kSrcSize] = { - 216, 184, 4, 191, 82, 92, 41, 0, 1, 226, 236, 172, 20, 182, 42, - 226, 177, 79, 94, 77, 179, 203, 206, 198, 22, 192, 19, 75, 17, 192, - 44, 233, 120, 48, 168, 203, 141, 210, 203, 143, 180, 184, 59, 201, 110, - 102, 171, 32, 182, 10, 109, 105, 213, 60, 47, 236, 253, 67, 55, 14, - 3, 99, 247, 124, 148, 159, 71, 34, 114, 19, 177, 38, 203, 237, 239, - 58, 83, 155, 91, 10, 166, 201, 115, 124, 5, 163, 104, 2, 231, 160, - 16, 234, 4, 8, 103, 153, 167, 174, 187, 26, 193, 109, 64, 141, 90, - 48, 200, 174, 204, 36, 184, 114, 237, 43, 238, 242, 207, 86, 245, 182, - 247, 6, 161, 251, 14, 8, 148, 182, 182, 79, 208, 120, 188, 17, 6, - 23, 65, 206, 197, 13, 242, 126, 128, 224, 170, 110, 211, 121, 197, 200, - 47, 188, 207, 208, 184, 221, 216, 76, 148, 143, 156, 100, 8, 89, 117, - 14, 112, 183, 221, 54, 197, 208, 180, 69, 176, 94, 180, 131, 215, 121, - 76, 7, 54, 28, 216, 238, 249, 176, 58, 142, 64, 215, 242, 72, 49, - 104, 87, 161, 32, 52, 216, 230, 4, 141, 44, 181, 235, 224, 57, 195, - 89, 134, 203, 144, 162, 163, 126, 156, 84, 185, 42, 148, 145, 29, 221, - 194, 134, 52, 100, 166, 105, 60, 140, 110, 201, 184, 35, 181, 153, 93, - 121, 243, 227, 68, 131, 134, 232, 2, 35, 60, 187, 77, 209, 76, 106, - 174, 15, 241, 227, 115, 151, 77, 175, 36, 187, 121, 221, 223, 47, 118, - 61, 168, 105, 32, 237, 236, 167, 213, 238, 202, 17, 170, 24, 226, 247, - 131, 145, 6, 116, 117, 121, 11, 194, 41, 48, 126, 162, 13, 93, 209, - 131, 154, 122, 237, 187, 103, 217, 99, 60, 200, 45, 78, 115, 69, 49, - 106, 200, 194, 112, 60, 56, 234, 72, 251, 19, 120, 121, 182, 134, 215, - 135, 10, 114, 2, 247, 46, 105, 209, 145, 165, 153, 191, 243, 12, 5, - 36, 119, 206, 231, 231, 11, 32, 209, 83, 27, 229, 204, 149, 155, 83, - 109, 35, 93, 223, 37, 84, 14, 142, 37, 160, 52, 191, 96, 40, 204, - 101, 77, 67, 52, 53, 43, 63, 85, 253, 147, 113, 226, 96, 6, 125, - 179, 115, 161, 17, 83, 198, 101, 98, 85, 139, 3, 137, 75, 99, 178, - 23, 201, 255, 91, 253, 52, 134, 60, 138, 131, 208, 251, 101, 48, 2, - 227, 228, 118, 132, 245, 202, 75, 91, 44, 160, 231, 47, 41, 50, 147, - 220, 74, 92, 219, 165, 89, 16 - }; - - // Expected result - static const uint8_t expected_dst[kDstSize] = { - 117, 102, 74, 135, 42, 98, 175, 206, 70, 73, 222, 197, 50, 24, 39, - 49, 38, 105, 90, 47, 169, 40, 171, 215, 200, 73, 109, 141, 53, 85, - 177, 164, 79, 208, 124, 89, 212, 18, 81, 145, 151, 164, 217, 153, 91, - 154, 102, 102, 159, 75, 164, 152, 136, 51, 213, 219, 186, 116, 193, 224, - 186, 36, 231, 208, 84, 211, 155, 167, 35, 59, 42, 76, 216, 149, 73, - 201, 78, 149, 184, 100, 96, 196, 189, 198, 188, 235, 195, 117, 129, 120, - 129, 49, 25, 133, 113, 69, 221, 114, 70, 143, 99, 157, 108, 189, 140, - 78, 6, 55, 65, 240, 255, 245, 184, 72, 90, 100, 116, 131, 39, 60, - 234, 167, 33, 160, 88, 185, 200, 157, 159, 176, 127, 151, 138, 102, 168, - 106, 170, 86, 82, 219, 189, 76, 33, 115, 197, 106, 96, 198, 136, 97, - 141, 237, 151, 98, 137, 191, 185, 2, 57, 95, 142, 91, 255, 185, 97, - 137, 76, 162, 94, 173, 131, 193, 161, 81, 106, 72, 135, 222, 234, 137, - 66, 137, 106, 243, 210, 147, 95, 15, 137, 110, 85, 66, 16, 96, 167, - 147, 150, 173, 203, 140, 118, 196, 84, 147, 160, 19, 95, 101, 123, 74, - 132, 202, 82, 166, 12, 131, 166, 189, 170, 159, 85, 79, 66, 57, 152, - 132, 203, 194, 0, 1, 56, 146, 180, 224, 156, 28, 83, 181, 79, 76, - 80, 46, 160, 175, 59, 106, 43, 87, 75, 136, 85, 189, 46, 71, 200, - 90 - }; - - uint8_t *src = const_cast<uint8_t *>(test_data); - - ASM_REGISTER_STATE_CHECK(sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], - kSrcStride, 2, 2, dst_, kDstStride)); - - for (int i = 0; i < height_; ++i) { - for (int j = 0; j < width_; ++j) - ASSERT_EQ(expected_dst[i * kDstStride + j], dst_[i * kDstStride + j]) - << "i==" << (i * width_ + j); - } -} - -using libvpx_test::ACMRandom; - -TEST_P(SixtapPredictTest, TestWithRandomData) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - for (int i = 0; i < kSrcSize; ++i) src_[i] = rnd.Rand8(); - - // Run tests for all possible offsets. - for (int xoffset = 0; xoffset < 8; ++xoffset) { - for (int yoffset = 0; yoffset < 8; ++yoffset) { - // Call c reference function. - // Move start point to next pixel to test if the function reads - // unaligned data correctly. - vp8_sixtap_predict16x16_c(&src_[kSrcStride * 2 + 2 + 1], kSrcStride, - xoffset, yoffset, dst_c_, kDstStride); - - // Run test. - ASM_REGISTER_STATE_CHECK(sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], - kSrcStride, xoffset, yoffset, - dst_, kDstStride)); - - for (int i = 0; i < height_; ++i) { - for (int j = 0; j < width_; ++j) - ASSERT_EQ(dst_c_[i * kDstStride + j], dst_[i * kDstStride + j]) - << "i==" << (i * width_ + j); - } - } - } -} - -using std::tr1::make_tuple; - -INSTANTIATE_TEST_CASE_P( - C, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c), - make_tuple(8, 8, &vp8_sixtap_predict8x8_c), - make_tuple(8, 4, &vp8_sixtap_predict8x4_c), - make_tuple(4, 4, &vp8_sixtap_predict4x4_c))); -#if HAVE_NEON -INSTANTIATE_TEST_CASE_P( - NEON, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), - make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), - make_tuple(8, 4, &vp8_sixtap_predict8x4_neon))); -#endif -#if HAVE_MMX -INSTANTIATE_TEST_CASE_P( - MMX, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx), - make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx), - make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx), - make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx))); -#endif -#if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( - SSE2, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2), - make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2), - make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2))); -#endif -#if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( - SSSE3, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3), - make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3), - make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3), - make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3))); -#endif -#if HAVE_MSA -INSTANTIATE_TEST_CASE_P( - MSA, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa), - make_tuple(8, 8, &vp8_sixtap_predict8x8_msa), - make_tuple(8, 4, &vp8_sixtap_predict8x4_msa), - make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); -#endif -} // namespace diff --git a/test/test.mk b/test/test.mk index aad264531..60218a780 100644 --- a/test/test.mk +++ b/test/test.mk @@ -119,7 +119,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-yes += idct_test.cc -LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc +LIBVPX_TEST_SRCS-yes += predict_test.cc LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes) diff --git a/vp8/common/arm/neon/dequant_idct_neon.c b/vp8/common/arm/neon/dequant_idct_neon.c index ff5981eaa..753051c77 100644 --- a/vp8/common/arm/neon/dequant_idct_neon.c +++ b/vp8/common/arm/neon/dequant_idct_neon.c @@ -11,7 +11,11 @@ #include <arm_neon.h> static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 35468; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. +static const int16_t sinpi8sqrt2 = 35468 >> 1; void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, int stride) { @@ -60,10 +64,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); - q3 = vshrq_n_s16(q3, 1); q4 = vshrq_n_s16(q4, 1); - q3 = vqaddq_s16(q3, q2); q4 = vqaddq_s16(q4, q2); d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); @@ -90,10 +92,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); - q3 = vshrq_n_s16(q3, 1); q4 = vshrq_n_s16(q4, 1); - q3 = vqaddq_s16(q3, q2); q4 = vqaddq_s16(q4, q2); d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.c b/vp8/common/arm/neon/shortidct4x4llm_neon.c index a36c0c1ca..1adb1c317 100644 --- a/vp8/common/arm/neon/shortidct4x4llm_neon.c +++ b/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -11,7 +11,11 @@ #include <arm_neon.h> static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 35468; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. +static const int16_t sinpi8sqrt2 = 35468 >> 1; void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, @@ -40,10 +44,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 - q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); - q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 @@ -71,10 +73,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 - q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); - q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index 622baa3c5..fbb552ebe 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -9,6 +9,8 @@ */ #include <arm_neon.h> +#include <string.h> +#include "./vpx_config.h" #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { @@ -22,6 +24,396 @@ static const int8_t vp8_sub_pel_filters[8][8] = { { 0, -1, 12, 123, -6, 0, 0, 0 }, }; +// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters. +// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive. +// Elements 1 and 4 are either 0 or negative. The code accounts for this with +// multiply/accumulates which either add or subtract as needed. The other +// functions will be updated to use this table later. +// It is also expanded to 8 elements to allow loading into 64 bit neon +// registers. +static const uint8_t abs_filters[8][8] = { + { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 }, + { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 }, + { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 }, + { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 }, +}; + +static INLINE uint8x8_t load_and_shift(const unsigned char *a) { + return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); +} + +static INLINE void store4x4(unsigned char *dst, int dst_stride, + const uint8x8_t a0, const uint8x8_t a1) { + if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1); + } else { + // Store to the aligned local buffer and memcpy instead of vget_lane_u8 + // which is really really slow. + uint32_t output_buffer[4]; + vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0); + vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1); + vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0); + vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1); + + memcpy(dst, output_buffer, 4); + dst += dst_stride; + memcpy(dst, output_buffer + 1, 4); + dst += dst_stride; + memcpy(dst, output_buffer + 2, 4); + dst += dst_stride; + memcpy(dst, output_buffer + 3, 4); + } +} + +static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void yonly4x4(const unsigned char *src, int src_stride, + int filter_offset, unsigned char *dst, + int dst_stride) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + + const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]); + const uint8x8_t filter0 = vdup_lane_u8(filter, 0); + const uint8x8_t filter1 = vdup_lane_u8(filter, 1); + const uint8x8_t filter2 = vdup_lane_u8(filter, 2); + const uint8x8_t filter3 = vdup_lane_u8(filter, 3); + const uint8x8_t filter4 = vdup_lane_u8(filter, 4); + const uint8x8_t filter5 = vdup_lane_u8(filter, 5); + + src -= src_stride * 2; + // Shift the even rows to allow using 'vext' to combine the vectors. armv8 + // has vcopy_lane which would be interesting. This started as just a + // horrible workaround for clang adding alignment hints to 32bit loads: + // https://llvm.org/bugs/show_bug.cgi?id=24421 + // But it turns out it almost identical to casting the loads. + a0 = load_and_shift(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = load_and_shift(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + a4 = load_and_shift(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = load_and_shift(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + a8 = vld1_u8(src); + + // Combine the rows so we can operate on 8 at a time. + b0 = vext_u8(a0, a1, 4); + b2 = vext_u8(a2, a3, 4); + b4 = vext_u8(a4, a5, 4); + b6 = vext_u8(a6, a7, 4); + b8 = a8; + + // To keep with the 8-at-a-time theme, combine *alternate* rows. This + // allows combining the odd rows with the even. + b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + // Multiply and expand to 16 bits. + c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + // Multiply, subtract and accumulate for filters 1 and 4 (the negative + // ones). + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + // Add more positive ones. vmlal should really return a signed type. + // It's doing signed math internally, as evidenced by the fact we can do + // subtractions followed by more additions. Ideally we could use + // vqmlal/sl but that instruction doesn't exist. Might be able to + // shoehorn vqdmlal/vqdmlsl in here but it would take some effort. + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + // Use signed saturation math because vmlsl may have left some negative + // numbers in there. + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + // Use signed again because numbers like -200 need to be saturated to 0. + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store4x4(dst, dst_stride, e0, e1); +} + +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + uint8x16_t s0, s1, s2, s3, s4; + uint64x2_t s01, s23; + // Variables to hold src[] elements for the given filter[] + uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5; + uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4; + uint8x16_t s01_f0, s23_f0; + uint64x2_t s01_f3, s23_f3; + uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q; + // Accumulator variables. + uint16x8_t d0123, d4567, d89; + uint16x8_t d0123_a, d4567_a, d89_a; + int16x8_t e0123, e4567, e89; + // Second pass intermediates. + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5; + + if (xoffset == 0) { // Second pass only. + yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch); + return; + } + + if (yoffset == 0) { // First pass only. + src_ptr -= 2; + } else { // Add context for the second pass. 2 extra lines on top. + src_ptr -= 2 + (src_pixels_per_line * 2); + } + + filter = vld1_u8(abs_filters[xoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of + // garbage. So much effort for that last single bit. + // The low values of each pair are for filter0. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + // Shift to extract values for filter[5] + // If src[] is 0, this puts: + // 3 4 5 6 7 8 9 10 in s0_f5 + // Can't use vshr.u64 because it crosses the double word boundary. + s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + + // Keep original src data as 64 bits to simplify shifting and extracting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + // 3 4 5 6 * filter0 + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + + // Shift over one to use -1, 0, 1, 2 for filter1 + // -1 0 1 2 * filter1 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + + // 2 3 4 5 * filter4 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + + // 0 1 2 3 * filter2 + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + + // 1 2 3 4 * filter3 + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + // Accumulate into different registers so it can use saturated addition. + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + + // Shift and narrow. + b0 = vqrshrun_n_s16(e0123, 7); + b2 = vqrshrun_n_s16(e4567, 7); + + if (yoffset == 0) { // firstpass_filter4x4_only + store4x4(dst_ptr, dst_pitch, b0, b2); + return; + } + + // Load additional context when doing both filters. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s4 = vld1q_u8(src_ptr); + + s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5); + + // 3 4 5 6 * filter0 + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + // But this time instead of 16 pixels to filter, there are 20. So an extra + // run with a doubleword register. + d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + d89 = vmull_u8(s4_f5, filter5); + + // Save a copy as u64 for shifting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + d89 = vmlal_u8(d89, vget_low_u8(s4), filter0); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1); + d89 = vmlsl_u8(d89, s4_f1, filter1); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4); + d89 = vmlsl_u8(d89, s4_f4, filter4); + + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2); + d89 = vmlal_u8(d89, s4_f2, filter2); + + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3); + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + d89_a = vmull_u8(s4_f3, filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a)); + + b4 = vqrshrun_n_s16(e0123, 7); + b6 = vqrshrun_n_s16(e4567, 7); + b8 = vqrshrun_n_s16(e89, 7); + + // Second pass: 4x4 + filter = vld1_u8(abs_filters[yoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store4x4(dst_ptr, dst_pitch, e0, e1); +} + void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch) { diff --git a/vp8/common/filter.c b/vp8/common/filter.c index a312efb6c..267498335 100644 --- a/vp8/common/filter.c +++ b/vp8/common/filter.c @@ -8,8 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "filter.h" +#include <assert.h> #include "./vp8_rtcd.h" +#include "vp8/common/filter.h" DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, @@ -324,27 +325,11 @@ void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, const short *HFilter; const short *VFilter; + // This represents a copy and is not required to be handled by optimizations. + assert((xoffset | yoffset) != 0); + HFilter = vp8_bilinear_filters[xoffset]; VFilter = vp8_bilinear_filters[yoffset]; -#if 0 - { - int i; - unsigned char temp1[16]; - unsigned char temp2[16]; - - bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); - filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); - - for (i = 0; i < 16; ++i) - { - if (temp1[i] != temp2[i]) - { - bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); - filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); - } - } - } -#endif filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); } @@ -355,6 +340,8 @@ void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, const short *HFilter; const short *VFilter; + assert((xoffset | yoffset) != 0); + HFilter = vp8_bilinear_filters[xoffset]; VFilter = vp8_bilinear_filters[yoffset]; @@ -368,6 +355,8 @@ void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, const short *HFilter; const short *VFilter; + assert((xoffset | yoffset) != 0); + HFilter = vp8_bilinear_filters[xoffset]; VFilter = vp8_bilinear_filters[yoffset]; @@ -382,6 +371,8 @@ void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, const short *HFilter; const short *VFilter; + assert((xoffset | yoffset) != 0); + HFilter = vp8_bilinear_filters[xoffset]; VFilter = vp8_bilinear_filters[yoffset]; diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index eb68246b2..43e3c29b5 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -251,7 +251,7 @@ int vp8_receive_raw_frame(struct VP8_COMP *comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); int vp8_get_compressed_data(struct VP8_COMP *comp, unsigned int *frame_flags, - unsigned long *size, unsigned char *dest, + size_t *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); int vp8_get_preview_raw_frame(struct VP8_COMP *comp, YV12_BUFFER_CONFIG *dest, diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index 732656f2f..9a12c7fb6 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -160,10 +160,6 @@ typedef struct VP8Common { #ifdef PACKET_TESTING VP8_HEADER oh; #endif -#if CONFIG_POSTPROC_VISUALIZER - double bitrate; - double framerate; -#endif #if CONFIG_MULTITHREAD int processor_core_count; diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index 1c4e042c8..8b8c1701a 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -37,46 +37,6 @@ (0.071 * (float)(t & 0xff)) + 128) /* clang-format on */ -/* global constants */ -#if CONFIG_POSTPROC_VISUALIZER -static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = { - { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */ - { RGB_TO_YUV(0x00FF00) }, /* Green */ - { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */ - { RGB_TO_YUV(0x228B22) }, /* ForestGreen */ - { RGB_TO_YUV(0x006400) }, /* DarkGreen */ - { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */ - { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */ - { RGB_TO_YUV(0x00008B) }, /* Dark blue */ - { RGB_TO_YUV(0x551A8B) }, /* Purple */ - { RGB_TO_YUV(0xFF0000) } /* Red */ -}; - -static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = { - { RGB_TO_YUV(0x6633ff) }, /* Purple */ - { RGB_TO_YUV(0xcc33ff) }, /* Magenta */ - { RGB_TO_YUV(0xff33cc) }, /* Pink */ - { RGB_TO_YUV(0xff3366) }, /* Coral */ - { RGB_TO_YUV(0x3366ff) }, /* Blue */ - { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */ - { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */ - { RGB_TO_YUV(0xff6633) }, /* Orange */ - { RGB_TO_YUV(0x33ccff) }, /* Light Blue */ - { RGB_TO_YUV(0x8ab800) }, /* Green */ - { RGB_TO_YUV(0xffcc33) }, /* Light Orange */ - { RGB_TO_YUV(0x33ffcc) }, /* Aqua */ - { RGB_TO_YUV(0x66ff33) }, /* Light Green */ - { RGB_TO_YUV(0xccff33) }, /* Yellow */ -}; - -static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = { - { RGB_TO_YUV(0x00ff00) }, /* Blue */ - { RGB_TO_YUV(0x0000ff) }, /* Green */ - { RGB_TO_YUV(0xffff00) }, /* Yellow */ - { RGB_TO_YUV(0xff0000) }, /* Red */ -}; -#endif - extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch); extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, @@ -308,43 +268,6 @@ void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, } } -#if CONFIG_POSTPROC_VISUALIZER -static void constrain_line(int x_0, int *x_1, int y_0, int *y_1, int width, - int height) { - int dx; - int dy; - - if (*x_1 > width) { - dx = *x_1 - x_0; - dy = *y_1 - y_0; - - *x_1 = width; - if (dx) *y_1 = ((width - x_0) * dy) / dx + y_0; - } - if (*x_1 < 0) { - dx = *x_1 - x_0; - dy = *y_1 - y_0; - - *x_1 = 0; - if (dx) *y_1 = ((0 - x_0) * dy) / dx + y_0; - } - if (*y_1 > height) { - dx = *x_1 - x_0; - dy = *y_1 - y_0; - - *y_1 = height; - if (dy) *x_1 = ((height - y_0) * dx) / dy + x_0; - } - if (*y_1 < 0) { - dx = *x_1 - x_0; - dy = *y_1 - y_0; - - *y_1 = 0; - if (dy) *x_1 = ((0 - y_0) * dx) / dy + x_0; - } -} -#endif // CONFIG_POSTPROC_VISUALIZER - #if CONFIG_POSTPROC int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags) { @@ -455,331 +378,6 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, oci->post_proc_buffer.y_stride); } -#if CONFIG_POSTPROC_VISUALIZER - if (flags & VP8D_DEBUG_TXT_FRAME_INFO) { - char message[512]; - sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d", - (oci->frame_type == KEY_FRAME), oci->refresh_golden_frame, - oci->base_qindex, oci->filter_level, flags, oci->mb_cols, - oci->mb_rows); - vp8_blit_text(message, oci->post_proc_buffer.y_buffer, - oci->post_proc_buffer.y_stride); - } - - if (flags & VP8D_DEBUG_TXT_MBLK_MODES) { - int i, j; - unsigned char *y_ptr; - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = oci->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp8_filter each macro block */ - for (i = 0; i < mb_rows; ++i) { - for (j = 0; j < mb_cols; ++j) { - char zz[4]; - - sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a'); - - vp8_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - } - } - - if (flags & VP8D_DEBUG_TXT_DC_DIFF) { - int i, j; - unsigned char *y_ptr; - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = oci->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp8_filter each macro block */ - for (i = 0; i < mb_rows; ++i) { - for (j = 0; j < mb_cols; ++j) { - char zz[4]; - int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED && - mi[mb_index].mbmi.mode != SPLITMV && - mi[mb_index].mbmi.mb_skip_coeff); - - if (oci->frame_type == KEY_FRAME) - sprintf(zz, "a"); - else - sprintf(zz, "%c", dc_diff + '0'); - - vp8_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - } - } - - if (flags & VP8D_DEBUG_TXT_RATE_INFO) { - char message[512]; - sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, - oci->framerate); - vp8_blit_text(message, oci->post_proc_buffer.y_buffer, - oci->post_proc_buffer.y_stride); - } - - /* Draw motion vectors */ - if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) { - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - unsigned char *y_buffer = oci->post_proc_buffer.y_buffer; - int y_stride = oci->post_proc_buffer.y_stride; - MODE_INFO *mi = oci->mi; - int x0, y0; - - for (y0 = 0; y0 < height; y0 += 16) { - for (x0 = 0; x0 < width; x0 += 16) { - int x1, y1; - - if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) { - mi++; - continue; - } - - if (mi->mbmi.mode == SPLITMV) { - switch (mi->mbmi.partitioning) { - case 0: /* mv_top_bottom */ - { - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height); - vp8_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height); - vp8_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride); - - break; - } - case 1: /* mv_left_right */ - { - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height); - vp8_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height); - vp8_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride); - - break; - } - case 2: /* mv_quarters */ - { - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height); - vp8_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height); - vp8_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height); - vp8_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride); - - bmi = &mi->bmi[10]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height); - vp8_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride); - break; - } - default: { - union b_mode_info *bmi = mi->bmi; - int bx0, by0; - - for (by0 = y0; by0 < (y0 + 16); by0 += 4) { - for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) { - MV *mv = &bmi->mv.as_mv; - - x1 = bx0 + 2 + (mv->col >> 3); - y1 = by0 + 2 + (mv->row >> 3); - - constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height); - vp8_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride); - - bmi++; - } - } - } - } - } else if (mi->mbmi.mode >= NEARESTMV) { - MV *mv = &mi->mbmi.mv.as_mv; - const int lx0 = x0 + 8; - const int ly0 = y0 + 8; - - x1 = lx0 + (mv->col >> 3); - y1 = ly0 + (mv->row >> 3); - - if (x1 != lx0 && y1 != ly0) { - constrain_line(lx0, &x1, ly0 - 1, &y1, width, height); - vp8_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride); - - constrain_line(lx0, &x1, ly0 + 1, &y1, width, height); - vp8_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride); - } else - vp8_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride); - } - - mi++; - } - mi++; - } - } - - /* Color in block modes */ - if ((flags & VP8D_DEBUG_CLR_BLK_MODES) && - (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) { - int y, x; - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - unsigned char *y_ptr = oci->post_proc_buffer.y_buffer; - unsigned char *u_ptr = oci->post_proc_buffer.u_buffer; - unsigned char *v_ptr = oci->post_proc_buffer.v_buffer; - int y_stride = oci->post_proc_buffer.y_stride; - MODE_INFO *mi = oci->mi; - - for (y = 0; y < height; y += 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (mi->mbmi.mode == B_PRED && - ((ppflags->display_mb_modes_flag & B_PRED) || - ppflags->display_b_modes_flag)) { - int by, bx; - unsigned char *yl, *ul, *vl; - union b_mode_info *bmi = mi->bmi; - - yl = y_ptr + x; - ul = u_ptr + (x >> 1); - vl = v_ptr + (x >> 1); - - for (by = 0; by < 16; by += 4) { - for (bx = 0; bx < 16; bx += 4) { - if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) || - (ppflags->display_mb_modes_flag & B_PRED)) { - Y = B_PREDICTION_MODE_colors[bmi->as_mode][0]; - U = B_PREDICTION_MODE_colors[bmi->as_mode][1]; - V = B_PREDICTION_MODE_colors[bmi->as_mode][2]; - - vp8_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V, - 0xc000, y_stride); - } - bmi++; - } - - yl += y_stride * 4; - ul += y_stride * 1; - vl += y_stride * 1; - } - } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) { - Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0]; - U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1]; - V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2]; - - vp8_blend_mb_inner(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), Y, - U, V, 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } - - /* Color in frame reference blocks */ - if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && - ppflags->display_ref_frame_flag) { - int y, x; - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - unsigned char *y_ptr = oci->post_proc_buffer.y_buffer; - unsigned char *u_ptr = oci->post_proc_buffer.u_buffer; - unsigned char *v_ptr = oci->post_proc_buffer.v_buffer; - int y_stride = oci->post_proc_buffer.y_stride; - MODE_INFO *mi = oci->mi; - - for (y = 0; y < height; y += 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) { - Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; - U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; - V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; - - vp8_blend_mb_outer(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), Y, - U, V, 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } -#endif - *dest = oci->post_proc_buffer; /* handle problem with extending borders */ diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 8dc36f731..5d8e4a78d 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -173,10 +173,8 @@ add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, in specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/; $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2; -# TODO(johannkoenig): Add neon implementation -# https://bugs.chromium.org/p/webm/issues/detail?id=1273 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict4x4 mmx ssse3 dspr2 msa/; +specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/; $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2; add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; diff --git a/vp8/common/textblit.c b/vp8/common/textblit.c deleted file mode 100644 index e7c15c4e4..000000000 --- a/vp8/common/textblit.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <stdlib.h> - -void vp8_blit_text(const char *msg, unsigned char *address, const int pitch) { - int letter_bitmap; - unsigned char *output_pos = address; - int colpos; - const int font[] = { - 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, - 0x18000, 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, - 0x80000, 0x111110, 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, - 0x4D6B7, 0x456AA, 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, - 0x8A880, 0x52940, 0x22A20, 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, - 0x8C62E, 0xE8C63F, 0x118D6BF, 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, - 0xF8C628, 0x8A89F, 0x108421F, 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, - 0x164C62E, 0x12694BF, 0x8AD6A2, 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, - 0x1151151, 0x117041, 0x119D731, 0x47E0, 0x1041041, 0xFC400, 0x10440, - 0x1084210, 0x820 - }; - colpos = 0; - - while (msg[colpos] != 0) { - char letter = msg[colpos]; - int fontcol, fontrow; - - if (letter <= 'Z' && letter >= ' ') - letter_bitmap = font[letter - ' ']; - else if (letter <= 'z' && letter >= 'a') - letter_bitmap = font[letter - 'a' + 'A' - ' ']; - else - letter_bitmap = font[0]; - - for (fontcol = 6; fontcol >= 0; fontcol--) - for (fontrow = 0; fontrow < 5; ++fontrow) - output_pos[fontrow * pitch + fontcol] = - ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0); - - output_pos += 7; - colpos++; - } -} - -static void plot(const int x, const int y, unsigned char *image, - const int pitch) { - image[x + y * pitch] ^= 255; -} - -/* Bresenham line algorithm */ -void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, - const int pitch) { - int steep = abs(y1 - y0) > abs(x1 - x0); - int deltax, deltay; - int error, ystep, y, x; - - if (steep) { - int t; - t = x0; - x0 = y0; - y0 = t; - - t = x1; - x1 = y1; - y1 = t; - } - - if (x0 > x1) { - int t; - t = x0; - x0 = x1; - x1 = t; - - t = y0; - y0 = y1; - y1 = t; - } - - deltax = x1 - x0; - deltay = abs(y1 - y0); - error = deltax / 2; - - y = y0; - - if (y0 < y1) - ystep = 1; - else - ystep = -1; - - if (steep) { - for (x = x0; x <= x1; ++x) { - plot(y, x, image, pitch); - - error = error - deltay; - if (error < 0) { - y = y + ystep; - error = error + deltax; - } - } - } else { - for (x = x0; x <= x1; ++x) { - plot(x, y, image, pitch); - - error = error - deltay; - if (error < 0) { - y = y + ystep; - error = error + deltax; - } - } - } -} diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm index 69f8d103c..ca00583ca 100644 --- a/vp8/common/x86/subpixel_sse2.asm +++ b/vp8/common/x86/subpixel_sse2.asm @@ -181,8 +181,12 @@ sym(vp8_filter_block1d16_h6_sse2): movq xmm3, MMWORD PTR [rsi - 2] movq xmm1, MMWORD PTR [rsi + 6] - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 + ; Load from 11 to avoid reading out of bounds. + movq xmm2, MMWORD PTR [rsi +11] + ; The lower bits are not cleared before 'or'ing with xmm1, + ; but that is OK because the values in the overlapping positions + ; are already equal to the ones in xmm1. + pslldq xmm2, 5 por xmm2, xmm1 prefetcht2 [rsi+rax-2] diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm index c06f24556..1f6cbd1d1 100644 --- a/vp8/common/x86/subpixel_ssse3.asm +++ b/vp8/common/x86/subpixel_ssse3.asm @@ -1291,6 +1291,8 @@ sym(vp8_bilinear_predict8x8_ssse3): movq xmm7, XMMWORD PTR [rsp+96] punpcklbw xmm5, xmm6 + ; Because the source register (xmm0) is always treated as signed by + ; pmaddubsw, the constant '128' is treated as '-128'. pmaddubsw xmm1, xmm0 pmaddubsw xmm2, xmm0 @@ -1319,6 +1321,10 @@ sym(vp8_bilinear_predict8x8_ssse3): psraw xmm5, VP8_FILTER_SHIFT psraw xmm6, VP8_FILTER_SHIFT + + ; Having multiplied everything by '-128' and obtained negative + ; numbers, the unsigned saturation truncates those values to 0, + ; resulting in incorrect handling of xoffset == 0 && yoffset == 0 packuswb xmm1, xmm1 packuswb xmm2, xmm2 diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 06c2f624f..1b100cfe8 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -1056,7 +1056,7 @@ static void put_delta_q(vp8_writer *bc, int delta_q) { } void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, - unsigned char *dest_end, unsigned long *size) { + unsigned char *dest_end, size_t *size) { int i, j; VP8_HEADER oh; VP8_COMMON *const pc = &cpi->common; @@ -1347,7 +1347,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos; - cpi->partition_sz[0] = *size; + cpi->partition_sz[0] = (unsigned int)*size; #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index f61cfbe90..6ebf233ed 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2746,7 +2746,7 @@ static int decide_key_frame(VP8_COMP *cpi) { return code_key_frame; } -static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, +static void Pass1Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, unsigned int *frame_flags) { (void)size; (void)dest; @@ -3185,7 +3185,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { vp8_yv12_extend_frame_borders(cm->frame_to_show); } -static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, +static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, unsigned char *dest, unsigned char *dest_end, unsigned int *frame_flags) { @@ -4384,7 +4384,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, /* Update rate control heuristics */ cpi->total_byte_count += (*size); - cpi->projected_frame_size = (*size) << 3; + cpi->projected_frame_size = (int)(*size) << 3; if (cpi->oxcf.number_of_layers > 1) { unsigned int i; @@ -4711,7 +4711,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ } #if !CONFIG_REALTIME_ONLY -static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, +static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, unsigned char *dest_end, unsigned int *frame_flags) { if (!cpi->common.refresh_alt_ref_frame) vp8_second_pass(cpi); @@ -4764,7 +4764,7 @@ static int frame_is_reference(const VP8_COMP *cpi) { } int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, - unsigned long *size, unsigned char *dest, + size_t *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush) { VP8_COMMON *cm; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 32080eff7..59ad5773a 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -687,7 +687,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate); void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm); void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, - unsigned char *dest_end, unsigned long *size); + unsigned char *dest_end, size_t *size); void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index f0050d201..7b68d35f5 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -570,7 +570,7 @@ static int evaluate_inter_mode(unsigned int *sse, int rate2, int *distortion2, // No adjustment if block is considered to be skin area. if (x->is_skin) rd_adj = 100; - this_rd = ((int64_t)this_rd) * rd_adj / 100; + this_rd = (int)(((int64_t)this_rd) * rd_adj / 100); } check_for_encode_breakout(*sse, x); diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index d863a0a26..886b127d6 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -68,7 +68,6 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h -VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c VP8_COMMON_SRCS-yes += common/treecoder.c VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 0ec6902e7..fac237eec 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -824,7 +824,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, unsigned int lib_flags; YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp, dst_end_time_stamp; - unsigned long size, cx_data_sz; + size_t size, cx_data_sz; unsigned char *cx_data; unsigned char *cx_data_end; int comp_data_state = 0; diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index cab0a9997..b1f8340d6 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -46,13 +46,6 @@ struct vpx_codec_alg_priv { int decoder_init; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; -#if CONFIG_POSTPROC_VISUALIZER - unsigned int dbg_postproc_flag; - int dbg_color_ref_frame_flag; - int dbg_color_mb_modes_flag; - int dbg_color_b_modes_flag; - int dbg_display_mv_flag; -#endif vpx_decrypt_cb decrypt_cb; void *decrypt_state; vpx_image_t img; @@ -478,22 +471,8 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) { flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag; -#if CONFIG_POSTPROC_VISUALIZER - flags.post_proc_flag |= - ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS - : 0) | - ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0) | - ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0) | - ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0); -#endif flags.deblocking_level = ctx->postproc_cfg.deblocking_level; flags.noise_level = ctx->postproc_cfg.noise_level; -#if CONFIG_POSTPROC_VISUALIZER - flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag; - flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag; - flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag; - flags.display_mv_flag = ctx->dbg_display_mv_flag; -#endif } if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, @@ -589,54 +568,6 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, #endif } -static vpx_codec_err_t vp8_set_dbg_color_ref_frame(vpx_codec_alg_priv_t *ctx, - va_list args) { -#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC - ctx->dbg_color_ref_frame_flag = va_arg(args, int); - return VPX_CODEC_OK; -#else - (void)ctx; - (void)args; - return VPX_CODEC_INCAPABLE; -#endif -} - -static vpx_codec_err_t vp8_set_dbg_color_mb_modes(vpx_codec_alg_priv_t *ctx, - va_list args) { -#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC - ctx->dbg_color_mb_modes_flag = va_arg(args, int); - return VPX_CODEC_OK; -#else - (void)ctx; - (void)args; - return VPX_CODEC_INCAPABLE; -#endif -} - -static vpx_codec_err_t vp8_set_dbg_color_b_modes(vpx_codec_alg_priv_t *ctx, - va_list args) { -#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC - ctx->dbg_color_b_modes_flag = va_arg(args, int); - return VPX_CODEC_OK; -#else - (void)ctx; - (void)args; - return VPX_CODEC_INCAPABLE; -#endif -} - -static vpx_codec_err_t vp8_set_dbg_display_mv(vpx_codec_alg_priv_t *ctx, - va_list args) { -#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC - ctx->dbg_display_mv_flag = va_arg(args, int); - return VPX_CODEC_OK; -#else - (void)ctx; - (void)args; - return VPX_CODEC_INCAPABLE; -#endif -} - static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, va_list args) { int *update_info = va_arg(args, int *); @@ -706,10 +637,6 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = { { VP8_SET_REFERENCE, vp8_set_reference }, { VP8_COPY_REFERENCE, vp8_get_reference }, { VP8_SET_POSTPROC, vp8_set_postproc }, - { VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_color_ref_frame }, - { VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_color_mb_modes }, - { VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_color_b_modes }, - { VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_display_mv }, { VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates }, { VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted }, { VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame }, diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index ff7c1dd3f..dc2e03946 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -139,8 +139,6 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, // The calculation can be simplified if there are not many non-zero dct // coefficients. Use eobs to decide what to do. - // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. - // Combine that with code here. if (eob == 1) // DC only DCT coefficient vpx_idct8x8_1_add(input, dest, stride); @@ -204,6 +202,18 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, } #if CONFIG_VP9_HIGHBITDEPTH + +// 12 signal input bits + 7 forward transform amplify bits + 1 bit +// for contingency in rounding and quantizing +#define VALID_IHT_MAGNITUDE_RANGE (1 << 20) + +static INLINE int detect_invalid_iht_input(const tran_low_t *input, int size) { + int i; + for (i = 0; i < size; ++i) + if (abs(input[i]) >= VALID_IHT_MAGNITUDE_RANGE) return 1; + return 0; +} + void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int tx_type, int bd) { const highbd_transform_2d IHT_4[] = { @@ -219,6 +229,13 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, tran_low_t *outptr = out; tran_low_t temp_in[4], temp_out[4]; + if (detect_invalid_iht_input(input, 16)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd iht input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + return; + } + // Inverse transform row vectors. for (i = 0; i < 4; ++i) { IHT_4[tx_type].rows(input, outptr, bd); @@ -253,6 +270,13 @@ void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, const highbd_transform_2d ht = HIGH_IHT_8[tx_type]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + if (detect_invalid_iht_input(input, 64)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd iht input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + return; + } + // Inverse transform row vectors. for (i = 0; i < 8; ++i) { ht.rows(input, outptr, bd); @@ -287,6 +311,13 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, const highbd_transform_2d ht = HIGH_IHT_16[tx_type]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + if (detect_invalid_iht_input(input, 256)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd iht input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + return; + } + // Rows for (i = 0; i < 16; ++i) { ht.rows(input, outptr, bd); @@ -329,8 +360,6 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, // The calculation can be simplified if there are not many non-zero dct // coefficients. Use eobs to decide what to do. - // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. - // Combine that with code here. // DC only DCT coefficient if (eob == 1) { vpx_highbd_idct8x8_1_add(input, dest, stride, bd); diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index b6ae10b1b..b105e5d45 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -26,7 +26,6 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_postproc.h" -#include "vp9/common/vp9_textblit.h" #if CONFIG_VP9_POSTPROC diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index f315a3b85..37a867323 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -92,33 +92,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # - # Sub Pixel Filters - # - add_proto qw/void vp9_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vp9_highbd_convolve_copy/; - - add_proto qw/void vp9_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vp9_highbd_convolve_avg/; - - add_proto qw/void vp9_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vp9_highbd_convolve8/, "$sse2_x86_64"; - - add_proto qw/void vp9_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vp9_highbd_convolve8_horiz/, "$sse2_x86_64"; - - add_proto qw/void vp9_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vp9_highbd_convolve8_vert/, "$sse2_x86_64"; - - add_proto qw/void vp9_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vp9_highbd_convolve8_avg/, "$sse2_x86_64"; - - add_proto qw/void vp9_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vp9_highbd_convolve8_avg_horiz/, "$sse2_x86_64"; - - add_proto qw/void vp9_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vp9_highbd_convolve8_avg_vert/, "$sse2_x86_64"; - - # # post proc # if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { diff --git a/vp9/common/vp9_textblit.c b/vp9/common/vp9_textblit.c deleted file mode 100644 index 9940137ca..000000000 --- a/vp9/common/vp9_textblit.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <stdlib.h> - -#include "vp9/common/vp9_textblit.h" - -static const int font[] = { - 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, - 0x18000, 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, - 0x80000, 0x111110, 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, - 0x4D6B7, 0x456AA, 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, - 0x8A880, 0x52940, 0x22A20, 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, - 0x8C62E, 0xE8C63F, 0x118D6BF, 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, - 0xF8C628, 0x8A89F, 0x108421F, 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, - 0x164C62E, 0x12694BF, 0x8AD6A2, 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, - 0x1151151, 0x117041, 0x119D731, 0x47E0, 0x1041041, 0xFC400, 0x10440, - 0x1084210, 0x820 -}; - -static void plot(int x, int y, unsigned char *image, int pitch) { - image[x + y * pitch] ^= 255; -} - -void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) { - int letter_bitmap; - unsigned char *output_pos = address; - int colpos = 0; - - while (msg[colpos] != 0) { - char letter = msg[colpos]; - int fontcol, fontrow; - - if (letter <= 'Z' && letter >= ' ') - letter_bitmap = font[letter - ' ']; - else if (letter <= 'z' && letter >= 'a') - letter_bitmap = font[letter - 'a' + 'A' - ' ']; - else - letter_bitmap = font[0]; - - for (fontcol = 6; fontcol >= 0; fontcol--) - for (fontrow = 0; fontrow < 5; fontrow++) - output_pos[fontrow * pitch + fontcol] = - ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0); - - output_pos += 7; - colpos++; - } -} - -/* Bresenham line algorithm */ -void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, - int pitch) { - int steep = abs(y1 - y0) > abs(x1 - x0); - int deltax, deltay; - int error, ystep, y, x; - - if (steep) { - int t; - t = x0; - x0 = y0; - y0 = t; - - t = x1; - x1 = y1; - y1 = t; - } - - if (x0 > x1) { - int t; - t = x0; - x0 = x1; - x1 = t; - - t = y0; - y0 = y1; - y1 = t; - } - - deltax = x1 - x0; - deltay = abs(y1 - y0); - error = deltax / 2; - - y = y0; - - if (y0 < y1) - ystep = 1; - else - ystep = -1; - - if (steep) { - for (x = x0; x <= x1; x++) { - plot(y, x, image, pitch); - - error = error - deltay; - if (error < 0) { - y = y + ystep; - error = error + deltax; - } - } - } else { - for (x = x0; x <= x1; x++) { - plot(x, y, image, pitch); - - error = error - deltay; - if (error < 0) { - y = y + ystep; - error = error + deltax; - } - } - } -} diff --git a/vp9/common/vp9_textblit.h b/vp9/common/vp9_textblit.h deleted file mode 100644 index 158ec1b37..000000000 --- a/vp9/common/vp9_textblit.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_VP9_TEXTBLIT_H_ -#define VP9_COMMON_VP9_TEXTBLIT_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -void vp9_blit_text(const char *msg, unsigned char *address, int pitch); - -void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, - int pitch); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_COMMON_VP9_TEXTBLIT_H_ diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index af2c900e6..fde0b7e31 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -318,11 +318,11 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane, } } -static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd, - vpx_reader *r, +static void predict_and_reconstruct_intra_block(TileWorkerData *twd, MODE_INFO *const mi, int plane, int row, int col, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; uint8_t *dst; @@ -340,7 +340,7 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd, const scan_order *sc = (plane || xd->lossless) ? &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type]; - const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r, + const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); if (eob > 0) { inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst, @@ -349,12 +349,13 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd, } } -static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r, - MODE_INFO *const mi, int plane, int row, - int col, TX_SIZE tx_size) { +static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; const scan_order *sc = &vp9_default_scan_orders[tx_size]; - const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r, + const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); if (eob > 0) { @@ -761,15 +762,16 @@ static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, return xd->mi[0]; } -static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, - int mi_row, int mi_col, vpx_reader *r, - BLOCK_SIZE bsize, int bwl, int bhl) { +static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { VP9_COMMON *const cm = &pbi->common; const int less8x8 = bsize < BLOCK_8X8; const int bw = 1 << (bwl - 1); const int bh = 1 << (bhl - 1); const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *const xd = &twd->xd; MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis, bwl, bhl); @@ -782,7 +784,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, "Invalid block size."); } - vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis); + vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis); if (mi->skip) { dec_reset_skip_context(xd); @@ -811,7 +813,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, for (row = 0; row < max_blocks_high; row += step) for (col = 0; col < max_blocks_wide; col += step) - predict_and_reconstruct_intra_block(xd, r, mi, plane, row, col, + predict_and_reconstruct_intra_block(twd, mi, plane, row, col, tx_size); } } else { @@ -845,7 +847,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, for (row = 0; row < max_blocks_high; row += step) for (col = 0; col < max_blocks_wide; col += step) eobtotal += - reconstruct_inter_block(xd, r, mi, plane, row, col, tx_size); + reconstruct_inter_block(twd, mi, plane, row, col, tx_size); } if (!less8x8 && eobtotal == 0) mi->skip = 1; // skip loopfilter @@ -859,10 +861,11 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, } } -static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd, int mi_row, +static INLINE int dec_partition_plane_context(TileWorkerData *twd, int mi_row, int mi_col, int bsl) { - const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col; - const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK); + const PARTITION_CONTEXT *above_ctx = twd->xd.above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = + twd->xd.left_seg_context + (mi_row & MI_MASK); int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; // assert(bsl >= 0); @@ -870,11 +873,12 @@ static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd, int mi_row, return (left * 2 + above) + bsl * PARTITION_PLOFFSET; } -static INLINE void dec_update_partition_context(MACROBLOCKD *xd, int mi_row, +static INLINE void dec_update_partition_context(TileWorkerData *twd, int mi_row, int mi_col, BLOCK_SIZE subsize, int bw) { - PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col; - PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK); + PARTITION_CONTEXT *const above_ctx = twd->xd.above_seg_context + mi_col; + PARTITION_CONTEXT *const left_ctx = + twd->xd.left_seg_context + (mi_row & MI_MASK); // update the partition context at the end notes. set partition bits // of block sizes larger than the current one to be one, and partition @@ -883,13 +887,14 @@ static INLINE void dec_update_partition_context(MACROBLOCKD *xd, int mi_row, memset(left_ctx, partition_context_lookup[subsize].left, bw); } -static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, - vpx_reader *r, int has_rows, int has_cols, +static PARTITION_TYPE read_partition(TileWorkerData *twd, int mi_row, + int mi_col, int has_rows, int has_cols, int bsl) { - const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl); - const vpx_prob *const probs = get_partition_probs(xd, ctx); - FRAME_COUNTS *counts = xd->counts; + const int ctx = dec_partition_plane_context(twd, mi_row, mi_col, bsl); + const vpx_prob *const probs = twd->xd.partition_probs[ctx]; + FRAME_COUNTS *counts = twd->xd.counts; PARTITION_TYPE p; + vpx_reader *r = &twd->bit_reader; if (has_rows && has_cols) p = (PARTITION_TYPE)vpx_read_tree(r, vp9_partition_tree, probs); @@ -906,9 +911,9 @@ static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, } // TODO(slavarnway): eliminate bsize and subsize in future commits -static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd, - int mi_row, int mi_col, vpx_reader *r, - BLOCK_SIZE bsize, int n4x4_l2) { +static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int n4x4_l2) { VP9_COMMON *const cm = &pbi->common; const int n8x8_l2 = n4x4_l2 - 1; const int num_8x8_wh = 1 << n8x8_l2; @@ -917,39 +922,39 @@ static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd, BLOCK_SIZE subsize; const int has_rows = (mi_row + hbs) < cm->mi_rows; const int has_cols = (mi_col + hbs) < cm->mi_cols; + MACROBLOCKD *const xd = &twd->xd; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - partition = - read_partition(xd, mi_row, mi_col, r, has_rows, has_cols, n8x8_l2); + partition = read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2); subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition); if (!hbs) { // calculate bmode block dimensions (log 2) xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); - decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1); + decode_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); } else { switch (partition) { case PARTITION_NONE: - decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2); + decode_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); break; case PARTITION_HORZ: - decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2); + decode_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); if (has_rows) - decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2, + decode_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, n8x8_l2); break; case PARTITION_VERT: - decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2); + decode_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); if (has_cols) - decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2, + decode_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, n4x4_l2); break; case PARTITION_SPLIT: - decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2); - decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2); - decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2); - decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize, + decode_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2); + decode_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2); + decode_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2); + decode_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, n8x8_l2); break; default: assert(0 && "Invalid partition type"); @@ -959,7 +964,7 @@ static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd, // update partition context if (bsize >= BLOCK_8X8 && (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh); + dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); } static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end, @@ -1442,8 +1447,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(pbi, &tile_data->xd, mi_row, mi_col, - &tile_data->bit_reader, BLOCK_64X64, 4); + decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); } pbi->mb.corrupted |= tile_data->xd.corrupted; if (pbi->mb.corrupted) @@ -1532,8 +1536,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(pbi, &tile_data->xd, mi_row, mi_col, - &tile_data->bit_reader, BLOCK_64X64, 4); + decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); } } diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 7358c9a39..4372ba037 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -241,7 +241,7 @@ static int read_mv_component(vpx_reader *r, const nmv_component *mvcomp, // Integer part if (class0) { - d = vpx_read_tree(r, vp9_mv_class0_tree, mvcomp->class0); + d = vpx_read(r, mvcomp->class0[0]); mag = 0; } else { int i; @@ -826,8 +826,10 @@ static INLINE void copy_ref_frame_pair(MV_REFERENCE_FRAME *dst, memcpy(dst, src, sizeof(*dst) * 2); } -void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, int mi_row, - int mi_col, vpx_reader *r, int x_mis, int y_mis) { +void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, int x_mis, int y_mis) { + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *const xd = &twd->xd; VP9_COMMON *const cm = &pbi->common; MODE_INFO *const mi = xd->mi[0]; MV_REF *frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h index 4e11c2fc0..b460cb8fb 100644 --- a/vp9/decoder/vp9_decodemv.h +++ b/vp9/decoder/vp9_decodemv.h @@ -19,8 +19,8 @@ extern "C" { #endif -void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, int mi_row, - int mi_col, vpx_reader *r, int x_mis, int y_mis); +void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, int x_mis, int y_mis); #ifdef __cplusplus } // extern "C" diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index cc01909ff..7048fb1ca 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -29,9 +29,45 @@ if (counts) ++coef_counts[band][ctx][token]; \ } while (0) -static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) { +static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, + int *count, unsigned int *range) { + const unsigned int split = (*range * prob + (256 - prob)) >> CHAR_BIT; + const BD_VALUE bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); + + if (*count < 0) { + r->value = *value; + r->count = *count; + vpx_reader_fill(r); + *value = r->value; + *count = r->count; + } + + if (*value >= bigsplit) { + *range = *range - split; + *value = *value - bigsplit; + { + const int shift = vpx_norm[*range]; + *range <<= shift; + *value <<= shift; + *count -= shift; + } + return 1; + } + *range = split; + { + const int shift = vpx_norm[*range]; + *range <<= shift; + *value <<= shift; + *count -= shift; + } + return 0; +} + +static INLINE int read_coeff(vpx_reader *r, const vpx_prob *probs, int n, + BD_VALUE *value, int *count, unsigned int *range) { int i, val = 0; - for (i = 0; i < n; ++i) val = (val << 1) | vpx_read(r, probs[i]); + for (i = 0; i < n; ++i) + val = (val << 1) | read_bool(r, probs[i], value, count, range); return val; } @@ -52,7 +88,7 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, uint8_t token_cache[32 * 32]; const uint8_t *band_translate = get_band_translate(tx_size); const int dq_shift = (tx_size == TX_32X32); - int v, token; + int v; int16_t dqv = dq[0]; const uint8_t *const cat6_prob = #if CONFIG_VP9_HIGHBITDEPTH @@ -66,6 +102,11 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, (xd->bd == VPX_BITS_12) ? 18 : (xd->bd == VPX_BITS_10) ? 16 : #endif // CONFIG_VP9_HIGHBITDEPTH 14; + // Keep value, range, and count as locals. The compiler produces better + // results with the locals than using r directly. + BD_VALUE value = r->value; + unsigned int range = r->range; + int count = r->count; if (counts) { coef_counts = counts->coef[tx_size][type][ref]; @@ -77,70 +118,98 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, band = *band_translate++; prob = coef_probs[band][ctx]; if (counts) ++eob_branch_count[band][ctx]; - if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) { + if (!read_bool(r, prob[EOB_CONTEXT_NODE], &value, &count, &range)) { INCREMENT_COUNT(EOB_MODEL_TOKEN); break; } - while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) { + while (!read_bool(r, prob[ZERO_CONTEXT_NODE], &value, &count, &range)) { INCREMENT_COUNT(ZERO_TOKEN); dqv = dq[1]; token_cache[scan[c]] = 0; ++c; - if (c >= max_eob) return c; // zero tokens at the end (no eob token) + if (c >= max_eob) { + r->value = value; + r->range = range; + r->count = count; + return c; // zero tokens at the end (no eob token) + } ctx = get_coef_context(nb, token_cache, c); band = *band_translate++; prob = coef_probs[band][ctx]; } - if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) { - INCREMENT_COUNT(ONE_TOKEN); - token = ONE_TOKEN; - val = 1; - } else { + if (read_bool(r, prob[ONE_CONTEXT_NODE], &value, &count, &range)) { + const vpx_prob *p = vp9_pareto8_full[prob[PIVOT_NODE] - 1]; INCREMENT_COUNT(TWO_TOKEN); - token = vpx_read_tree(r, vp9_coef_con_tree, - vp9_pareto8_full[prob[PIVOT_NODE] - 1]); - switch (token) { - case TWO_TOKEN: - case THREE_TOKEN: - case FOUR_TOKEN: val = token; break; - case CATEGORY1_TOKEN: - val = CAT1_MIN_VAL + read_coeff(vp9_cat1_prob, 1, r); - break; - case CATEGORY2_TOKEN: - val = CAT2_MIN_VAL + read_coeff(vp9_cat2_prob, 2, r); - break; - case CATEGORY3_TOKEN: - val = CAT3_MIN_VAL + read_coeff(vp9_cat3_prob, 3, r); - break; - case CATEGORY4_TOKEN: - val = CAT4_MIN_VAL + read_coeff(vp9_cat4_prob, 4, r); - break; - case CATEGORY5_TOKEN: - val = CAT5_MIN_VAL + read_coeff(vp9_cat5_prob, 5, r); - break; - case CATEGORY6_TOKEN: - val = CAT6_MIN_VAL + read_coeff(cat6_prob, cat6_bits, r); - break; + if (read_bool(r, p[0], &value, &count, &range)) { + if (read_bool(r, p[3], &value, &count, &range)) { + token_cache[scan[c]] = 5; + if (read_bool(r, p[5], &value, &count, &range)) { + if (read_bool(r, p[7], &value, &count, &range)) { + val = CAT6_MIN_VAL + + read_coeff(r, cat6_prob, cat6_bits, &value, &count, &range); + } else { + val = CAT5_MIN_VAL + + read_coeff(r, vp9_cat5_prob, 5, &value, &count, &range); + } + } else if (read_bool(r, p[6], &value, &count, &range)) { + val = CAT4_MIN_VAL + + read_coeff(r, vp9_cat4_prob, 4, &value, &count, &range); + } else { + val = CAT3_MIN_VAL + + read_coeff(r, vp9_cat3_prob, 3, &value, &count, &range); + } + } else { + token_cache[scan[c]] = 4; + if (read_bool(r, p[4], &value, &count, &range)) { + val = CAT2_MIN_VAL + + read_coeff(r, vp9_cat2_prob, 2, &value, &count, &range); + } else { + val = CAT1_MIN_VAL + + read_coeff(r, vp9_cat1_prob, 1, &value, &count, &range); + } + } + v = (val * dqv) >> dq_shift; + } else { + if (read_bool(r, p[1], &value, &count, &range)) { + token_cache[scan[c]] = 3; + v = ((3 + read_bool(r, p[2], &value, &count, &range)) * dqv) >> + dq_shift; + } else { + token_cache[scan[c]] = 2; + v = (2 * dqv) >> dq_shift; + } } + } else { + INCREMENT_COUNT(ONE_TOKEN); + token_cache[scan[c]] = 1; + v = dqv >> dq_shift; } - v = (val * dqv) >> dq_shift; #if CONFIG_COEFFICIENT_RANGE_CHECKING #if CONFIG_VP9_HIGHBITDEPTH - dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v), xd->bd); + dqcoeff[scan[c]] = + highbd_check_range(read_bool(r, 128, &value, &count, &range) ? -v : v), + xd->bd); #else - dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v); + dqcoeff[scan[c]] = + check_range(read_bool(r, 128, &value, &count, &range) ? -v : v); #endif // CONFIG_VP9_HIGHBITDEPTH #else - dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v; + if (read_bool(r, 128, &value, &count, &range)) { + dqcoeff[scan[c]] = -v; + } else { + dqcoeff[scan[c]] = v; + } #endif // CONFIG_COEFFICIENT_RANGE_CHECKING - token_cache[scan[c]] = vp9_pt_energy_class[token]; ++c; ctx = get_coef_context(nb, token_cache, c); dqv = dq[1]; } + r->value = value; + r->range = range; + r->count = count; return c; } @@ -156,9 +225,11 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l, } } -int vp9_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc, - int x, int y, TX_SIZE tx_size, vpx_reader *r, +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, + const scan_order *sc, int x, int y, TX_SIZE tx_size, int seg_id) { + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; const int16_t *const dequant = pd->seg_dequant[seg_id]; int eob; diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index aa2afb16a..7b0d87601 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -19,8 +19,8 @@ extern "C" { #endif -int vp9_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc, - int x, int y, TX_SIZE tx_size, vpx_reader *r, +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, + const scan_order *sc, int x, int y, TX_SIZE tx_size, int seg_id); #ifdef __cplusplus diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index 8e76f72fe..874a8e4b9 100644 --- a/vp9/encoder/vp9_encodemv.c +++ b/vp9/encoder/vp9_encodemv.c @@ -21,12 +21,10 @@ static struct vp9_token mv_joint_encodings[MV_JOINTS]; static struct vp9_token mv_class_encodings[MV_CLASSES]; static struct vp9_token mv_fp_encodings[MV_FP_SIZE]; -static struct vp9_token mv_class0_encodings[CLASS0_SIZE]; void vp9_entropy_mv_init(void) { vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree); vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree); - vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree); vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree); } @@ -51,8 +49,7 @@ static void encode_mv_component(vpx_writer *w, int comp, // Integer bits if (mv_class == MV_CLASS_0) { - vp9_write_token(w, vp9_mv_class0_tree, mvcomp->class0, - &mv_class0_encodings[d]); + vpx_write(w, d, mvcomp->class0[0]); } else { int i; const int n = mv_class + CLASS0_BITS - 1; // number of bits diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 1583cc8ab..6fc7cd1e3 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -180,6 +180,10 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, } #else int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME) + filt_guess = 5 * filt_guess >> 3; + #endif // CONFIG_VP9_HIGHBITDEPTH if (cm->frame_type == KEY_FRAME) filt_guess -= 4; lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 2fd42960e..5bfc0d359 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -45,7 +45,6 @@ VP9_COMMON_SRCS-yes += common/vp9_scale.h VP9_COMMON_SRCS-yes += common/vp9_scale.c VP9_COMMON_SRCS-yes += common/vp9_seg_common.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.c -VP9_COMMON_SRCS-yes += common/vp9_textblit.h VP9_COMMON_SRCS-yes += common/vp9_tile_common.h VP9_COMMON_SRCS-yes += common/vp9_tile_common.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c @@ -55,7 +54,6 @@ VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h VP9_COMMON_SRCS-yes += common/vp9_quant_common.c VP9_COMMON_SRCS-yes += common/vp9_reconinter.c VP9_COMMON_SRCS-yes += common/vp9_reconintra.c -VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c VP9_COMMON_SRCS-yes += common/vp9_common_data.c VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-yes += common/vp9_scan.c diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 04b1dca29..3b5dc3dda 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -829,13 +829,6 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, #endif } -static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx, - va_list args) { - (void)ctx; - (void)args; - return VPX_CODEC_INCAPABLE; -} - static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, va_list args) { int *const update_info = va_arg(args, int *); @@ -1014,10 +1007,6 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { // Setters { VP8_SET_REFERENCE, ctrl_set_reference }, { VP8_SET_POSTPROC, ctrl_set_postproc }, - { VP8_SET_DBG_COLOR_REF_FRAME, ctrl_set_dbg_options }, - { VP8_SET_DBG_COLOR_MB_MODES, ctrl_set_dbg_options }, - { VP8_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options }, - { VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options }, { VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order }, { VPXD_SET_DECRYPTOR, ctrl_set_decryptor }, { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, @@ -47,11 +47,10 @@ enum vp8_com_control_id { VP8_SET_REFERENCE = 1, VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ - VP8_SET_DBG_COLOR_REF_FRAME = - 4, /**< set the reference frames to color for each macroblock */ - VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */ - VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */ - VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */ + VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< \deprecated */ + VP8_SET_DBG_COLOR_MB_MODES = 5, /**< \deprecated */ + VP8_SET_DBG_COLOR_B_MODES = 6, /**< \deprecated */ + VP8_SET_DBG_DISPLAY_MV = 7, /**< \deprecated */ /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+) * for its control ids. These should be migrated to something like the @@ -133,13 +132,13 @@ VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) #define VPX_CTRL_VP8_COPY_REFERENCE VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) #define VPX_CTRL_VP8_SET_POSTPROC -VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int) +VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_REF_FRAME, int) #define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME -VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int) +VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_MB_MODES, int) #define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES -VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int) +VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_B_MODES, int) #define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES -VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int) +VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_DISPLAY_MV, int) #define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) #define VPX_CTRL_VP9_GET_REFERENCE diff --git a/vpx_dsp/arm/loopfilter_16_neon.c b/vpx_dsp/arm/loopfilter_16_neon.c deleted file mode 100644 index 9607bb240..000000000 --- a/vpx_dsp/arm/loopfilter_16_neon.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit - uint8x16_t qlimit, // limit - uint8x16_t qthresh, // thresh - uint8x16_t q3, // p3 - uint8x16_t q4, // p2 - uint8x16_t q5, // p1 - uint8x16_t q6, // p0 - uint8x16_t q7, // q0 - uint8x16_t q8, // q1 - uint8x16_t q9, // q2 - uint8x16_t q10, // q3 - uint8x16_t *q5r, // p1 - uint8x16_t *q6r, // p0 - uint8x16_t *q7r, // q0 - uint8x16_t *q8r) { // q1 - uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int16x8_t q2s16, q11s16; - uint16x8_t q4u16; - int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; - int8x8_t d2s8, d3s8; - - q11u8 = vabdq_u8(q3, q4); - q12u8 = vabdq_u8(q4, q5); - q13u8 = vabdq_u8(q5, q6); - q14u8 = vabdq_u8(q8, q7); - q3 = vabdq_u8(q9, q8); - q4 = vabdq_u8(q10, q9); - - q11u8 = vmaxq_u8(q11u8, q12u8); - q12u8 = vmaxq_u8(q13u8, q14u8); - q3 = vmaxq_u8(q3, q4); - q15u8 = vmaxq_u8(q11u8, q12u8); - - q9 = vabdq_u8(q6, q7); - - // vp8_hevmask - q13u8 = vcgtq_u8(q13u8, qthresh); - q14u8 = vcgtq_u8(q14u8, qthresh); - q15u8 = vmaxq_u8(q15u8, q3); - - q2u8 = vabdq_u8(q5, q8); - q9 = vqaddq_u8(q9, q9); - - q15u8 = vcgeq_u8(qlimit, q15u8); - - // vp8_filter() function - // convert to signed - q10 = vdupq_n_u8(0x80); - q8 = veorq_u8(q8, q10); - q7 = veorq_u8(q7, q10); - q6 = veorq_u8(q6, q10); - q5 = veorq_u8(q5, q10); - - q2u8 = vshrq_n_u8(q2u8, 1); - q9 = vqaddq_u8(q9, q2u8); - - q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), - vget_low_s8(vreinterpretq_s8_u8(q6))); - q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), - vget_high_s8(vreinterpretq_s8_u8(q6))); - - q9 = vcgeq_u8(qblimit, q9); - - q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); - - q14u8 = vorrq_u8(q13u8, q14u8); - - q4u16 = vdupq_n_u16(3); - q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); - q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); - - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); - q15u8 = vandq_u8(q15u8, q9); - - q1s8 = vreinterpretq_s8_u8(q1u8); - q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); - q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); - - q4 = vdupq_n_u8(3); - q9 = vdupq_n_u8(4); - // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) - d2s8 = vqmovn_s16(q2s16); - d3s8 = vqmovn_s16(q11s16); - q1s8 = vcombine_s8(d2s8, d3s8); - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); - q1s8 = vreinterpretq_s8_u8(q1u8); - - q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); - q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); - q2s8 = vshrq_n_s8(q2s8, 3); - q1s8 = vshrq_n_s8(q1s8, 3); - - q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); - q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); - - q1s8 = vrshrq_n_s8(q1s8, 1); - q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); - - q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); - q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); - - *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); - *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); - *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); - *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); - return; -} - -void vpx_lpf_horizontal_4_dual_neon( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; - uint8x16_t qblimit, qlimit, qthresh; - uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; - - dblimit0 = vld1_u8(blimit0); - dlimit0 = vld1_u8(limit0); - dthresh0 = vld1_u8(thresh0); - dblimit1 = vld1_u8(blimit1); - dlimit1 = vld1_u8(limit1); - dthresh1 = vld1_u8(thresh1); - qblimit = vcombine_u8(dblimit0, dblimit1); - qlimit = vcombine_u8(dlimit0, dlimit1); - qthresh = vcombine_u8(dthresh0, dthresh1); - - s -= (p << 2); - - q3u8 = vld1q_u8(s); - s += p; - q4u8 = vld1q_u8(s); - s += p; - q5u8 = vld1q_u8(s); - s += p; - q6u8 = vld1q_u8(s); - s += p; - q7u8 = vld1q_u8(s); - s += p; - q8u8 = vld1q_u8(s); - s += p; - q9u8 = vld1q_u8(s); - s += p; - q10u8 = vld1q_u8(s); - - loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8, - q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8); - - s -= (p * 5); - vst1q_u8(s, q5u8); - s += p; - vst1q_u8(s, q6u8); - s += p; - vst1q_u8(s, q7u8); - s += p; - vst1q_u8(s, q8u8); - return; -} diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c deleted file mode 100644 index 1c1e80e00..000000000 --- a/vpx_dsp/arm/loopfilter_4_neon.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" - -static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p3 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d4ru8, // p1 - uint8x8_t *d5ru8, // p0 - uint8x8_t *d6ru8, // q0 - uint8x8_t *d7ru8) { // q1 - uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; - int16x8_t q12s16; - int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d3u8 = vabd_u8(d17u8, d16u8); - d4u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - d3u8 = vmax_u8(d3u8, d4u8); - d23u8 = vmax_u8(d19u8, d20u8); - - d17u8 = vabd_u8(d6u8, d7u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - d22u8 = vcgt_u8(d22u8, dthresh); - d23u8 = vmax_u8(d23u8, d3u8); - - d28u8 = vabd_u8(d5u8, d16u8); - d17u8 = vqadd_u8(d17u8, d17u8); - - d23u8 = vcge_u8(dlimit, d23u8); - - d18u8 = vdup_n_u8(0x80); - d5u8 = veor_u8(d5u8, d18u8); - d6u8 = veor_u8(d6u8, d18u8); - d7u8 = veor_u8(d7u8, d18u8); - d16u8 = veor_u8(d16u8, d18u8); - - d28u8 = vshr_n_u8(d28u8, 1); - d17u8 = vqadd_u8(d17u8, d28u8); - - d19u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8)); - - d17u8 = vcge_u8(dblimit, d17u8); - - d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8)); - - d22u8 = vorr_u8(d21u8, d22u8); - - q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); - d23u8 = vand_u8(d23u8, d17u8); - - q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); - - d17u8 = vdup_n_u8(4); - - d27s8 = vqmovn_s16(q12s16); - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); - d27s8 = vreinterpret_s8_u8(d27u8); - - d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); - d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); - d28s8 = vshr_n_s8(d28s8, 3); - d27s8 = vshr_n_s8(d27s8, 3); - - d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); - - d27s8 = vrshr_n_s8(d27s8, 1); - d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); - - d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); - d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); - - *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); - *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); - *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); - *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); - return; -} - -void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); - - s -= (pitch * 5); - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - s += pitch; - vst1_u8(s, d6u8); - s += pitch; - vst1_u8(s, d7u8); - } - return; -} - -void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i, pitch8; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - pitch8 = pitch * 8; - for (i = 0; i < 1; i++, src += pitch8) { - s = src - (i + 1) * 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); - - d4Result.val[0] = d4u8; - d4Result.val[1] = d5u8; - d4Result.val[2] = d6u8; - d4Result.val[3] = d7u8; - - src -= 2; - vst4_lane_u8(src, d4Result, 0); - src += pitch; - vst4_lane_u8(src, d4Result, 1); - src += pitch; - vst4_lane_u8(src, d4Result, 2); - src += pitch; - vst4_lane_u8(src, d4Result, 3); - src += pitch; - vst4_lane_u8(src, d4Result, 4); - src += pitch; - vst4_lane_u8(src, d4Result, 5); - src += pitch; - vst4_lane_u8(src, d4Result, 6); - src += pitch; - vst4_lane_u8(src, d4Result, 7); - } - return; -} diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c deleted file mode 100644 index 8641541b0..000000000 --- a/vpx_dsp/arm/loopfilter_8_neon.c +++ /dev/null @@ -1,445 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" - -static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p2 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d0ru8, // p1 - uint8x8_t *d1ru8, // p1 - uint8x8_t *d2ru8, // p0 - uint8x8_t *d3ru8, // q0 - uint8x8_t *d4ru8, // q1 - uint8x8_t *d5ru8) { // q1 - uint32_t flat; - uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; - uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; - int16x8_t q15s16; - uint16x8_t q10u16, q14u16; - int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d23u8 = vabd_u8(d17u8, d16u8); - d24u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - - d25u8 = vabd_u8(d6u8, d4u8); - - d23u8 = vmax_u8(d23u8, d24u8); - - d26u8 = vabd_u8(d7u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - - d24u8 = vabd_u8(d6u8, d7u8); - d27u8 = vabd_u8(d3u8, d6u8); - d28u8 = vabd_u8(d18u8, d7u8); - - d19u8 = vmax_u8(d19u8, d23u8); - - d23u8 = vabd_u8(d5u8, d16u8); - d24u8 = vqadd_u8(d24u8, d24u8); - - d19u8 = vcge_u8(dlimit, d19u8); - - d25u8 = vmax_u8(d25u8, d26u8); - d26u8 = vmax_u8(d27u8, d28u8); - - d23u8 = vshr_n_u8(d23u8, 1); - - d25u8 = vmax_u8(d25u8, d26u8); - - d24u8 = vqadd_u8(d24u8, d23u8); - - d20u8 = vmax_u8(d20u8, d25u8); - - d23u8 = vdup_n_u8(1); - d24u8 = vcge_u8(dblimit, d24u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - - d20u8 = vcge_u8(d23u8, d20u8); - - d19u8 = vand_u8(d19u8, d24u8); - - d23u8 = vcgt_u8(d22u8, dthresh); - - d20u8 = vand_u8(d20u8, d19u8); - - d22u8 = vdup_n_u8(0x80); - - d23u8 = vorr_u8(d21u8, d23u8); - - q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8)); - - d30u8 = vshrn_n_u16(q10u16, 4); - flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); - - if (flat == 0xffffffff) { // Check for all 1's, power_branch_only - d27u8 = vdup_n_u8(3); - d21u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d21u8); - q14u16 = vaddw_u8(q14u16, d5u8); - *d0ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - *d1ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - *d2ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d3ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d4ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d5ru8 = vqrshrn_n_u16(q14u16, 3); - } else { - d21u8 = veor_u8(d7u8, d22u8); - d24u8 = veor_u8(d6u8, d22u8); - d25u8 = veor_u8(d5u8, d22u8); - d26u8 = veor_u8(d16u8, d22u8); - - d27u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); - d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); - - q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); - - d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - q15s16 = vaddw_s8(q15s16, d29s8); - - d29u8 = vdup_n_u8(4); - - d28s8 = vqmovn_s16(q15s16); - - d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); - d30s8 = vshr_n_s8(d30s8, 3); - d29s8 = vshr_n_s8(d29s8, 3); - - d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); - d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); - - d29s8 = vrshr_n_s8(d29s8, 1); - d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - - if (flat == 0) { // filter_branch_only - *d0ru8 = d4u8; - *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - *d5ru8 = d17u8; - return; - } - - d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - - d23u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d23u8); - - d0u8 = vbsl_u8(d20u8, dblimit, d4u8); - - q14u16 = vaddw_u8(q14u16, d5u8); - - d1u8 = vbsl_u8(d20u8, dlimit, d25u8); - - d30u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d2u8 = vbsl_u8(d20u8, dthresh, d24u8); - - d31u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - - *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); - - d23u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - - *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); - - d22u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d3u8 = vbsl_u8(d20u8, d3u8, d21u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - d4u8 = vbsl_u8(d20u8, d4u8, d26u8); - - d6u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - - d5u8 = vbsl_u8(d20u8, d5u8, d17u8); - - d7u8 = vqrshrn_n_u16(q14u16, 3); - - *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); - *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); - *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); - } - return; -} - -void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, - &d5u8); - - s -= (pitch * 6); - vst1_u8(s, d0u8); - s += pitch; - vst1_u8(s, d1u8); - s += pitch; - vst1_u8(s, d2u8); - s += pitch; - vst1_u8(s, d3u8); - s += pitch; - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - } - return; -} - -void vpx_lpf_horizontal_8_dual_neon( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); -} - -void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - uint8x8x2_t d2Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - for (i = 0; i < 1; i++) { - s = src + (i * (pitch << 3)) - 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, - &d5u8); - - d4Result.val[0] = d0u8; - d4Result.val[1] = d1u8; - d4Result.val[2] = d2u8; - d4Result.val[3] = d3u8; - - d2Result.val[0] = d4u8; - d2Result.val[1] = d5u8; - - s = src - 3; - vst4_lane_u8(s, d4Result, 0); - s += pitch; - vst4_lane_u8(s, d4Result, 1); - s += pitch; - vst4_lane_u8(s, d4Result, 2); - s += pitch; - vst4_lane_u8(s, d4Result, 3); - s += pitch; - vst4_lane_u8(s, d4Result, 4); - s += pitch; - vst4_lane_u8(s, d4Result, 5); - s += pitch; - vst4_lane_u8(s, d4Result, 6); - s += pitch; - vst4_lane_u8(s, d4Result, 7); - - s = src + 1; - vst2_lane_u8(s, d2Result, 0); - s += pitch; - vst2_lane_u8(s, d2Result, 1); - s += pitch; - vst2_lane_u8(s, d2Result, 2); - s += pitch; - vst2_lane_u8(s, d2Result, 3); - s += pitch; - vst2_lane_u8(s, d2Result, 4); - s += pitch; - vst2_lane_u8(s, d2Result, 5); - s += pitch; - vst2_lane_u8(s, d2Result, 6); - s += pitch; - vst2_lane_u8(s, d2Result, 7); - } - return; -} - -void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); -} diff --git a/vpx_dsp/arm/loopfilter_mb_neon.c b/vpx_dsp/arm/loopfilter_mb_neon.c index aa61220d3..f95267472 100644 --- a/vpx_dsp/arm/loopfilter_mb_neon.c +++ b/vpx_dsp/arm/loopfilter_mb_neon.c @@ -31,6 +31,15 @@ FUN_LOAD_THRESH(8, _) // load_thresh_8 FUN_LOAD_THRESH(16, q_) // load_thresh_16 #undef FUN_LOAD_THRESH +static INLINE void load_thresh_8_dual( + const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, + uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) { + *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1)); + *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1)); + *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1)); +} + // Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a // pixel. When used to control filter branches, we only detect whether it is all // 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. @@ -56,33 +65,51 @@ static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) { return calc_flat_status_8(flat_4bit); } -#define FUN_FILTER_FLAT_HEV_MASK(w, r) \ - static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \ +#define FUN_FILTER_HEV_MASK4(w, r) \ + static INLINE uint8x##w##_t filter_hev_mask4_##w( \ const uint8x##w##_t limit, const uint8x##w##_t blimit, \ const uint8x##w##_t thresh, const uint8x##w##_t p3, \ const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ - const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \ - uint8x##w##_t *hev) { \ - uint8x##w##_t t0, t1, mask; \ + const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) { \ + uint8x##w##_t max, t0, t1; \ \ - mask = vabd##r##u8(p1, p0); \ - mask = vmax##r##u8(mask, vabd##r##u8(q1, q0)); \ - *hev = vcgt##r##u8(mask, thresh); \ - *flat = vmax##r##u8(mask, vabd##r##u8(p2, p0)); \ - mask = vmax##r##u8(mask, vabd##r##u8(p3, p2)); \ - mask = vmax##r##u8(mask, vabd##r##u8(p2, p1)); \ - mask = vmax##r##u8(mask, vabd##r##u8(q2, q1)); \ - mask = vmax##r##u8(mask, vabd##r##u8(q3, q2)); \ + max = vabd##r##u8(p1, p0); \ + max = vmax##r##u8(max, vabd##r##u8(q1, q0)); \ + *hev = vcgt##r##u8(max, thresh); \ + *mask = vmax##r##u8(max, vabd##r##u8(p3, p2)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2)); \ t0 = vabd##r##u8(p0, q0); \ t1 = vabd##r##u8(p1, q1); \ t0 = vqadd##r##u8(t0, t0); \ t1 = vshr##r##n_u8(t1, 1); \ t0 = vqadd##r##u8(t0, t1); \ - mask = vcle##r##u8(mask, limit); \ + *mask = vcle##r##u8(*mask, limit); \ t0 = vcle##r##u8(t0, blimit); \ - mask = vand##r##u8(mask, t0); \ + *mask = vand##r##u8(*mask, t0); \ + \ + return max; \ + } + +FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8 +FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16 +#undef FUN_FILTER_HEV_MASK4 + +#define FUN_FILTER_FLAT_HEV_MASK(w, r) \ + static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \ + const uint8x##w##_t limit, const uint8x##w##_t blimit, \ + const uint8x##w##_t thresh, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \ + uint8x##w##_t *hev) { \ + uint8x##w##_t max, mask; \ \ + max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \ + q2, q3, hev, &mask); \ + *flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \ *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \ *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \ *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \ @@ -420,6 +447,33 @@ FUN_FILTER4(8, _) // filter4_8 FUN_FILTER4(16, q_) // filter4_16 #undef FUN_FILTER4 +#define FUN_FILTER8(w) \ + static INLINE void filter8_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t flat, \ + const uint32_t flat_status, const uint8x##w##_t hev, \ + const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \ + const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \ + const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \ + if (flat_status != (uint32_t)-2) { \ + filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ + *op2 = p2; \ + *oq2 = q2; \ + if (flat_status) { \ + apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ + op0, oq0, oq1, oq2); \ + } \ + } else { \ + calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \ + oq0, oq1, oq2); \ + } \ + } + +FUN_FILTER8(8) // filter8_8 +FUN_FILTER8(16) // filter8_16 +#undef FUN_FILTER8 + #define FUN_FILTER16(w) \ static INLINE void filter16_##w( \ const uint8x##w##_t mask, const uint8x##w##_t flat, \ @@ -481,6 +535,7 @@ FUN_FILTER16(16) // filter16_16 *q3 = vld1##r##u8(s); \ } +FUN_LOAD8(8, _) // load_8x8 FUN_LOAD8(16, q_) // load_16x8 #undef FUN_LOAD8 @@ -529,6 +584,71 @@ FUN_LOAD16(8, _) // load_8x16 FUN_LOAD16(16, q_) // load_16x16 #undef FUN_LOAD16 +#define FUN_STORE4(w, r) \ + static INLINE void store_##w##x4( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + } + +FUN_STORE4(8, _) // store_8x4 +FUN_STORE4(16, q_) // store_16x4 +#undef FUN_STORE4 + +#define FUN_STORE6(w, r) \ + static INLINE void store_##w##x6( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ + const uint8x##w##_t s5) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + s += p; \ + vst1##r##u8(s, s4); \ + s += p; \ + vst1##r##u8(s, s5); \ + } + +FUN_STORE6(8, _) // store_8x6 +FUN_STORE6(16, q_) // store_16x6 +#undef FUN_STORE6 + +static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1, + const uint8x8_t p0, const uint8x8_t q0, + const uint8x8_t q1) { + uint8x8x4_t o; + + o.val[0] = p1; + o.val[1] = p0; + o.val[2] = q0; + o.val[3] = q1; + vst4_lane_u8(s, o, 0); + s += p; + vst4_lane_u8(s, o, 1); + s += p; + vst4_lane_u8(s, o, 2); + s += p; + vst4_lane_u8(s, o, 3); + s += p; + vst4_lane_u8(s, o, 4); + s += p; + vst4_lane_u8(s, o, 5); + s += p; + vst4_lane_u8(s, o, 6); + s += p; + vst4_lane_u8(s, o, 7); +} + static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const uint8x8_t s4, @@ -566,53 +686,64 @@ static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0, vst3_lane_u8(s + 0, o1, 7); } -static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1, - const uint8x8_t p0, const uint8x8_t q0, - const uint8x8_t q1) { - uint8x8x4_t o; +#define FUN_STORE8(w, r) \ + static INLINE void store_##w##x8( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ + const uint8x##w##_t s5, const uint8x##w##_t s6, \ + const uint8x##w##_t s7) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + s += p; \ + vst1##r##u8(s, s4); \ + s += p; \ + vst1##r##u8(s, s5); \ + s += p; \ + vst1##r##u8(s, s6); \ + s += p; \ + vst1##r##u8(s, s7); \ + } - o.val[0] = p1; - o.val[1] = p0; - o.val[2] = q0; - o.val[3] = q1; - vst4_lane_u8(s, o, 0); - s += p; - vst4_lane_u8(s, o, 1); - s += p; - vst4_lane_u8(s, o, 2); - s += p; - vst4_lane_u8(s, o, 3); - s += p; - vst4_lane_u8(s, o, 4); - s += p; - vst4_lane_u8(s, o, 5); - s += p; - vst4_lane_u8(s, o, 6); - s += p; - vst4_lane_u8(s, o, 7); -} +FUN_STORE8(8, _) // store_8x8 +FUN_STORE8(16, q_) // store_16x8 +#undef FUN_STORE8 -static INLINE void store_16x8(uint8_t *s, const int p, const uint8x16_t s0, - const uint8x16_t s1, const uint8x16_t s2, - const uint8x16_t s3, const uint8x16_t s4, - const uint8x16_t s5, const uint8x16_t s6, - const uint8x16_t s7) { - vst1q_u8(s, s0); - s += p; - vst1q_u8(s, s1); - s += p; - vst1q_u8(s, s2); - s += p; - vst1q_u8(s, s3); - s += p; - vst1q_u8(s, s4); - s += p; - vst1q_u8(s, s5); - s += p; - vst1q_u8(s, s6); - s += p; - vst1q_u8(s, s7); -} +#define FUN_STORE14(w, r) \ + static INLINE void store_##w##x14( \ + uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint32_t flat_status, const uint32_t flat2_status) { \ + if (flat_status) { \ + if (flat2_status) { \ + vst1##r##u8(s - 7 * p, p6); \ + vst1##r##u8(s - 6 * p, p5); \ + vst1##r##u8(s - 5 * p, p4); \ + vst1##r##u8(s - 4 * p, p3); \ + vst1##r##u8(s + 3 * p, q3); \ + vst1##r##u8(s + 4 * p, q4); \ + vst1##r##u8(s + 5 * p, q5); \ + vst1##r##u8(s + 6 * p, q6); \ + } \ + vst1##r##u8(s - 3 * p, p2); \ + vst1##r##u8(s + 2 * p, q2); \ + } \ + vst1##r##u8(s - 2 * p, p1); \ + vst1##r##u8(s - 1 * p, p0); \ + vst1##r##u8(s + 0 * p, q0); \ + vst1##r##u8(s + 1 * p, q1); \ + } + +FUN_STORE14(8, _) // store_8x14 +FUN_STORE14(16, q_) // store_16x14 +#undef FUN_STORE14 static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2, @@ -656,37 +787,160 @@ static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0, vst1q_u8(s, s15); } -#define FUN_STORE14(w, r) \ - static INLINE void store_##w##x14( \ - uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \ - const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ - const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ - const uint32_t flat_status, const uint32_t flat2_status) { \ - if (flat_status) { \ - if (flat2_status) { \ - vst1##r##u8(s - 7 * p, p6); \ - vst1##r##u8(s - 6 * p, p5); \ - vst1##r##u8(s - 5 * p, p4); \ - vst1##r##u8(s - 4 * p, p3); \ - vst1##r##u8(s + 3 * p, q3); \ - vst1##r##u8(s + 4 * p, q4); \ - vst1##r##u8(s + 5 * p, q5); \ - vst1##r##u8(s + 6 * p, q6); \ - } \ - vst1##r##u8(s - 3 * p, p2); \ - vst1##r##u8(s + 2 * p, q2); \ - } \ - vst1##r##u8(s - 2 * p, p1); \ - vst1##r##u8(s - 1 * p, p0); \ - vst1##r##u8(s + 0 * p, q0); \ - vst1##r##u8(s + 1 * p, q1); \ +#define FUN_HOR_4_KERNEL(name, w) \ + static INLINE void lpf_horizontal_4##name##kernel( \ + uint8_t *s, const int p, const uint8x##w##_t blimit, \ + const uint8x##w##_t limit, const uint8x##w##_t thresh) { \ + uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \ + \ + load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \ + filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \ + q3, &hev, &mask); \ + filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \ + store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \ } -FUN_STORE14(8, _) // store_8x14 -FUN_STORE14(16, q_) // store_16x14 -#undef FUN_STORE14 +FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel +FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel +#undef FUN_HOR_4_KERNEL + +void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec); +} + +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec; + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec); +} + +void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); + store_4x8(s - 2, p, p1, p0, q0, q1); +} + +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, + &s11, &s12, &s13, &s14, &s15); + transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); + s -= 2; + store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0), + vget_low_u8(q1)); + store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0), + vget_high_u8(q1)); +} + +void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + // Note: tranpose + store_8x8() is faster than store_6x8(). + transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); + store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); +} + +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + uint32_t flat_status; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, + &s11, &s12, &s13, &s14, &s15); + transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + // Note: store_6x8() twice is faster than tranpose + store_8x16(). + store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), + vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); + store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), + vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), + vget_high_u8(oq2)); +} #define FUN_LPF_16_KERNEL(name, w) \ static INLINE void lpf_16##name##kernel( \ @@ -784,7 +1038,9 @@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, &s6, &s7); store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); } else { - store_6x8(s + 8, p, op2, op1, op0, oq0, oq1, oq2); + // Note: tranpose + store_8x8() is faster than store_6x8(). + transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); + store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); } } else { store_4x8(s + 6, p, op1, op0, oq0, oq1); @@ -819,6 +1075,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15); } else { + // Note: store_6x8() twice is faster than tranpose + store_8x16(). s += 8; store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 2318fb44b..3d0b41f93 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -39,28 +39,84 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) { return b0; } +// Note: Using 'd' registers or 'q' registers has almost identical speed. We use +// 'q' registers here to save some instructions. +static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5, + uint8x8_t *a6, uint8x8_t *a7) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 + // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 + // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 + // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 + + const uint8x16x2_t b0 = + vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); + const uint8x16x2_t b1 = + vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 + // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 + // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 + // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 + + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + + // Unzip 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c1.val[0])); + const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c1.val[1])); + + *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); + *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); + *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); + *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); + *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); +} + static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) { // Swap 16 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 - // a1: 08 09 10 11 12 13 14 15 - // a2: 16 17 18 19 20 21 22 23 - // a3: 24 25 26 27 28 29 30 31 - // a4: 32 33 34 35 36 37 38 39 - // a5: 40 41 42 43 44 45 46 47 - // a6: 48 49 50 51 52 53 54 55 - // a7: 56 57 58 59 60 61 62 63 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 // to: - // b0.val[0]: 00 08 02 10 04 12 06 14 - // b0.val[1]: 01 09 03 11 05 13 07 15 - // b1.val[0]: 16 24 18 26 20 28 22 30 - // b1.val[1]: 17 25 19 27 21 29 23 31 - // b2.val[0]: 32 40 34 42 36 44 38 46 - // b2.val[1]: 33 41 35 43 37 45 39 47 - // b3.val[0]: 48 56 50 58 52 60 54 62 - // b3.val[1]: 49 57 51 59 53 61 55 63 + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); @@ -68,14 +124,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, const int16x8x2_t b3 = vtrnq_s16(*a6, *a7); // Swap 32 bit elements resulting in: - // c0.val[0]: 00 08 16 24 04 12 20 28 - // c0.val[1]: 02 10 18 26 06 14 22 30 - // c1.val[0]: 01 09 17 25 05 13 21 29 - // c1.val[1]: 03 11 19 27 07 15 23 31 - // c2.val[0]: 32 40 48 56 36 44 52 60 - // c2.val[1]: 34 42 50 58 38 46 54 62 - // c3.val[0]: 33 41 49 57 37 45 53 61 - // c3.val[1]: 35 43 51 59 39 47 55 63 + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), vreinterpretq_s32_s16(b1.val[0])); @@ -87,14 +143,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, vreinterpretq_s32_s16(b3.val[1])); // Swap 64 bit elements resulting in: - // d0.val[0]: 00 08 16 24 32 40 48 56 - // d0.val[1]: 04 12 20 28 36 44 52 60 - // d1.val[0]: 01 09 17 25 33 41 49 57 - // d1.val[1]: 05 13 21 29 37 45 53 61 - // d2.val[0]: 02 10 18 26 34 42 50 58 - // d2.val[1]: 06 14 22 30 38 46 54 62 - // d3.val[0]: 03 11 19 27 35 43 51 59 - // d3.val[1]: 07 15 23 31 39 47 55 63 + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]); const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]); const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]); diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index f469afc4e..b6d7f86a4 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -75,7 +75,7 @@ unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum; variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 + return *sse - ((sum * sum) >> 6); } unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, @@ -83,7 +83,7 @@ unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum; variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 + return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); } unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, @@ -91,7 +91,7 @@ unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum; variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); } unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, @@ -104,7 +104,7 @@ unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, 32, 32, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); } unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, @@ -117,7 +117,7 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); } unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, @@ -141,7 +141,7 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr, diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index def9c8e1b..a78041ce7 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -136,8 +136,8 @@ DSP_SRCS-yes += loopfilter.c DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c -DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/loopfilter_neon.c DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) @@ -145,9 +145,6 @@ DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) else ifeq ($(HAVE_NEON),yes) DSP_SRCS-yes += arm/loopfilter_mb_neon.c -DSP_SRCS-yes += arm/loopfilter_16_neon.c -DSP_SRCS-yes += arm/loopfilter_8_neon.c -DSP_SRCS-yes += arm/loopfilter_4_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 330ae8d6a..cb56ad078 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -3066,17 +3066,7 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, in[6] = load_input_data(input + 192); in[7] = load_input_data(input + 224); - for (i = 8; i < 32; ++i) { - in[i] = _mm_setzero_si128(); - } - array_transpose_8x8(in, in); - // TODO(hkuang): Following transposes are unnecessary. But remove them will - // lead to performance drop on some devices. - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - IDCT32_34 // 1_D: Store 32 intermediate results for each 8x32 block. diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 7bc2693cf..8428e0520 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -61,7 +61,7 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, vpx_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 9); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, @@ -70,7 +70,7 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, vpx_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 10); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); } unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, @@ -79,7 +79,7 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, vpx_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 12); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 12); } unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, @@ -88,7 +88,7 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, vpx_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 11); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, @@ -115,7 +115,7 @@ unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, dst + 32, dst_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; - return *sse - (((int64_t)se * se) >> 12); + return *sse - (uint32_t)(((int64_t)se * se) >> 12); } unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, @@ -125,7 +125,7 @@ unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, unsigned int *sse) { const int se = vpx_sub_pixel_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); - return *sse - (((int64_t)se * se) >> 10); + return *sse - (uint32_t)(((int64_t)se * se) >> 10); } unsigned int vpx_sub_pixel_avg_variance64x64_avx2( @@ -142,7 +142,7 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2( *sse = sse1 + sse2; - return *sse - (((int64_t)se * se) >> 12); + return *sse - (uint32_t)(((int64_t)se * se) >> 12); } unsigned int vpx_sub_pixel_avg_variance32x32_avx2( @@ -151,5 +151,5 @@ unsigned int vpx_sub_pixel_avg_variance32x32_avx2( // Process 32 elements in parallel. const int se = vpx_sub_pixel_avg_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); - return *sse - (((int64_t)se * se) >> 10); + return *sse - (uint32_t)(((int64_t)se * se) >> 10); } @@ -125,30 +125,11 @@ static const arg_def_t deblock = ARG_DEF(NULL, "deblock", 0, "Enable VP8 deblocking"); static const arg_def_t demacroblock_level = ARG_DEF( NULL, "demacroblock-level", 1, "Enable VP8 demacroblocking, w/ level"); -static const arg_def_t pp_debug_info = - ARG_DEF(NULL, "pp-debug-info", 1, "Enable VP8 visible debug info"); -static const arg_def_t pp_disp_ref_frame = - ARG_DEF(NULL, "pp-dbg-ref-frame", 1, - "Display only selected reference frame per macro block"); -static const arg_def_t pp_disp_mb_modes = ARG_DEF( - NULL, "pp-dbg-mb-modes", 1, "Display only selected macro block modes"); -static const arg_def_t pp_disp_b_modes = - ARG_DEF(NULL, "pp-dbg-b-modes", 1, "Display only selected block modes"); -static const arg_def_t pp_disp_mvs = - ARG_DEF(NULL, "pp-dbg-mvs", 1, "Draw only selected motion vectors"); static const arg_def_t mfqe = ARG_DEF(NULL, "mfqe", 0, "Enable multiframe quality enhancement"); -static const arg_def_t *vp8_pp_args[] = { &addnoise_level, - &deblock, - &demacroblock_level, - &pp_debug_info, - &pp_disp_ref_frame, - &pp_disp_mb_modes, - &pp_disp_b_modes, - &pp_disp_mvs, - &mfqe, - NULL }; +static const arg_def_t *vp8_pp_args[] = { &addnoise_level, &deblock, + &demacroblock_level, &mfqe, NULL }; #endif #if CONFIG_LIBYUV @@ -539,10 +520,6 @@ static int main_loop(int argc, const char **argv_) { #endif #if CONFIG_VP8_DECODER vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 }; - int vp8_dbg_color_ref_frame = 0; - int vp8_dbg_color_mb_modes = 0; - int vp8_dbg_color_b_modes = 0; - int vp8_dbg_display_mv = 0; #endif int frames_corrupted = 0; int dec_flags = 0; @@ -647,37 +624,6 @@ static int main_loop(int argc, const char **argv_) { } else if (arg_match(&arg, &mfqe, argi)) { postproc = 1; vp8_pp_cfg.post_proc_flag |= VP8_MFQE; - } else if (arg_match(&arg, &pp_debug_info, argi)) { - unsigned int level = arg_parse_uint(&arg); - - postproc = 1; - vp8_pp_cfg.post_proc_flag &= ~0x7; - - if (level) vp8_pp_cfg.post_proc_flag |= level; - } else if (arg_match(&arg, &pp_disp_ref_frame, argi)) { - unsigned int flags = arg_parse_int(&arg); - if (flags) { - postproc = 1; - vp8_dbg_color_ref_frame = flags; - } - } else if (arg_match(&arg, &pp_disp_mb_modes, argi)) { - unsigned int flags = arg_parse_int(&arg); - if (flags) { - postproc = 1; - vp8_dbg_color_mb_modes = flags; - } - } else if (arg_match(&arg, &pp_disp_b_modes, argi)) { - unsigned int flags = arg_parse_int(&arg); - if (flags) { - postproc = 1; - vp8_dbg_color_b_modes = flags; - } - } else if (arg_match(&arg, &pp_disp_mvs, argi)) { - unsigned int flags = arg_parse_int(&arg); - if (flags) { - postproc = 1; - vp8_dbg_display_mv = flags; - } } else if (arg_match(&arg, &error_concealment, argi)) { ec_enabled = 1; } @@ -789,37 +735,6 @@ static int main_loop(int argc, const char **argv_) { vpx_codec_error(&decoder)); return EXIT_FAILURE; } - - if (vp8_dbg_color_ref_frame && - vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, - vp8_dbg_color_ref_frame)) { - fprintf(stderr, "Failed to configure reference block visualizer: %s\n", - vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } - - if (vp8_dbg_color_mb_modes && - vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, - vp8_dbg_color_mb_modes)) { - fprintf(stderr, "Failed to configure macro block visualizer: %s\n", - vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } - - if (vp8_dbg_color_b_modes && - vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, - vp8_dbg_color_b_modes)) { - fprintf(stderr, "Failed to configure block visualizer: %s\n", - vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } - - if (vp8_dbg_display_mv && - vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv)) { - fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", - vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } #endif if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); |