summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--examples/decode_with_drops.c4
-rw-r--r--examples/set_maps.c4
-rw-r--r--examples/simple_encoder.c10
-rw-r--r--examples/twopass_encoder.c6
-rw-r--r--examples/vp8cx_set_ref.c4
-rw-r--r--examples/vp9_lossless_encoder.c4
-rw-r--r--examples/vp9cx_set_ref.c4
-rw-r--r--examples/vpx_temporal_svc_encoder.c16
-rw-r--r--test/idct_test.cc4
-rw-r--r--test/predict_test.cc381
-rw-r--r--test/sixtap_predict_test.cc231
-rw-r--r--test/test.mk2
-rw-r--r--vp8/common/arm/neon/dequant_idct_neon.c10
-rw-r--r--vp8/common/arm/neon/shortidct4x4llm_neon.c10
-rw-r--r--vp8/common/arm/neon/sixtappredict_neon.c392
-rw-r--r--vp8/common/filter.c31
-rw-r--r--vp8/common/onyx.h2
-rw-r--r--vp8/common/onyxc_int.h4
-rw-r--r--vp8/common/postproc.c402
-rw-r--r--vp8/common/rtcd_defs.pl4
-rw-r--r--vp8/common/textblit.c118
-rw-r--r--vp8/common/x86/subpixel_sse2.asm8
-rw-r--r--vp8/common/x86/subpixel_ssse3.asm6
-rw-r--r--vp8/encoder/bitstream.c4
-rw-r--r--vp8/encoder/onyx_if.c10
-rw-r--r--vp8/encoder/onyx_int.h2
-rw-r--r--vp8/encoder/pickinter.c2
-rw-r--r--vp8/vp8_common.mk1
-rw-r--r--vp8/vp8_cx_iface.c2
-rw-r--r--vp8/vp8_dx_iface.c73
-rw-r--r--vp9/common/vp9_idct.c37
-rw-r--r--vp9/common/vp9_postproc.c1
-rw-r--r--vp9/common/vp9_rtcd_defs.pl27
-rw-r--r--vp9/common/vp9_textblit.c119
-rw-r--r--vp9/common/vp9_textblit.h27
-rw-r--r--vp9/decoder/vp9_decodeframe.c91
-rw-r--r--vp9/decoder/vp9_decodemv.c8
-rw-r--r--vp9/decoder/vp9_decodemv.h4
-rw-r--r--vp9/decoder/vp9_detokenize.c155
-rw-r--r--vp9/decoder/vp9_detokenize.h4
-rw-r--r--vp9/encoder/vp9_encodemv.c5
-rw-r--r--vp9/encoder/vp9_picklpf.c4
-rw-r--r--vp9/vp9_common.mk2
-rw-r--r--vp9/vp9_dx_iface.c11
-rw-r--r--vpx/vp8.h17
-rw-r--r--vpx_dsp/arm/loopfilter_16_neon.c173
-rw-r--r--vpx_dsp/arm/loopfilter_4_neon.c249
-rw-r--r--vpx_dsp/arm/loopfilter_8_neon.c445
-rw-r--r--vpx_dsp/arm/loopfilter_mb_neon.c437
-rw-r--r--vpx_dsp/arm/transpose_neon.h118
-rw-r--r--vpx_dsp/arm/variance_neon.c12
-rw-r--r--vpx_dsp/vpx_dsp.mk5
-rw-r--r--vpx_dsp/x86/inv_txfm_sse2.c10
-rw-r--r--vpx_dsp/x86/variance_avx2.c16
-rw-r--r--vpxdec.c89
55 files changed, 1515 insertions, 2302 deletions
diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c
index 29b8be941..e69e2a9f9 100644
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c
@@ -92,8 +92,8 @@ int main(int argc, char **argv) {
if (!(outfile = fopen(argv[2], "wb")))
die("Failed to open %s for writing.", argv[2]);
- n = strtol(argv[3], &nptr, 0);
- m = strtol(nptr + 1, NULL, 0);
+ n = (int)strtol(argv[3], &nptr, 0);
+ m = (int)strtol(nptr + 1, NULL, 0);
is_range = (*nptr == '-');
if (!n || !m || (*nptr != '-' && *nptr != '/'))
die("Couldn't parse pattern %s.\n", argv[3]);
diff --git a/examples/set_maps.c b/examples/set_maps.c
index d128e7d9a..c0c7d10e7 100644
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -174,8 +174,8 @@ int main(int argc, char **argv) {
}
assert(encoder != NULL);
info.codec_fourcc = encoder->fourcc;
- info.frame_width = strtol(argv[2], NULL, 0);
- info.frame_height = strtol(argv[3], NULL, 0);
+ info.frame_width = (int)strtol(argv[2], NULL, 0);
+ info.frame_height = (int)strtol(argv[3], NULL, 0);
info.time_base.numerator = 1;
info.time_base.denominator = fps;
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 8632f179b..dde6344f8 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -175,14 +175,14 @@ int main(int argc, char **argv) {
infile_arg = argv[4];
outfile_arg = argv[5];
keyframe_interval_arg = argv[6];
- max_frames = strtol(argv[8], NULL, 0);
+ max_frames = (int)strtol(argv[8], NULL, 0);
encoder = get_vpx_encoder_by_name(codec_arg);
if (!encoder) die("Unsupported codec.");
info.codec_fourcc = encoder->fourcc;
- info.frame_width = strtol(width_arg, NULL, 0);
- info.frame_height = strtol(height_arg, NULL, 0);
+ info.frame_width = (int)strtol(width_arg, NULL, 0);
+ info.frame_height = (int)strtol(height_arg, NULL, 0);
info.time_base.numerator = 1;
info.time_base.denominator = fps;
@@ -196,7 +196,7 @@ int main(int argc, char **argv) {
die("Failed to allocate image.");
}
- keyframe_interval = strtol(keyframe_interval_arg, NULL, 0);
+ keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0);
if (keyframe_interval < 0) die("Invalid keyframe interval value.");
printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
@@ -209,7 +209,7 @@ int main(int argc, char **argv) {
cfg.g_timebase.num = info.time_base.numerator;
cfg.g_timebase.den = info.time_base.denominator;
cfg.rc_target_bitrate = bitrate;
- cfg.g_error_resilient = strtol(argv[7], NULL, 0);
+ cfg.g_error_resilient = (vpx_codec_er_flags_t)strtoul(argv[7], NULL, 0);
writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
if (!writer) die("Failed to open %s for writing.", outfile_arg);
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index 4c130ec18..4e63a7a6c 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -209,13 +209,13 @@ int main(int argc, char **argv) {
if (argc != 7) die("Invalid number of arguments.");
- max_frames = strtol(argv[6], NULL, 0);
+ max_frames = (int)strtol(argv[6], NULL, 0);
encoder = get_vpx_encoder_by_name(codec_arg);
if (!encoder) die("Unsupported codec.");
- w = strtol(width_arg, NULL, 0);
- h = strtol(height_arg, NULL, 0);
+ w = (int)strtol(width_arg, NULL, 0);
+ h = (int)strtol(height_arg, NULL, 0);
if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
die("Invalid frame size: %dx%d", w, h);
diff --git a/examples/vp8cx_set_ref.c b/examples/vp8cx_set_ref.c
index fc7bdab39..846477c61 100644
--- a/examples/vp8cx_set_ref.c
+++ b/examples/vp8cx_set_ref.c
@@ -122,8 +122,8 @@ int main(int argc, char **argv) {
if (!update_frame_num) die("Couldn't parse frame number '%s'\n", argv[5]);
info.codec_fourcc = encoder->fourcc;
- info.frame_width = strtol(argv[1], NULL, 0);
- info.frame_height = strtol(argv[2], NULL, 0);
+ info.frame_width = (int)strtol(argv[1], NULL, 0);
+ info.frame_height = (int)strtol(argv[2], NULL, 0);
info.time_base.numerator = 1;
info.time_base.denominator = fps;
diff --git a/examples/vp9_lossless_encoder.c b/examples/vp9_lossless_encoder.c
index 5802186bf..cb5ca6bfe 100644
--- a/examples/vp9_lossless_encoder.c
+++ b/examples/vp9_lossless_encoder.c
@@ -78,8 +78,8 @@ int main(int argc, char **argv) {
if (!encoder) die("Unsupported codec.");
info.codec_fourcc = encoder->fourcc;
- info.frame_width = strtol(argv[1], NULL, 0);
- info.frame_height = strtol(argv[2], NULL, 0);
+ info.frame_width = (int)strtol(argv[1], NULL, 0);
+ info.frame_height = (int)strtol(argv[2], NULL, 0);
info.time_base.numerator = 1;
info.time_base.denominator = fps;
diff --git a/examples/vp9cx_set_ref.c b/examples/vp9cx_set_ref.c
index e0bb795f7..798d7e3f2 100644
--- a/examples/vp9cx_set_ref.c
+++ b/examples/vp9cx_set_ref.c
@@ -335,8 +335,8 @@ int main(int argc, char **argv) {
}
info.codec_fourcc = encoder->fourcc;
- info.frame_width = strtol(width_arg, NULL, 0);
- info.frame_height = strtol(height_arg, NULL, 0);
+ info.frame_width = (int)strtol(width_arg, NULL, 0);
+ info.frame_height = (int)strtol(height_arg, NULL, 0);
info.time_base.numerator = 1;
info.time_base.denominator = fps;
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index 4a3387787..309a2fe2e 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -547,13 +547,13 @@ int main(int argc, char **argv) {
printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
- width = strtol(argv[4], NULL, 0);
- height = strtol(argv[5], NULL, 0);
+ width = (unsigned int)strtoul(argv[4], NULL, 0);
+ height = (unsigned int)strtoul(argv[5], NULL, 0);
if (width < 16 || width % 2 || height < 16 || height % 2) {
die("Invalid resolution: %d x %d", width, height);
}
- layering_mode = strtol(argv[10], NULL, 0);
+ layering_mode = (int)strtol(argv[10], NULL, 0);
if (layering_mode < 0 || layering_mode > 13) {
die("Invalid layering mode (0..12) %s", argv[10]);
}
@@ -609,17 +609,17 @@ int main(int argc, char **argv) {
#endif // CONFIG_VP9_HIGHBITDEPTH
// Timebase format e.g. 30fps: numerator=1, demoninator = 30.
- cfg.g_timebase.num = strtol(argv[6], NULL, 0);
- cfg.g_timebase.den = strtol(argv[7], NULL, 0);
+ cfg.g_timebase.num = (int)strtol(argv[6], NULL, 0);
+ cfg.g_timebase.den = (int)strtol(argv[7], NULL, 0);
- speed = strtol(argv[8], NULL, 0);
+ speed = (int)strtol(argv[8], NULL, 0);
if (speed < 0) {
die("Invalid speed setting: must be positive");
}
for (i = min_args_base;
(int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) {
- rc.layer_target_bitrate[i - 11] = strtol(argv[i], NULL, 0);
+ rc.layer_target_bitrate[i - 11] = (int)strtol(argv[i], NULL, 0);
if (strncmp(encoder->name, "vp8", 3) == 0)
cfg.ts_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11];
else if (strncmp(encoder->name, "vp9", 3) == 0)
@@ -627,7 +627,7 @@ int main(int argc, char **argv) {
}
// Real time parameters.
- cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0);
+ cfg.rc_dropframe_thresh = (unsigned int)strtoul(argv[9], NULL, 0);
cfg.rc_end_usage = VPX_CBR;
cfg.rc_min_quantizer = 2;
cfg.rc_max_quantizer = 56;
diff --git a/test/idct_test.cc b/test/idct_test.cc
index f54f2c005..700da77e3 100644
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -115,6 +115,10 @@ TEST_P(IDCTTest, TestWithData) {
}
INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IDCTTest,
+ ::testing::Values(vp8_short_idct4x4llm_neon));
+#endif
#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
::testing::Values(vp8_short_idct4x4llm_mmx));
diff --git a/test/predict_test.cc b/test/predict_test.cc
new file mode 100644
index 000000000..f06e4dbb2
--- /dev/null
+++ b/test/predict_test.cc
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+using libvpx_test::ACMRandom;
+using std::tr1::make_tuple;
+
+typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, uint8_t *dst_ptr,
+ int dst_pitch);
+
+typedef std::tr1::tuple<int, int, PredictFunc> PredictParam;
+
+class PredictTestBase : public ::testing::TestWithParam<PredictParam> {
+ public:
+ PredictTestBase()
+ : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)),
+ src_(NULL), padded_dst_(NULL), dst_(NULL), dst_c_(NULL) {}
+
+ virtual void SetUp() {
+ src_ = new uint8_t[kSrcSize];
+ ASSERT_TRUE(src_ != NULL);
+
+ // padded_dst_ provides a buffer of kBorderSize around the destination
+ // memory to facilitate detecting out of bounds writes.
+ dst_stride_ = kBorderSize + width_ + kBorderSize;
+ padded_dst_size_ = dst_stride_ * (kBorderSize + height_ + kBorderSize);
+ padded_dst_ =
+ reinterpret_cast<uint8_t *>(vpx_memalign(16, padded_dst_size_));
+ ASSERT_TRUE(padded_dst_ != NULL);
+ dst_ = padded_dst_ + (kBorderSize * dst_stride_) + kBorderSize;
+
+ dst_c_ = new uint8_t[16 * 16];
+ ASSERT_TRUE(dst_c_ != NULL);
+
+ memset(src_, 0, kSrcSize);
+ memset(padded_dst_, 128, padded_dst_size_);
+ memset(dst_c_, 0, 16 * 16);
+ }
+
+ virtual void TearDown() {
+ delete[] src_;
+ src_ = NULL;
+ vpx_free(padded_dst_);
+ padded_dst_ = NULL;
+ dst_ = NULL;
+ delete[] dst_c_;
+ dst_c_ = NULL;
+ libvpx_test::ClearSystemState();
+ }
+
+ protected:
+ // Make reference arrays big enough for 16x16 functions. Six-tap filters need
+ // 5 extra pixels outside of the macroblock.
+ static const int kSrcStride = 21;
+ static const int kSrcSize = kSrcStride * kSrcStride;
+ static const int kBorderSize = 16;
+
+ int width_;
+ int height_;
+ PredictFunc predict_;
+ uint8_t *src_;
+ uint8_t *padded_dst_;
+ uint8_t *dst_;
+ int padded_dst_size_;
+ uint8_t *dst_c_;
+ int dst_stride_;
+
+ bool CompareBuffers(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride) const {
+ for (int height = 0; height < height_; ++height) {
+ EXPECT_EQ(0, memcmp(a + height * a_stride, b + height * b_stride,
+ sizeof(*a) * width_))
+ << "Row " << height << " does not match.";
+ }
+
+ return !HasFailure();
+ }
+
+ // Given a block of memory 'a' with size 'a_size', determine if all regions
+ // excepting block 'b' described by 'b_stride', 'b_height', and 'b_width'
+ // match pixel value 'c'.
+ bool CheckBorder(const uint8_t *a, int a_size, const uint8_t *b, int b_width,
+ int b_height, int b_stride, uint8_t c) const {
+ const uint8_t *a_end = a + a_size;
+ const int b_size = (b_stride * b_height) + b_width;
+ const uint8_t *b_end = b + b_size;
+ const int left_border = (b_stride - b_width) / 2;
+ const int right_border = left_border + ((b_stride - b_width) % 2);
+
+ EXPECT_GE(b - left_border, a) << "'b' does not start within 'a'";
+ EXPECT_LE(b_end + right_border, a_end) << "'b' does not end within 'a'";
+
+ // Top border.
+ for (int pixel = 0; pixel < b - a - left_border; ++pixel) {
+ EXPECT_EQ(c, a[pixel]) << "Mismatch at " << pixel << " in top border.";
+ }
+
+ // Left border.
+ for (int height = 0; height < b_height; ++height) {
+ for (int width = left_border; width > 0; --width) {
+ EXPECT_EQ(c, b[height * b_stride - width])
+ << "Mismatch at row " << height << " column " << left_border - width
+ << " in left border.";
+ }
+ }
+
+ // Right border.
+ for (int height = 0; height < b_height; ++height) {
+ for (int width = b_width; width < b_width + right_border; ++width) {
+ EXPECT_EQ(c, b[height * b_stride + width])
+ << "Mismatch at row " << height << " column " << width - b_width
+ << " in right border.";
+ }
+ }
+
+ // Bottom border.
+ for (int pixel = static_cast<int>(b - a + b_size); pixel < a_size;
+ ++pixel) {
+ EXPECT_EQ(c, a[pixel]) << "Mismatch at " << pixel << " in bottom border.";
+ }
+
+ return !HasFailure();
+ }
+
+ void TestWithRandomData(PredictFunc reference) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ // Run tests for almost all possible offsets.
+ for (int xoffset = 0; xoffset < 8; ++xoffset) {
+ for (int yoffset = 0; yoffset < 8; ++yoffset) {
+ if (xoffset == 0 && yoffset == 0) {
+ // This represents a copy which is not required to be handled by this
+ // module.
+ continue;
+ }
+
+ for (int i = 0; i < kSrcSize; ++i) {
+ src_[i] = rnd.Rand8();
+ }
+ reference(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset,
+ dst_c_, 16);
+
+ ASM_REGISTER_STATE_CHECK(predict_(&src_[kSrcStride * 2 + 2], kSrcStride,
+ xoffset, yoffset, dst_, dst_stride_));
+
+ ASSERT_TRUE(CompareBuffers(dst_c_, 16, dst_, dst_stride_));
+ ASSERT_TRUE(CheckBorder(padded_dst_, padded_dst_size_, dst_, width_,
+ height_, dst_stride_, 128));
+ }
+ }
+ }
+
+ void TestWithUnalignedDst(PredictFunc reference) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ // Only the 4x4 need to be able to handle unaligned writes.
+ if (width_ == 4 && height_ == 4) {
+ for (int xoffset = 0; xoffset < 8; ++xoffset) {
+ for (int yoffset = 0; yoffset < 8; ++yoffset) {
+ if (xoffset == 0 && yoffset == 0) {
+ continue;
+ }
+ for (int i = 0; i < kSrcSize; ++i) {
+ src_[i] = rnd.Rand8();
+ }
+ reference(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset,
+ dst_c_, 16);
+
+ for (int i = 1; i < 4; ++i) {
+ memset(padded_dst_, 128, padded_dst_size_);
+
+ ASM_REGISTER_STATE_CHECK(predict_(&src_[kSrcStride * 2 + 2],
+ kSrcStride, xoffset, yoffset,
+ dst_ + i, dst_stride_ + i));
+
+ ASSERT_TRUE(CompareBuffers(dst_c_, 16, dst_ + i, dst_stride_ + i));
+ ASSERT_TRUE(CheckBorder(padded_dst_, padded_dst_size_, dst_ + i,
+ width_, height_, dst_stride_ + i, 128));
+ }
+ }
+ }
+ }
+ }
+};
+
+class SixtapPredictTest : public PredictTestBase {};
+
+TEST_P(SixtapPredictTest, TestWithRandomData) {
+ TestWithRandomData(vp8_sixtap_predict16x16_c);
+}
+TEST_P(SixtapPredictTest, TestWithUnalignedDst) {
+ TestWithUnalignedDst(vp8_sixtap_predict16x16_c);
+}
+
+TEST_P(SixtapPredictTest, TestWithPresetData) {
+ // Test input
+ static const uint8_t kTestData[kSrcSize] = {
+ 184, 4, 191, 82, 92, 41, 0, 1, 226, 236, 172, 20, 182, 42, 226,
+ 177, 79, 94, 77, 179, 203, 206, 198, 22, 192, 19, 75, 17, 192, 44,
+ 233, 120, 48, 168, 203, 141, 210, 203, 143, 180, 184, 59, 201, 110, 102,
+ 171, 32, 182, 10, 109, 105, 213, 60, 47, 236, 253, 67, 55, 14, 3,
+ 99, 247, 124, 148, 159, 71, 34, 114, 19, 177, 38, 203, 237, 239, 58,
+ 83, 155, 91, 10, 166, 201, 115, 124, 5, 163, 104, 2, 231, 160, 16,
+ 234, 4, 8, 103, 153, 167, 174, 187, 26, 193, 109, 64, 141, 90, 48,
+ 200, 174, 204, 36, 184, 114, 237, 43, 238, 242, 207, 86, 245, 182, 247,
+ 6, 161, 251, 14, 8, 148, 182, 182, 79, 208, 120, 188, 17, 6, 23,
+ 65, 206, 197, 13, 242, 126, 128, 224, 170, 110, 211, 121, 197, 200, 47,
+ 188, 207, 208, 184, 221, 216, 76, 148, 143, 156, 100, 8, 89, 117, 14,
+ 112, 183, 221, 54, 197, 208, 180, 69, 176, 94, 180, 131, 215, 121, 76,
+ 7, 54, 28, 216, 238, 249, 176, 58, 142, 64, 215, 242, 72, 49, 104,
+ 87, 161, 32, 52, 216, 230, 4, 141, 44, 181, 235, 224, 57, 195, 89,
+ 134, 203, 144, 162, 163, 126, 156, 84, 185, 42, 148, 145, 29, 221, 194,
+ 134, 52, 100, 166, 105, 60, 140, 110, 201, 184, 35, 181, 153, 93, 121,
+ 243, 227, 68, 131, 134, 232, 2, 35, 60, 187, 77, 209, 76, 106, 174,
+ 15, 241, 227, 115, 151, 77, 175, 36, 187, 121, 221, 223, 47, 118, 61,
+ 168, 105, 32, 237, 236, 167, 213, 238, 202, 17, 170, 24, 226, 247, 131,
+ 145, 6, 116, 117, 121, 11, 194, 41, 48, 126, 162, 13, 93, 209, 131,
+ 154, 122, 237, 187, 103, 217, 99, 60, 200, 45, 78, 115, 69, 49, 106,
+ 200, 194, 112, 60, 56, 234, 72, 251, 19, 120, 121, 182, 134, 215, 135,
+ 10, 114, 2, 247, 46, 105, 209, 145, 165, 153, 191, 243, 12, 5, 36,
+ 119, 206, 231, 231, 11, 32, 209, 83, 27, 229, 204, 149, 155, 83, 109,
+ 35, 93, 223, 37, 84, 14, 142, 37, 160, 52, 191, 96, 40, 204, 101,
+ 77, 67, 52, 53, 43, 63, 85, 253, 147, 113, 226, 96, 6, 125, 179,
+ 115, 161, 17, 83, 198, 101, 98, 85, 139, 3, 137, 75, 99, 178, 23,
+ 201, 255, 91, 253, 52, 134, 60, 138, 131, 208, 251, 101, 48, 2, 227,
+ 228, 118, 132, 245, 202, 75, 91, 44, 160, 231, 47, 41, 50, 147, 220,
+ 74, 92, 219, 165, 89, 16
+ };
+
+ // Expected results for xoffset = 2 and yoffset = 2.
+ static const int kExpectedDstStride = 16;
+ static const uint8_t kExpectedDst[256] = {
+ 117, 102, 74, 135, 42, 98, 175, 206, 70, 73, 222, 197, 50, 24, 39,
+ 49, 38, 105, 90, 47, 169, 40, 171, 215, 200, 73, 109, 141, 53, 85,
+ 177, 164, 79, 208, 124, 89, 212, 18, 81, 145, 151, 164, 217, 153, 91,
+ 154, 102, 102, 159, 75, 164, 152, 136, 51, 213, 219, 186, 116, 193, 224,
+ 186, 36, 231, 208, 84, 211, 155, 167, 35, 59, 42, 76, 216, 149, 73,
+ 201, 78, 149, 184, 100, 96, 196, 189, 198, 188, 235, 195, 117, 129, 120,
+ 129, 49, 25, 133, 113, 69, 221, 114, 70, 143, 99, 157, 108, 189, 140,
+ 78, 6, 55, 65, 240, 255, 245, 184, 72, 90, 100, 116, 131, 39, 60,
+ 234, 167, 33, 160, 88, 185, 200, 157, 159, 176, 127, 151, 138, 102, 168,
+ 106, 170, 86, 82, 219, 189, 76, 33, 115, 197, 106, 96, 198, 136, 97,
+ 141, 237, 151, 98, 137, 191, 185, 2, 57, 95, 142, 91, 255, 185, 97,
+ 137, 76, 162, 94, 173, 131, 193, 161, 81, 106, 72, 135, 222, 234, 137,
+ 66, 137, 106, 243, 210, 147, 95, 15, 137, 110, 85, 66, 16, 96, 167,
+ 147, 150, 173, 203, 140, 118, 196, 84, 147, 160, 19, 95, 101, 123, 74,
+ 132, 202, 82, 166, 12, 131, 166, 189, 170, 159, 85, 79, 66, 57, 152,
+ 132, 203, 194, 0, 1, 56, 146, 180, 224, 156, 28, 83, 181, 79, 76,
+ 80, 46, 160, 175, 59, 106, 43, 87, 75, 136, 85, 189, 46, 71, 200,
+ 90
+ };
+
+ ASM_REGISTER_STATE_CHECK(
+ predict_(const_cast<uint8_t *>(kTestData) + kSrcStride * 2 + 2,
+ kSrcStride, 2, 2, dst_, dst_stride_));
+
+ ASSERT_TRUE(
+ CompareBuffers(kExpectedDst, kExpectedDstStride, dst_, dst_stride_));
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, SixtapPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c),
+ make_tuple(8, 8, &vp8_sixtap_predict8x8_c),
+ make_tuple(8, 4, &vp8_sixtap_predict8x4_c),
+ make_tuple(4, 4, &vp8_sixtap_predict4x4_c)));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, SixtapPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),
+ make_tuple(8, 8, &vp8_sixtap_predict8x8_neon),
+ make_tuple(8, 4, &vp8_sixtap_predict8x4_neon),
+ make_tuple(4, 4, &vp8_sixtap_predict4x4_neon)));
+#endif
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(
+ MMX, SixtapPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx),
+ make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx),
+ make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx),
+ make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
+#endif
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, SixtapPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2),
+ make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2),
+ make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2)));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, SixtapPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3),
+ make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3),
+ make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3),
+ make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3)));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+ MSA, SixtapPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa),
+ make_tuple(8, 8, &vp8_sixtap_predict8x8_msa),
+ make_tuple(8, 4, &vp8_sixtap_predict8x4_msa),
+ make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
+#endif
+
+class BilinearPredictTest : public PredictTestBase {};
+
+TEST_P(BilinearPredictTest, TestWithRandomData) {
+ TestWithRandomData(vp8_bilinear_predict16x16_c);
+}
+TEST_P(BilinearPredictTest, TestWithUnalignedDst) {
+ TestWithUnalignedDst(vp8_bilinear_predict16x16_c);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, BilinearPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_c),
+ make_tuple(8, 8, &vp8_bilinear_predict8x8_c),
+ make_tuple(8, 4, &vp8_bilinear_predict8x4_c),
+ make_tuple(4, 4, &vp8_bilinear_predict4x4_c)));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, BilinearPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_neon),
+ make_tuple(8, 8, &vp8_bilinear_predict8x8_neon),
+ make_tuple(8, 4, &vp8_bilinear_predict8x4_neon),
+ make_tuple(4, 4, &vp8_bilinear_predict4x4_neon)));
+#endif
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(
+ MMX, BilinearPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_mmx),
+ make_tuple(8, 8, &vp8_bilinear_predict8x8_mmx),
+ make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx),
+ make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
+#endif
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, BilinearPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2),
+ make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2)));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, BilinearPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_ssse3),
+ make_tuple(8, 8, &vp8_bilinear_predict8x8_ssse3)));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+ MSA, BilinearPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_msa),
+ make_tuple(8, 8, &vp8_bilinear_predict8x8_msa),
+ make_tuple(8, 4, &vp8_bilinear_predict8x4_msa),
+ make_tuple(4, 4, &vp8_bilinear_predict4x4_msa)));
+#endif
+} // namespace
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
deleted file mode 100644
index 31a604417..000000000
--- a/test/sixtap_predict_test.cc
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_mem/vpx_mem.h"
-
-namespace {
-
-typedef void (*SixtapPredictFunc)(uint8_t *src_ptr, int src_pixels_per_line,
- int xoffset, int yoffset, uint8_t *dst_ptr,
- int dst_pitch);
-
-typedef std::tr1::tuple<int, int, SixtapPredictFunc> SixtapPredictParam;
-
-class SixtapPredictTest : public ::testing::TestWithParam<SixtapPredictParam> {
- public:
- static void SetUpTestCase() {
- src_ = reinterpret_cast<uint8_t *>(vpx_memalign(kDataAlignment, kSrcSize));
- dst_ = reinterpret_cast<uint8_t *>(vpx_memalign(kDataAlignment, kDstSize));
- dst_c_ =
- reinterpret_cast<uint8_t *>(vpx_memalign(kDataAlignment, kDstSize));
- }
-
- static void TearDownTestCase() {
- vpx_free(src_);
- src_ = NULL;
- vpx_free(dst_);
- dst_ = NULL;
- vpx_free(dst_c_);
- dst_c_ = NULL;
- }
-
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
- // Make test arrays big enough for 16x16 functions. Six-tap filters
- // need 5 extra pixels outside of the macroblock.
- static const int kSrcStride = 21;
- static const int kDstStride = 16;
- static const int kDataAlignment = 16;
- static const int kSrcSize = kSrcStride * kSrcStride + 1;
- static const int kDstSize = kDstStride * kDstStride;
-
- virtual void SetUp() {
- width_ = GET_PARAM(0);
- height_ = GET_PARAM(1);
- sixtap_predict_ = GET_PARAM(2);
- memset(src_, 0, kSrcSize);
- memset(dst_, 0, kDstSize);
- memset(dst_c_, 0, kDstSize);
- }
-
- int width_;
- int height_;
- SixtapPredictFunc sixtap_predict_;
- // The src stores the macroblock we will filter on, and makes it 1 byte larger
- // in order to test unaligned access. The result is stored in dst and dst_c(c
- // reference code result).
- static uint8_t *src_;
- static uint8_t *dst_;
- static uint8_t *dst_c_;
-};
-
-uint8_t *SixtapPredictTest::src_ = NULL;
-uint8_t *SixtapPredictTest::dst_ = NULL;
-uint8_t *SixtapPredictTest::dst_c_ = NULL;
-
-TEST_P(SixtapPredictTest, TestWithPresetData) {
- // Test input
- static const uint8_t test_data[kSrcSize] = {
- 216, 184, 4, 191, 82, 92, 41, 0, 1, 226, 236, 172, 20, 182, 42,
- 226, 177, 79, 94, 77, 179, 203, 206, 198, 22, 192, 19, 75, 17, 192,
- 44, 233, 120, 48, 168, 203, 141, 210, 203, 143, 180, 184, 59, 201, 110,
- 102, 171, 32, 182, 10, 109, 105, 213, 60, 47, 236, 253, 67, 55, 14,
- 3, 99, 247, 124, 148, 159, 71, 34, 114, 19, 177, 38, 203, 237, 239,
- 58, 83, 155, 91, 10, 166, 201, 115, 124, 5, 163, 104, 2, 231, 160,
- 16, 234, 4, 8, 103, 153, 167, 174, 187, 26, 193, 109, 64, 141, 90,
- 48, 200, 174, 204, 36, 184, 114, 237, 43, 238, 242, 207, 86, 245, 182,
- 247, 6, 161, 251, 14, 8, 148, 182, 182, 79, 208, 120, 188, 17, 6,
- 23, 65, 206, 197, 13, 242, 126, 128, 224, 170, 110, 211, 121, 197, 200,
- 47, 188, 207, 208, 184, 221, 216, 76, 148, 143, 156, 100, 8, 89, 117,
- 14, 112, 183, 221, 54, 197, 208, 180, 69, 176, 94, 180, 131, 215, 121,
- 76, 7, 54, 28, 216, 238, 249, 176, 58, 142, 64, 215, 242, 72, 49,
- 104, 87, 161, 32, 52, 216, 230, 4, 141, 44, 181, 235, 224, 57, 195,
- 89, 134, 203, 144, 162, 163, 126, 156, 84, 185, 42, 148, 145, 29, 221,
- 194, 134, 52, 100, 166, 105, 60, 140, 110, 201, 184, 35, 181, 153, 93,
- 121, 243, 227, 68, 131, 134, 232, 2, 35, 60, 187, 77, 209, 76, 106,
- 174, 15, 241, 227, 115, 151, 77, 175, 36, 187, 121, 221, 223, 47, 118,
- 61, 168, 105, 32, 237, 236, 167, 213, 238, 202, 17, 170, 24, 226, 247,
- 131, 145, 6, 116, 117, 121, 11, 194, 41, 48, 126, 162, 13, 93, 209,
- 131, 154, 122, 237, 187, 103, 217, 99, 60, 200, 45, 78, 115, 69, 49,
- 106, 200, 194, 112, 60, 56, 234, 72, 251, 19, 120, 121, 182, 134, 215,
- 135, 10, 114, 2, 247, 46, 105, 209, 145, 165, 153, 191, 243, 12, 5,
- 36, 119, 206, 231, 231, 11, 32, 209, 83, 27, 229, 204, 149, 155, 83,
- 109, 35, 93, 223, 37, 84, 14, 142, 37, 160, 52, 191, 96, 40, 204,
- 101, 77, 67, 52, 53, 43, 63, 85, 253, 147, 113, 226, 96, 6, 125,
- 179, 115, 161, 17, 83, 198, 101, 98, 85, 139, 3, 137, 75, 99, 178,
- 23, 201, 255, 91, 253, 52, 134, 60, 138, 131, 208, 251, 101, 48, 2,
- 227, 228, 118, 132, 245, 202, 75, 91, 44, 160, 231, 47, 41, 50, 147,
- 220, 74, 92, 219, 165, 89, 16
- };
-
- // Expected result
- static const uint8_t expected_dst[kDstSize] = {
- 117, 102, 74, 135, 42, 98, 175, 206, 70, 73, 222, 197, 50, 24, 39,
- 49, 38, 105, 90, 47, 169, 40, 171, 215, 200, 73, 109, 141, 53, 85,
- 177, 164, 79, 208, 124, 89, 212, 18, 81, 145, 151, 164, 217, 153, 91,
- 154, 102, 102, 159, 75, 164, 152, 136, 51, 213, 219, 186, 116, 193, 224,
- 186, 36, 231, 208, 84, 211, 155, 167, 35, 59, 42, 76, 216, 149, 73,
- 201, 78, 149, 184, 100, 96, 196, 189, 198, 188, 235, 195, 117, 129, 120,
- 129, 49, 25, 133, 113, 69, 221, 114, 70, 143, 99, 157, 108, 189, 140,
- 78, 6, 55, 65, 240, 255, 245, 184, 72, 90, 100, 116, 131, 39, 60,
- 234, 167, 33, 160, 88, 185, 200, 157, 159, 176, 127, 151, 138, 102, 168,
- 106, 170, 86, 82, 219, 189, 76, 33, 115, 197, 106, 96, 198, 136, 97,
- 141, 237, 151, 98, 137, 191, 185, 2, 57, 95, 142, 91, 255, 185, 97,
- 137, 76, 162, 94, 173, 131, 193, 161, 81, 106, 72, 135, 222, 234, 137,
- 66, 137, 106, 243, 210, 147, 95, 15, 137, 110, 85, 66, 16, 96, 167,
- 147, 150, 173, 203, 140, 118, 196, 84, 147, 160, 19, 95, 101, 123, 74,
- 132, 202, 82, 166, 12, 131, 166, 189, 170, 159, 85, 79, 66, 57, 152,
- 132, 203, 194, 0, 1, 56, 146, 180, 224, 156, 28, 83, 181, 79, 76,
- 80, 46, 160, 175, 59, 106, 43, 87, 75, 136, 85, 189, 46, 71, 200,
- 90
- };
-
- uint8_t *src = const_cast<uint8_t *>(test_data);
-
- ASM_REGISTER_STATE_CHECK(sixtap_predict_(&src[kSrcStride * 2 + 2 + 1],
- kSrcStride, 2, 2, dst_, kDstStride));
-
- for (int i = 0; i < height_; ++i) {
- for (int j = 0; j < width_; ++j)
- ASSERT_EQ(expected_dst[i * kDstStride + j], dst_[i * kDstStride + j])
- << "i==" << (i * width_ + j);
- }
-}
-
-using libvpx_test::ACMRandom;
-
-TEST_P(SixtapPredictTest, TestWithRandomData) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- for (int i = 0; i < kSrcSize; ++i) src_[i] = rnd.Rand8();
-
- // Run tests for all possible offsets.
- for (int xoffset = 0; xoffset < 8; ++xoffset) {
- for (int yoffset = 0; yoffset < 8; ++yoffset) {
- // Call c reference function.
- // Move start point to next pixel to test if the function reads
- // unaligned data correctly.
- vp8_sixtap_predict16x16_c(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
- xoffset, yoffset, dst_c_, kDstStride);
-
- // Run test.
- ASM_REGISTER_STATE_CHECK(sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1],
- kSrcStride, xoffset, yoffset,
- dst_, kDstStride));
-
- for (int i = 0; i < height_; ++i) {
- for (int j = 0; j < width_; ++j)
- ASSERT_EQ(dst_c_[i * kDstStride + j], dst_[i * kDstStride + j])
- << "i==" << (i * width_ + j);
- }
- }
- }
-}
-
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(
- C, SixtapPredictTest,
- ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c),
- make_tuple(8, 8, &vp8_sixtap_predict8x8_c),
- make_tuple(8, 4, &vp8_sixtap_predict8x4_c),
- make_tuple(4, 4, &vp8_sixtap_predict4x4_c)));
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
- NEON, SixtapPredictTest,
- ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),
- make_tuple(8, 8, &vp8_sixtap_predict8x8_neon),
- make_tuple(8, 4, &vp8_sixtap_predict8x4_neon)));
-#endif
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(
- MMX, SixtapPredictTest,
- ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx),
- make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx),
- make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx),
- make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
-#endif
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
- SSE2, SixtapPredictTest,
- ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2),
- make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2),
- make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2)));
-#endif
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
- SSSE3, SixtapPredictTest,
- ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3),
- make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3),
- make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3),
- make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3)));
-#endif
-#if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(
- MSA, SixtapPredictTest,
- ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa),
- make_tuple(8, 8, &vp8_sixtap_predict8x8_msa),
- make_tuple(8, 4, &vp8_sixtap_predict8x4_msa),
- make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
-#endif
-} // namespace
diff --git a/test/test.mk b/test/test.mk
index aad264531..60218a780 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -119,7 +119,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
LIBVPX_TEST_SRCS-yes += idct_test.cc
-LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc
+LIBVPX_TEST_SRCS-yes += predict_test.cc
LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc
ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes)
diff --git a/vp8/common/arm/neon/dequant_idct_neon.c b/vp8/common/arm/neon/dequant_idct_neon.c
index ff5981eaa..753051c77 100644
--- a/vp8/common/arm/neon/dequant_idct_neon.c
+++ b/vp8/common/arm/neon/dequant_idct_neon.c
@@ -11,7 +11,11 @@
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
int stride) {
@@ -60,10 +64,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
- q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
- q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
@@ -90,10 +92,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
- q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
- q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.c b/vp8/common/arm/neon/shortidct4x4llm_neon.c
index a36c0c1ca..1adb1c317 100644
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.c
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -11,7 +11,11 @@
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
int pred_stride, unsigned char *dst_ptr,
@@ -40,10 +44,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
- q3s16 = vshrq_n_s16(q3s16, 1);
q4s16 = vshrq_n_s16(q4s16, 1);
- q3s16 = vqaddq_s16(q3s16, q2s16);
q4s16 = vqaddq_s16(q4s16, q2s16);
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
@@ -71,10 +73,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
- q3s16 = vshrq_n_s16(q3s16, 1);
q4s16 = vshrq_n_s16(q4s16, 1);
- q3s16 = vqaddq_s16(q3s16, q2s16);
q4s16 = vqaddq_s16(q4s16, q2s16);
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c
index 622baa3c5..fbb552ebe 100644
--- a/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
@@ -9,6 +9,8 @@
*/
#include <arm_neon.h>
+#include <string.h>
+#include "./vpx_config.h"
#include "vpx_ports/mem.h"
static const int8_t vp8_sub_pel_filters[8][8] = {
@@ -22,6 +24,396 @@ static const int8_t vp8_sub_pel_filters[8][8] = {
{ 0, -1, 12, 123, -6, 0, 0, 0 },
};
+// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
+// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
+// Elements 1 and 4 are either 0 or negative. The code accounts for this with
+// multiply/accumulates which either add or subtract as needed. The other
+// functions will be updated to use this table later.
+// It is also expanded to 8 elements to allow loading into 64 bit neon
+// registers.
+static const uint8_t abs_filters[8][8] = {
+ { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 },
+ { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
+ { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
+ { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
+};
+
+static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
+ return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
+}
+
+static INLINE void store4x4(unsigned char *dst, int dst_stride,
+ const uint8x8_t a0, const uint8x8_t a1) {
+ if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
+ } else {
+ // Store to the aligned local buffer and memcpy instead of vget_lane_u8
+ // which is really really slow.
+ uint32_t output_buffer[4];
+ vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
+ vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
+ vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
+ vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);
+
+ memcpy(dst, output_buffer, 4);
+ dst += dst_stride;
+ memcpy(dst, output_buffer + 1, 4);
+ dst += dst_stride;
+ memcpy(dst, output_buffer + 2, 4);
+ dst += dst_stride;
+ memcpy(dst, output_buffer + 3, 4);
+ }
+}
+
+static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
+ const uint8x8_t filter, uint16x8_t *c,
+ uint16x8_t *d) {
+ const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
+ vreinterpret_u32_u8(vget_high_u8(a)));
+ const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
+ vreinterpret_u32_u8(vget_high_u8(b)));
+ *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
+ *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
+}
+
+static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
+ const uint8x8_t filter, uint16x8_t *c,
+ uint16x8_t *d) {
+ const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
+ vreinterpret_u32_u8(vget_high_u8(a)));
+ const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
+ vreinterpret_u32_u8(vget_high_u8(b)));
+ *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
+ *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
+}
+
+static INLINE void yonly4x4(const unsigned char *src, int src_stride,
+ int filter_offset, unsigned char *dst,
+ int dst_stride) {
+ uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
+ uint16x8_t c0, c1, c2, c3;
+ int16x8_t d0, d1;
+ uint8x8_t e0, e1;
+
+ const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
+ const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
+ const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
+ const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
+ const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
+ const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
+ const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
+
+ src -= src_stride * 2;
+ // Shift the even rows to allow using 'vext' to combine the vectors. armv8
+ // has vcopy_lane which would be interesting. This started as just a
+ // horrible workaround for clang adding alignment hints to 32bit loads:
+ // https://llvm.org/bugs/show_bug.cgi?id=24421
+// But it turns out it is almost identical to casting the loads.
+ a0 = load_and_shift(src);
+ src += src_stride;
+ a1 = vld1_u8(src);
+ src += src_stride;
+ a2 = load_and_shift(src);
+ src += src_stride;
+ a3 = vld1_u8(src);
+ src += src_stride;
+ a4 = load_and_shift(src);
+ src += src_stride;
+ a5 = vld1_u8(src);
+ src += src_stride;
+ a6 = load_and_shift(src);
+ src += src_stride;
+ a7 = vld1_u8(src);
+ src += src_stride;
+ a8 = vld1_u8(src);
+
+ // Combine the rows so we can operate on 8 at a time.
+ b0 = vext_u8(a0, a1, 4);
+ b2 = vext_u8(a2, a3, 4);
+ b4 = vext_u8(a4, a5, 4);
+ b6 = vext_u8(a6, a7, 4);
+ b8 = a8;
+
+ // To keep with the 8-at-a-time theme, combine *alternate* rows. This
+ // allows combining the odd rows with the even.
+ b1 = vext_u8(b0, b2, 4);
+ b3 = vext_u8(b2, b4, 4);
+ b5 = vext_u8(b4, b6, 4);
+ b7 = vext_u8(b6, b8, 4);
+
+ // Multiply and expand to 16 bits.
+ c0 = vmull_u8(b0, filter0);
+ c1 = vmull_u8(b2, filter0);
+ c2 = vmull_u8(b5, filter5);
+ c3 = vmull_u8(b7, filter5);
+
+ // Multiply, subtract and accumulate for filters 1 and 4 (the negative
+ // ones).
+ c0 = vmlsl_u8(c0, b4, filter4);
+ c1 = vmlsl_u8(c1, b6, filter4);
+ c2 = vmlsl_u8(c2, b1, filter1);
+ c3 = vmlsl_u8(c3, b3, filter1);
+
+ // Add more positive ones. vmlal should really return a signed type.
+ // It's doing signed math internally, as evidenced by the fact we can do
+ // subtractions followed by more additions. Ideally we could use
+ // vqmlal/sl but that instruction doesn't exist. Might be able to
+ // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
+ c0 = vmlal_u8(c0, b2, filter2);
+ c1 = vmlal_u8(c1, b4, filter2);
+ c2 = vmlal_u8(c2, b3, filter3);
+ c3 = vmlal_u8(c3, b5, filter3);
+
+ // Use signed saturation math because vmlsl may have left some negative
+ // numbers in there.
+ d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
+ d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
+
+ // Use signed again because numbers like -200 need to be saturated to 0.
+ e0 = vqrshrun_n_s16(d0, 7);
+ e1 = vqrshrun_n_s16(d1, 7);
+
+ store4x4(dst, dst_stride, e0, e1);
+}
+
+void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset,
+ unsigned char *dst_ptr, int dst_pitch) {
+ uint8x16_t s0, s1, s2, s3, s4;
+ uint64x2_t s01, s23;
+ // Variables to hold src[] elements for the given filter[]
+ uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
+ uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
+ uint8x16_t s01_f0, s23_f0;
+ uint64x2_t s01_f3, s23_f3;
+ uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
+ // Accumulator variables.
+ uint16x8_t d0123, d4567, d89;
+ uint16x8_t d0123_a, d4567_a, d89_a;
+ int16x8_t e0123, e4567, e89;
+ // Second pass intermediates.
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
+ uint16x8_t c0, c1, c2, c3;
+ int16x8_t d0, d1;
+ uint8x8_t e0, e1;
+ uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
+
+ if (xoffset == 0) { // Second pass only.
+ yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
+ return;
+ }
+
+ if (yoffset == 0) { // First pass only.
+ src_ptr -= 2;
+ } else { // Add context for the second pass. 2 extra lines on top.
+ src_ptr -= 2 + (src_pixels_per_line * 2);
+ }
+
+ filter = vld1_u8(abs_filters[xoffset]);
+ filter0 = vdup_lane_u8(filter, 0);
+ filter1 = vdup_lane_u8(filter, 1);
+ filter2 = vdup_lane_u8(filter, 2);
+ filter3 = vdup_lane_u8(filter, 3);
+ filter4 = vdup_lane_u8(filter, 4);
+ filter5 = vdup_lane_u8(filter, 5);
+
+ // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
+ // garbage. So much effort for that last single bit.
+ // The low values of each pair are for filter0.
+ s0 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s1 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s2 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s3 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ // Shift to extract values for filter[5]
+ // If src[] is 0, this puts:
+ // 3 4 5 6 7 8 9 10 in s0_f5
+ // Can't use vshr.u64 because it crosses the double word boundary.
+ s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
+ s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
+ s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
+ s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
+
+ s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
+ s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
+
+ s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
+ s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
+ d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
+ d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
+
+ // Keep original src data as 64 bits to simplify shifting and extracting.
+ s01 = vreinterpretq_u64_u8(s01_f0);
+ s23 = vreinterpretq_u64_u8(s23_f0);
+
+ // 3 4 5 6 * filter0
+ filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
+
+ // Shift over one to use -1, 0, 1, 2 for filter1
+ // -1 0 1 2 * filter1
+ filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
+ &d0123, &d4567);
+
+ // 2 3 4 5 * filter4
+ filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
+ &d0123, &d4567);
+
+ // 0 1 2 3 * filter2
+ filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
+ &d0123, &d4567);
+
+ // 1 2 3 4 * filter3
+ s01_f3 = vshrq_n_u64(s01, 24);
+ s23_f3 = vshrq_n_u64(s23, 24);
+ s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
+ vreinterpret_u32_u64(vget_high_u64(s01_f3)));
+ s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
+ vreinterpret_u32_u64(vget_high_u64(s23_f3)));
+ // Accumulate into different registers so it can use saturated addition.
+ d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
+ d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
+
+ e0123 =
+ vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
+ e4567 =
+ vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
+
+ // Shift and narrow.
+ b0 = vqrshrun_n_s16(e0123, 7);
+ b2 = vqrshrun_n_s16(e4567, 7);
+
+ if (yoffset == 0) { // firstpass_filter4x4_only
+ store4x4(dst_ptr, dst_pitch, b0, b2);
+ return;
+ }
+
+ // Load additional context when doing both filters.
+ s0 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s1 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s2 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s3 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s4 = vld1q_u8(src_ptr);
+
+ s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
+ s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
+ s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
+ s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
+ s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
+
+ // 3 4 5 6 * filter0
+ s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
+ s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
+
+ s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
+ s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
+ // But this time instead of 16 pixels to filter, there are 20. So an extra
+ // run with a doubleword register.
+ d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
+ d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
+ d89 = vmull_u8(s4_f5, filter5);
+
+ // Save a copy as u64 for shifting.
+ s01 = vreinterpretq_u64_u8(s01_f0);
+ s23 = vreinterpretq_u64_u8(s23_f0);
+
+ filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
+ d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
+
+ filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
+ &d0123, &d4567);
+ s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
+ d89 = vmlsl_u8(d89, s4_f1, filter1);
+
+ filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
+ &d0123, &d4567);
+ s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
+ d89 = vmlsl_u8(d89, s4_f4, filter4);
+
+ filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
+ &d0123, &d4567);
+ s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
+ d89 = vmlal_u8(d89, s4_f2, filter2);
+
+ s01_f3 = vshrq_n_u64(s01, 24);
+ s23_f3 = vshrq_n_u64(s23, 24);
+ s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
+ vreinterpret_u32_u64(vget_high_u64(s01_f3)));
+ s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
+ vreinterpret_u32_u64(vget_high_u64(s23_f3)));
+ s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
+ d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
+ d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
+ d89_a = vmull_u8(s4_f3, filter3);
+
+ e0123 =
+ vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
+ e4567 =
+ vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
+ e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
+
+ b4 = vqrshrun_n_s16(e0123, 7);
+ b6 = vqrshrun_n_s16(e4567, 7);
+ b8 = vqrshrun_n_s16(e89, 7);
+
+ // Second pass: 4x4
+ filter = vld1_u8(abs_filters[yoffset]);
+ filter0 = vdup_lane_u8(filter, 0);
+ filter1 = vdup_lane_u8(filter, 1);
+ filter2 = vdup_lane_u8(filter, 2);
+ filter3 = vdup_lane_u8(filter, 3);
+ filter4 = vdup_lane_u8(filter, 4);
+ filter5 = vdup_lane_u8(filter, 5);
+
+ b1 = vext_u8(b0, b2, 4);
+ b3 = vext_u8(b2, b4, 4);
+ b5 = vext_u8(b4, b6, 4);
+ b7 = vext_u8(b6, b8, 4);
+
+ c0 = vmull_u8(b0, filter0);
+ c1 = vmull_u8(b2, filter0);
+ c2 = vmull_u8(b5, filter5);
+ c3 = vmull_u8(b7, filter5);
+
+ c0 = vmlsl_u8(c0, b4, filter4);
+ c1 = vmlsl_u8(c1, b6, filter4);
+ c2 = vmlsl_u8(c2, b1, filter1);
+ c3 = vmlsl_u8(c3, b3, filter1);
+
+ c0 = vmlal_u8(c0, b2, filter2);
+ c1 = vmlal_u8(c1, b4, filter2);
+ c2 = vmlal_u8(c2, b3, filter3);
+ c3 = vmlal_u8(c3, b5, filter3);
+
+ d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
+ d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
+
+ e0 = vqrshrun_n_s16(d0, 7);
+ e1 = vqrshrun_n_s16(d1, 7);
+
+ store4x4(dst_ptr, dst_pitch, e0, e1);
+}
+
void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
int xoffset, int yoffset,
unsigned char *dst_ptr, int dst_pitch) {
diff --git a/vp8/common/filter.c b/vp8/common/filter.c
index a312efb6c..267498335 100644
--- a/vp8/common/filter.c
+++ b/vp8/common/filter.c
@@ -8,8 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "filter.h"
+#include <assert.h>
#include "./vp8_rtcd.h"
+#include "vp8/common/filter.h"
DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = {
{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
@@ -324,27 +325,11 @@ void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line,
const short *HFilter;
const short *VFilter;
+ // This represents a copy and is not required to be handled by optimizations.
+ assert((xoffset | yoffset) != 0);
+
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
-#if 0
- {
- int i;
- unsigned char temp1[16];
- unsigned char temp2[16];
-
- bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
- filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
-
- for (i = 0; i < 16; ++i)
- {
- if (temp1[i] != temp2[i])
- {
- bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
- filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
- }
- }
- }
-#endif
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
VFilter, 4, 4);
}
@@ -355,6 +340,8 @@ void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line,
const short *HFilter;
const short *VFilter;
+ assert((xoffset | yoffset) != 0);
+
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
@@ -368,6 +355,8 @@ void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line,
const short *HFilter;
const short *VFilter;
+ assert((xoffset | yoffset) != 0);
+
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
@@ -382,6 +371,8 @@ void vp8_bilinear_predict16x16_c(unsigned char *src_ptr,
const short *HFilter;
const short *VFilter;
+ assert((xoffset | yoffset) != 0);
+
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index eb68246b2..43e3c29b5 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -251,7 +251,7 @@ int vp8_receive_raw_frame(struct VP8_COMP *comp, unsigned int frame_flags,
YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
int64_t end_time_stamp);
int vp8_get_compressed_data(struct VP8_COMP *comp, unsigned int *frame_flags,
- unsigned long *size, unsigned char *dest,
+ size_t *size, unsigned char *dest,
unsigned char *dest_end, int64_t *time_stamp,
int64_t *time_end, int flush);
int vp8_get_preview_raw_frame(struct VP8_COMP *comp, YV12_BUFFER_CONFIG *dest,
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 732656f2f..9a12c7fb6 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -160,10 +160,6 @@ typedef struct VP8Common {
#ifdef PACKET_TESTING
VP8_HEADER oh;
#endif
-#if CONFIG_POSTPROC_VISUALIZER
- double bitrate;
- double framerate;
-#endif
#if CONFIG_MULTITHREAD
int processor_core_count;
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 1c4e042c8..8b8c1701a 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -37,46 +37,6 @@
(0.071 * (float)(t & 0xff)) + 128)
/* clang-format on */
-/* global constants */
-#if CONFIG_POSTPROC_VISUALIZER
-static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
- { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
- { RGB_TO_YUV(0x00FF00) }, /* Green */
- { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */
- { RGB_TO_YUV(0x228B22) }, /* ForestGreen */
- { RGB_TO_YUV(0x006400) }, /* DarkGreen */
- { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */
- { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */
- { RGB_TO_YUV(0x00008B) }, /* Dark blue */
- { RGB_TO_YUV(0x551A8B) }, /* Purple */
- { RGB_TO_YUV(0xFF0000) } /* Red */
-};
-
-static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
- { RGB_TO_YUV(0x6633ff) }, /* Purple */
- { RGB_TO_YUV(0xcc33ff) }, /* Magenta */
- { RGB_TO_YUV(0xff33cc) }, /* Pink */
- { RGB_TO_YUV(0xff3366) }, /* Coral */
- { RGB_TO_YUV(0x3366ff) }, /* Blue */
- { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
- { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
- { RGB_TO_YUV(0xff6633) }, /* Orange */
- { RGB_TO_YUV(0x33ccff) }, /* Light Blue */
- { RGB_TO_YUV(0x8ab800) }, /* Green */
- { RGB_TO_YUV(0xffcc33) }, /* Light Orange */
- { RGB_TO_YUV(0x33ffcc) }, /* Aqua */
- { RGB_TO_YUV(0x66ff33) }, /* Light Green */
- { RGB_TO_YUV(0xccff33) }, /* Yellow */
-};
-
-static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = {
- { RGB_TO_YUV(0x00ff00) }, /* Blue */
- { RGB_TO_YUV(0x0000ff) }, /* Green */
- { RGB_TO_YUV(0xffff00) }, /* Yellow */
- { RGB_TO_YUV(0xff0000) }, /* Red */
-};
-#endif
-
extern void vp8_blit_text(const char *msg, unsigned char *address,
const int pitch);
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
@@ -308,43 +268,6 @@ void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
}
}
-#if CONFIG_POSTPROC_VISUALIZER
-static void constrain_line(int x_0, int *x_1, int y_0, int *y_1, int width,
- int height) {
- int dx;
- int dy;
-
- if (*x_1 > width) {
- dx = *x_1 - x_0;
- dy = *y_1 - y_0;
-
- *x_1 = width;
- if (dx) *y_1 = ((width - x_0) * dy) / dx + y_0;
- }
- if (*x_1 < 0) {
- dx = *x_1 - x_0;
- dy = *y_1 - y_0;
-
- *x_1 = 0;
- if (dx) *y_1 = ((0 - x_0) * dy) / dx + y_0;
- }
- if (*y_1 > height) {
- dx = *x_1 - x_0;
- dy = *y_1 - y_0;
-
- *y_1 = height;
- if (dy) *x_1 = ((height - y_0) * dx) / dy + x_0;
- }
- if (*y_1 < 0) {
- dx = *x_1 - x_0;
- dy = *y_1 - y_0;
-
- *y_1 = 0;
- if (dy) *x_1 = ((0 - y_0) * dx) / dy + x_0;
- }
-}
-#endif // CONFIG_POSTPROC_VISUALIZER
-
#if CONFIG_POSTPROC
int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
vp8_ppflags_t *ppflags) {
@@ -455,331 +378,6 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
oci->post_proc_buffer.y_stride);
}
-#if CONFIG_POSTPROC_VISUALIZER
- if (flags & VP8D_DEBUG_TXT_FRAME_INFO) {
- char message[512];
- sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
- (oci->frame_type == KEY_FRAME), oci->refresh_golden_frame,
- oci->base_qindex, oci->filter_level, flags, oci->mb_cols,
- oci->mb_rows);
- vp8_blit_text(message, oci->post_proc_buffer.y_buffer,
- oci->post_proc_buffer.y_stride);
- }
-
- if (flags & VP8D_DEBUG_TXT_MBLK_MODES) {
- int i, j;
- unsigned char *y_ptr;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int mb_rows = post->y_height >> 4;
- int mb_cols = post->y_width >> 4;
- int mb_index = 0;
- MODE_INFO *mi = oci->mi;
-
- y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
- /* vp8_filter each macro block */
- for (i = 0; i < mb_rows; ++i) {
- for (j = 0; j < mb_cols; ++j) {
- char zz[4];
-
- sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
-
- vp8_blit_text(zz, y_ptr, post->y_stride);
- mb_index++;
- y_ptr += 16;
- }
-
- mb_index++; /* border */
- y_ptr += post->y_stride * 16 - post->y_width;
- }
- }
-
- if (flags & VP8D_DEBUG_TXT_DC_DIFF) {
- int i, j;
- unsigned char *y_ptr;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int mb_rows = post->y_height >> 4;
- int mb_cols = post->y_width >> 4;
- int mb_index = 0;
- MODE_INFO *mi = oci->mi;
-
- y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
- /* vp8_filter each macro block */
- for (i = 0; i < mb_rows; ++i) {
- for (j = 0; j < mb_cols; ++j) {
- char zz[4];
- int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
- mi[mb_index].mbmi.mode != SPLITMV &&
- mi[mb_index].mbmi.mb_skip_coeff);
-
- if (oci->frame_type == KEY_FRAME)
- sprintf(zz, "a");
- else
- sprintf(zz, "%c", dc_diff + '0');
-
- vp8_blit_text(zz, y_ptr, post->y_stride);
- mb_index++;
- y_ptr += 16;
- }
-
- mb_index++; /* border */
- y_ptr += post->y_stride * 16 - post->y_width;
- }
- }
-
- if (flags & VP8D_DEBUG_TXT_RATE_INFO) {
- char message[512];
- sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate,
- oci->framerate);
- vp8_blit_text(message, oci->post_proc_buffer.y_buffer,
- oci->post_proc_buffer.y_stride);
- }
-
- /* Draw motion vectors */
- if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int width = post->y_width;
- int height = post->y_height;
- unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
- int y_stride = oci->post_proc_buffer.y_stride;
- MODE_INFO *mi = oci->mi;
- int x0, y0;
-
- for (y0 = 0; y0 < height; y0 += 16) {
- for (x0 = 0; x0 < width; x0 += 16) {
- int x1, y1;
-
- if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
- mi++;
- continue;
- }
-
- if (mi->mbmi.mode == SPLITMV) {
- switch (mi->mbmi.partitioning) {
- case 0: /* mv_top_bottom */
- {
- union b_mode_info *bmi = &mi->bmi[0];
- MV *mv = &bmi->mv.as_mv;
-
- x1 = x0 + 8 + (mv->col >> 3);
- y1 = y0 + 4 + (mv->row >> 3);
-
- constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
- vp8_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[8];
-
- x1 = x0 + 8 + (mv->col >> 3);
- y1 = y0 + 12 + (mv->row >> 3);
-
- constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
- vp8_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride);
-
- break;
- }
- case 1: /* mv_left_right */
- {
- union b_mode_info *bmi = &mi->bmi[0];
- MV *mv = &bmi->mv.as_mv;
-
- x1 = x0 + 4 + (mv->col >> 3);
- y1 = y0 + 8 + (mv->row >> 3);
-
- constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
- vp8_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[2];
-
- x1 = x0 + 12 + (mv->col >> 3);
- y1 = y0 + 8 + (mv->row >> 3);
-
- constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
- vp8_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride);
-
- break;
- }
- case 2: /* mv_quarters */
- {
- union b_mode_info *bmi = &mi->bmi[0];
- MV *mv = &bmi->mv.as_mv;
-
- x1 = x0 + 4 + (mv->col >> 3);
- y1 = y0 + 4 + (mv->row >> 3);
-
- constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
- vp8_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[2];
-
- x1 = x0 + 12 + (mv->col >> 3);
- y1 = y0 + 4 + (mv->row >> 3);
-
- constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
- vp8_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[8];
-
- x1 = x0 + 4 + (mv->col >> 3);
- y1 = y0 + 12 + (mv->row >> 3);
-
- constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
- vp8_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[10];
-
- x1 = x0 + 12 + (mv->col >> 3);
- y1 = y0 + 12 + (mv->row >> 3);
-
- constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
- vp8_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride);
- break;
- }
- default: {
- union b_mode_info *bmi = mi->bmi;
- int bx0, by0;
-
- for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
- for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
- MV *mv = &bmi->mv.as_mv;
-
- x1 = bx0 + 2 + (mv->col >> 3);
- y1 = by0 + 2 + (mv->row >> 3);
-
- constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
- vp8_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride);
-
- bmi++;
- }
- }
- }
- }
- } else if (mi->mbmi.mode >= NEARESTMV) {
- MV *mv = &mi->mbmi.mv.as_mv;
- const int lx0 = x0 + 8;
- const int ly0 = y0 + 8;
-
- x1 = lx0 + (mv->col >> 3);
- y1 = ly0 + (mv->row >> 3);
-
- if (x1 != lx0 && y1 != ly0) {
- constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
- vp8_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride);
-
- constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
- vp8_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride);
- } else
- vp8_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride);
- }
-
- mi++;
- }
- mi++;
- }
- }
-
- /* Color in block modes */
- if ((flags & VP8D_DEBUG_CLR_BLK_MODES) &&
- (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
- int y, x;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int width = post->y_width;
- int height = post->y_height;
- unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
- unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
- unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
- int y_stride = oci->post_proc_buffer.y_stride;
- MODE_INFO *mi = oci->mi;
-
- for (y = 0; y < height; y += 16) {
- for (x = 0; x < width; x += 16) {
- int Y = 0, U = 0, V = 0;
-
- if (mi->mbmi.mode == B_PRED &&
- ((ppflags->display_mb_modes_flag & B_PRED) ||
- ppflags->display_b_modes_flag)) {
- int by, bx;
- unsigned char *yl, *ul, *vl;
- union b_mode_info *bmi = mi->bmi;
-
- yl = y_ptr + x;
- ul = u_ptr + (x >> 1);
- vl = v_ptr + (x >> 1);
-
- for (by = 0; by < 16; by += 4) {
- for (bx = 0; bx < 16; bx += 4) {
- if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) ||
- (ppflags->display_mb_modes_flag & B_PRED)) {
- Y = B_PREDICTION_MODE_colors[bmi->as_mode][0];
- U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
- V = B_PREDICTION_MODE_colors[bmi->as_mode][2];
-
- vp8_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V,
- 0xc000, y_stride);
- }
- bmi++;
- }
-
- yl += y_stride * 4;
- ul += y_stride * 1;
- vl += y_stride * 1;
- }
- } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) {
- Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
- U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
- V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
-
- vp8_blend_mb_inner(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), Y,
- U, V, 0xc000, y_stride);
- }
-
- mi++;
- }
- y_ptr += y_stride * 16;
- u_ptr += y_stride * 4;
- v_ptr += y_stride * 4;
-
- mi++;
- }
- }
-
- /* Color in frame reference blocks */
- if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) &&
- ppflags->display_ref_frame_flag) {
- int y, x;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int width = post->y_width;
- int height = post->y_height;
- unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
- unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
- unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
- int y_stride = oci->post_proc_buffer.y_stride;
- MODE_INFO *mi = oci->mi;
-
- for (y = 0; y < height; y += 16) {
- for (x = 0; x < width; x += 16) {
- int Y = 0, U = 0, V = 0;
-
- if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) {
- Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
- U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
- V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
-
- vp8_blend_mb_outer(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), Y,
- U, V, 0xc000, y_stride);
- }
-
- mi++;
- }
- y_ptr += y_stride * 16;
- u_ptr += y_stride * 4;
- v_ptr += y_stride * 4;
-
- mi++;
- }
- }
-#endif
-
*dest = oci->post_proc_buffer;
/* handle problem with extending borders */
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 8dc36f731..5d8e4a78d 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -173,10 +173,8 @@ add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, in
specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/;
$vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
-# TODO(johannkoenig): Add neon implementation
-# https://bugs.chromium.org/p/webm/issues/detail?id=1273
add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 dspr2 msa/;
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/;
$vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
diff --git a/vp8/common/textblit.c b/vp8/common/textblit.c
deleted file mode 100644
index e7c15c4e4..000000000
--- a/vp8/common/textblit.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-void vp8_blit_text(const char *msg, unsigned char *address, const int pitch) {
- int letter_bitmap;
- unsigned char *output_pos = address;
- int colpos;
- const int font[] = {
- 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740,
- 0x18000, 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080,
- 0x80000, 0x111110, 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4,
- 0x4D6B7, 0x456AA, 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00,
- 0x8A880, 0x52940, 0x22A20, 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF,
- 0x8C62E, 0xE8C63F, 0x118D6BF, 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31,
- 0xF8C628, 0x8A89F, 0x108421F, 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF,
- 0x164C62E, 0x12694BF, 0x8AD6A2, 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F,
- 0x1151151, 0x117041, 0x119D731, 0x47E0, 0x1041041, 0xFC400, 0x10440,
- 0x1084210, 0x820
- };
- colpos = 0;
-
- while (msg[colpos] != 0) {
- char letter = msg[colpos];
- int fontcol, fontrow;
-
- if (letter <= 'Z' && letter >= ' ')
- letter_bitmap = font[letter - ' '];
- else if (letter <= 'z' && letter >= 'a')
- letter_bitmap = font[letter - 'a' + 'A' - ' '];
- else
- letter_bitmap = font[0];
-
- for (fontcol = 6; fontcol >= 0; fontcol--)
- for (fontrow = 0; fontrow < 5; ++fontrow)
- output_pos[fontrow * pitch + fontcol] =
- ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
-
- output_pos += 7;
- colpos++;
- }
-}
-
-static void plot(const int x, const int y, unsigned char *image,
- const int pitch) {
- image[x + y * pitch] ^= 255;
-}
-
-/* Bresenham line algorithm */
-void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
- const int pitch) {
- int steep = abs(y1 - y0) > abs(x1 - x0);
- int deltax, deltay;
- int error, ystep, y, x;
-
- if (steep) {
- int t;
- t = x0;
- x0 = y0;
- y0 = t;
-
- t = x1;
- x1 = y1;
- y1 = t;
- }
-
- if (x0 > x1) {
- int t;
- t = x0;
- x0 = x1;
- x1 = t;
-
- t = y0;
- y0 = y1;
- y1 = t;
- }
-
- deltax = x1 - x0;
- deltay = abs(y1 - y0);
- error = deltax / 2;
-
- y = y0;
-
- if (y0 < y1)
- ystep = 1;
- else
- ystep = -1;
-
- if (steep) {
- for (x = x0; x <= x1; ++x) {
- plot(y, x, image, pitch);
-
- error = error - deltay;
- if (error < 0) {
- y = y + ystep;
- error = error + deltax;
- }
- }
- } else {
- for (x = x0; x <= x1; ++x) {
- plot(x, y, image, pitch);
-
- error = error - deltay;
- if (error < 0) {
- y = y + ystep;
- error = error + deltax;
- }
- }
- }
-}
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index 69f8d103c..ca00583ca 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -181,8 +181,12 @@ sym(vp8_filter_block1d16_h6_sse2):
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
- movq xmm2, MMWORD PTR [rsi +14]
- pslldq xmm2, 8
+ ; Load from 11 to avoid reading out of bounds.
+ movq xmm2, MMWORD PTR [rsi +11]
+ ; The lower bits are not cleared before 'or'ing with xmm1,
+ ; but that is OK because the values in the overlapping positions
+ ; are already equal to the ones in xmm1.
+ pslldq xmm2, 5
por xmm2, xmm1
prefetcht2 [rsi+rax-2]
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index c06f24556..1f6cbd1d1 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -1291,6 +1291,8 @@ sym(vp8_bilinear_predict8x8_ssse3):
movq xmm7, XMMWORD PTR [rsp+96]
punpcklbw xmm5, xmm6
+ ; Because the source register (xmm0) is always treated as signed by
+ ; pmaddubsw, the constant '128' is treated as '-128'.
pmaddubsw xmm1, xmm0
pmaddubsw xmm2, xmm0
@@ -1319,6 +1321,10 @@ sym(vp8_bilinear_predict8x8_ssse3):
psraw xmm5, VP8_FILTER_SHIFT
psraw xmm6, VP8_FILTER_SHIFT
+
+ ; Having multiplied everything by '-128' and obtained negative
+ ; numbers, the unsigned saturation truncates those values to 0,
+ ; resulting in incorrect handling of xoffset == 0 && yoffset == 0
packuswb xmm1, xmm1
packuswb xmm2, xmm2
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 06c2f624f..1b100cfe8 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1056,7 +1056,7 @@ static void put_delta_q(vp8_writer *bc, int delta_q) {
}
void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
- unsigned char *dest_end, unsigned long *size) {
+ unsigned char *dest_end, size_t *size) {
int i, j;
VP8_HEADER oh;
VP8_COMMON *const pc = &cpi->common;
@@ -1347,7 +1347,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
*size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos;
- cpi->partition_sz[0] = *size;
+ cpi->partition_sz[0] = (unsigned int)*size;
#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
{
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index f61cfbe90..6ebf233ed 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2746,7 +2746,7 @@ static int decide_key_frame(VP8_COMP *cpi) {
return code_key_frame;
}
-static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
+static void Pass1Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest,
unsigned int *frame_flags) {
(void)size;
(void)dest;
@@ -3185,7 +3185,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
vp8_yv12_extend_frame_borders(cm->frame_to_show);
}
-static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size,
+static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
unsigned char *dest,
unsigned char *dest_end,
unsigned int *frame_flags) {
@@ -4384,7 +4384,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size,
/* Update rate control heuristics */
cpi->total_byte_count += (*size);
- cpi->projected_frame_size = (*size) << 3;
+ cpi->projected_frame_size = (int)(*size) << 3;
if (cpi->oxcf.number_of_layers > 1) {
unsigned int i;
@@ -4711,7 +4711,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size,
/* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
}
#if !CONFIG_REALTIME_ONLY
-static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
+static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest,
unsigned char *dest_end, unsigned int *frame_flags) {
if (!cpi->common.refresh_alt_ref_frame) vp8_second_pass(cpi);
@@ -4764,7 +4764,7 @@ static int frame_is_reference(const VP8_COMP *cpi) {
}
int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
- unsigned long *size, unsigned char *dest,
+ size_t *size, unsigned char *dest,
unsigned char *dest_end, int64_t *time_stamp,
int64_t *time_end, int flush) {
VP8_COMMON *cm;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 32080eff7..59ad5773a 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -687,7 +687,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate);
void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
- unsigned char *dest_end, unsigned long *size);
+ unsigned char *dest_end, size_t *size);
void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **);
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index f0050d201..7b68d35f5 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -570,7 +570,7 @@ static int evaluate_inter_mode(unsigned int *sse, int rate2, int *distortion2,
// No adjustment if block is considered to be skin area.
if (x->is_skin) rd_adj = 100;
- this_rd = ((int64_t)this_rd) * rd_adj / 100;
+ this_rd = (int)(((int64_t)this_rd) * rd_adj / 100);
}
check_for_encode_breakout(*sse, x);
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index d863a0a26..886b127d6 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -68,7 +68,6 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
VP8_COMMON_SRCS-yes += common/treecoder.c
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 0ec6902e7..fac237eec 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -824,7 +824,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
unsigned int lib_flags;
YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp, dst_end_time_stamp;
- unsigned long size, cx_data_sz;
+ size_t size, cx_data_sz;
unsigned char *cx_data;
unsigned char *cx_data_end;
int comp_data_state = 0;
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index cab0a9997..b1f8340d6 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -46,13 +46,6 @@ struct vpx_codec_alg_priv {
int decoder_init;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
-#if CONFIG_POSTPROC_VISUALIZER
- unsigned int dbg_postproc_flag;
- int dbg_color_ref_frame_flag;
- int dbg_color_mb_modes_flag;
- int dbg_color_b_modes_flag;
- int dbg_display_mv_flag;
-#endif
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
@@ -478,22 +471,8 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag;
-#if CONFIG_POSTPROC_VISUALIZER
- flags.post_proc_flag |=
- ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS
- : 0) |
- ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0) |
- ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0) |
- ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0);
-#endif
flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
flags.noise_level = ctx->postproc_cfg.noise_level;
-#if CONFIG_POSTPROC_VISUALIZER
- flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
- flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
- flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
- flags.display_mv_flag = ctx->dbg_display_mv_flag;
-#endif
}
if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd,
@@ -589,54 +568,6 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
#endif
}
-static vpx_codec_err_t vp8_set_dbg_color_ref_frame(vpx_codec_alg_priv_t *ctx,
- va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
- ctx->dbg_color_ref_frame_flag = va_arg(args, int);
- return VPX_CODEC_OK;
-#else
- (void)ctx;
- (void)args;
- return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_set_dbg_color_mb_modes(vpx_codec_alg_priv_t *ctx,
- va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
- ctx->dbg_color_mb_modes_flag = va_arg(args, int);
- return VPX_CODEC_OK;
-#else
- (void)ctx;
- (void)args;
- return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_set_dbg_color_b_modes(vpx_codec_alg_priv_t *ctx,
- va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
- ctx->dbg_color_b_modes_flag = va_arg(args, int);
- return VPX_CODEC_OK;
-#else
- (void)ctx;
- (void)args;
- return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_set_dbg_display_mv(vpx_codec_alg_priv_t *ctx,
- va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
- ctx->dbg_display_mv_flag = va_arg(args, int);
- return VPX_CODEC_OK;
-#else
- (void)ctx;
- (void)args;
- return VPX_CODEC_INCAPABLE;
-#endif
-}
-
static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *update_info = va_arg(args, int *);
@@ -706,10 +637,6 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = {
{ VP8_SET_REFERENCE, vp8_set_reference },
{ VP8_COPY_REFERENCE, vp8_get_reference },
{ VP8_SET_POSTPROC, vp8_set_postproc },
- { VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_color_ref_frame },
- { VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_color_mb_modes },
- { VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_color_b_modes },
- { VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_display_mv },
{ VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates },
{ VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted },
{ VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame },
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index ff7c1dd3f..dc2e03946 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -139,8 +139,6 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
// The calculation can be simplified if there are not many non-zero dct
// coefficients. Use eobs to decide what to do.
- // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
- // Combine that with code here.
if (eob == 1)
// DC only DCT coefficient
vpx_idct8x8_1_add(input, dest, stride);
@@ -204,6 +202,18 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
}
#if CONFIG_VP9_HIGHBITDEPTH
+
+// 12 signal input bits + 7 forward transform amplify bits + 1 bit
+// for contingency in rounding and quantizing
+#define VALID_IHT_MAGNITUDE_RANGE (1 << 20)
+
+static INLINE int detect_invalid_iht_input(const tran_low_t *input, int size) {
+ int i;
+ for (i = 0; i < size; ++i)
+ if (abs(input[i]) >= VALID_IHT_MAGNITUDE_RANGE) return 1;
+ return 0;
+}
+
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
@@ -219,6 +229,13 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
tran_low_t *outptr = out;
tran_low_t temp_in[4], temp_out[4];
+ if (detect_invalid_iht_input(input, 16)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd iht input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ return;
+ }
+
// Inverse transform row vectors.
for (i = 0; i < 4; ++i) {
IHT_4[tx_type].rows(input, outptr, bd);
@@ -253,6 +270,13 @@ void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ if (detect_invalid_iht_input(input, 64)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd iht input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ return;
+ }
+
// Inverse transform row vectors.
for (i = 0; i < 8; ++i) {
ht.rows(input, outptr, bd);
@@ -287,6 +311,13 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ if (detect_invalid_iht_input(input, 256)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd iht input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ return;
+ }
+
// Rows
for (i = 0; i < 16; ++i) {
ht.rows(input, outptr, bd);
@@ -329,8 +360,6 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
// The calculation can be simplified if there are not many non-zero dct
// coefficients. Use eobs to decide what to do.
- // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
- // Combine that with code here.
// DC only DCT coefficient
if (eob == 1) {
vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index b6ae10b1b..b105e5d45 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -26,7 +26,6 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_postproc.h"
-#include "vp9/common/vp9_textblit.h"
#if CONFIG_VP9_POSTPROC
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index f315a3b85..37a867323 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -92,33 +92,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# High bitdepth functions
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
- # Sub Pixel Filters
- #
- add_proto qw/void vp9_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vp9_highbd_convolve_copy/;
-
- add_proto qw/void vp9_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vp9_highbd_convolve_avg/;
-
- add_proto qw/void vp9_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vp9_highbd_convolve8/, "$sse2_x86_64";
-
- add_proto qw/void vp9_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vp9_highbd_convolve8_horiz/, "$sse2_x86_64";
-
- add_proto qw/void vp9_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vp9_highbd_convolve8_vert/, "$sse2_x86_64";
-
- add_proto qw/void vp9_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vp9_highbd_convolve8_avg/, "$sse2_x86_64";
-
- add_proto qw/void vp9_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vp9_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
-
- add_proto qw/void vp9_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vp9_highbd_convolve8_avg_vert/, "$sse2_x86_64";
-
- #
# post proc
#
if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
diff --git a/vp9/common/vp9_textblit.c b/vp9/common/vp9_textblit.c
deleted file mode 100644
index 9940137ca..000000000
--- a/vp9/common/vp9_textblit.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-#include "vp9/common/vp9_textblit.h"
-
-static const int font[] = {
- 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740,
- 0x18000, 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080,
- 0x80000, 0x111110, 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4,
- 0x4D6B7, 0x456AA, 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00,
- 0x8A880, 0x52940, 0x22A20, 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF,
- 0x8C62E, 0xE8C63F, 0x118D6BF, 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31,
- 0xF8C628, 0x8A89F, 0x108421F, 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF,
- 0x164C62E, 0x12694BF, 0x8AD6A2, 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F,
- 0x1151151, 0x117041, 0x119D731, 0x47E0, 0x1041041, 0xFC400, 0x10440,
- 0x1084210, 0x820
-};
-
-static void plot(int x, int y, unsigned char *image, int pitch) {
- image[x + y * pitch] ^= 255;
-}
-
-void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
- int letter_bitmap;
- unsigned char *output_pos = address;
- int colpos = 0;
-
- while (msg[colpos] != 0) {
- char letter = msg[colpos];
- int fontcol, fontrow;
-
- if (letter <= 'Z' && letter >= ' ')
- letter_bitmap = font[letter - ' '];
- else if (letter <= 'z' && letter >= 'a')
- letter_bitmap = font[letter - 'a' + 'A' - ' '];
- else
- letter_bitmap = font[0];
-
- for (fontcol = 6; fontcol >= 0; fontcol--)
- for (fontrow = 0; fontrow < 5; fontrow++)
- output_pos[fontrow * pitch + fontcol] =
- ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
-
- output_pos += 7;
- colpos++;
- }
-}
-
-/* Bresenham line algorithm */
-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
- int pitch) {
- int steep = abs(y1 - y0) > abs(x1 - x0);
- int deltax, deltay;
- int error, ystep, y, x;
-
- if (steep) {
- int t;
- t = x0;
- x0 = y0;
- y0 = t;
-
- t = x1;
- x1 = y1;
- y1 = t;
- }
-
- if (x0 > x1) {
- int t;
- t = x0;
- x0 = x1;
- x1 = t;
-
- t = y0;
- y0 = y1;
- y1 = t;
- }
-
- deltax = x1 - x0;
- deltay = abs(y1 - y0);
- error = deltax / 2;
-
- y = y0;
-
- if (y0 < y1)
- ystep = 1;
- else
- ystep = -1;
-
- if (steep) {
- for (x = x0; x <= x1; x++) {
- plot(y, x, image, pitch);
-
- error = error - deltay;
- if (error < 0) {
- y = y + ystep;
- error = error + deltax;
- }
- }
- } else {
- for (x = x0; x <= x1; x++) {
- plot(x, y, image, pitch);
-
- error = error - deltay;
- if (error < 0) {
- y = y + ystep;
- error = error + deltax;
- }
- }
- }
-}
diff --git a/vp9/common/vp9_textblit.h b/vp9/common/vp9_textblit.h
deleted file mode 100644
index 158ec1b37..000000000
--- a/vp9/common/vp9_textblit.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_TEXTBLIT_H_
-#define VP9_COMMON_VP9_TEXTBLIT_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void vp9_blit_text(const char *msg, unsigned char *address, int pitch);
-
-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
- int pitch);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VP9_COMMON_VP9_TEXTBLIT_H_
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index af2c900e6..fde0b7e31 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -318,11 +318,11 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane,
}
}
-static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
- vpx_reader *r,
+static void predict_and_reconstruct_intra_block(TileWorkerData *twd,
MODE_INFO *const mi, int plane,
int row, int col,
TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &twd->xd;
struct macroblockd_plane *const pd = &xd->plane[plane];
PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode;
uint8_t *dst;
@@ -340,7 +340,7 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
const scan_order *sc = (plane || xd->lossless)
? &vp9_default_scan_orders[tx_size]
: &vp9_scan_orders[tx_size][tx_type];
- const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
+ const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
mi->segment_id);
if (eob > 0) {
inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst,
@@ -349,12 +349,13 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
}
}
-static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
- MODE_INFO *const mi, int plane, int row,
- int col, TX_SIZE tx_size) {
+static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi,
+ int plane, int row, int col,
+ TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &twd->xd;
struct macroblockd_plane *const pd = &xd->plane[plane];
const scan_order *sc = &vp9_default_scan_orders[tx_size];
- const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
+ const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
mi->segment_id);
if (eob > 0) {
@@ -761,15 +762,16 @@ static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return xd->mi[0];
}
-static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
- int mi_row, int mi_col, vpx_reader *r,
- BLOCK_SIZE bsize, int bwl, int bhl) {
+static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) {
VP9_COMMON *const cm = &pbi->common;
const int less8x8 = bsize < BLOCK_8X8;
const int bw = 1 << (bwl - 1);
const int bh = 1 << (bhl - 1);
const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+ vpx_reader *r = &twd->bit_reader;
+ MACROBLOCKD *const xd = &twd->xd;
MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis,
y_mis, bwl, bhl);
@@ -782,7 +784,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
"Invalid block size.");
}
- vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+ vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis);
if (mi->skip) {
dec_reset_skip_context(xd);
@@ -811,7 +813,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
for (row = 0; row < max_blocks_high; row += step)
for (col = 0; col < max_blocks_wide; col += step)
- predict_and_reconstruct_intra_block(xd, r, mi, plane, row, col,
+ predict_and_reconstruct_intra_block(twd, mi, plane, row, col,
tx_size);
}
} else {
@@ -845,7 +847,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
for (row = 0; row < max_blocks_high; row += step)
for (col = 0; col < max_blocks_wide; col += step)
eobtotal +=
- reconstruct_inter_block(xd, r, mi, plane, row, col, tx_size);
+ reconstruct_inter_block(twd, mi, plane, row, col, tx_size);
}
if (!less8x8 && eobtotal == 0) mi->skip = 1; // skip loopfilter
@@ -859,10 +861,11 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
}
}
-static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd, int mi_row,
+static INLINE int dec_partition_plane_context(TileWorkerData *twd, int mi_row,
int mi_col, int bsl) {
- const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
- const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+ const PARTITION_CONTEXT *above_ctx = twd->xd.above_seg_context + mi_col;
+ const PARTITION_CONTEXT *left_ctx =
+ twd->xd.left_seg_context + (mi_row & MI_MASK);
int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
// assert(bsl >= 0);
@@ -870,11 +873,12 @@ static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd, int mi_row,
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
-static INLINE void dec_update_partition_context(MACROBLOCKD *xd, int mi_row,
+static INLINE void dec_update_partition_context(TileWorkerData *twd, int mi_row,
int mi_col, BLOCK_SIZE subsize,
int bw) {
- PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
- PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+ PARTITION_CONTEXT *const above_ctx = twd->xd.above_seg_context + mi_col;
+ PARTITION_CONTEXT *const left_ctx =
+ twd->xd.left_seg_context + (mi_row & MI_MASK);
// update the partition context at the end notes. set partition bits
// of block sizes larger than the current one to be one, and partition
@@ -883,13 +887,14 @@ static INLINE void dec_update_partition_context(MACROBLOCKD *xd, int mi_row,
memset(left_ctx, partition_context_lookup[subsize].left, bw);
}
-static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
- vpx_reader *r, int has_rows, int has_cols,
+static PARTITION_TYPE read_partition(TileWorkerData *twd, int mi_row,
+ int mi_col, int has_rows, int has_cols,
int bsl) {
- const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
- const vpx_prob *const probs = get_partition_probs(xd, ctx);
- FRAME_COUNTS *counts = xd->counts;
+ const int ctx = dec_partition_plane_context(twd, mi_row, mi_col, bsl);
+ const vpx_prob *const probs = twd->xd.partition_probs[ctx];
+ FRAME_COUNTS *counts = twd->xd.counts;
PARTITION_TYPE p;
+ vpx_reader *r = &twd->bit_reader;
if (has_rows && has_cols)
p = (PARTITION_TYPE)vpx_read_tree(r, vp9_partition_tree, probs);
@@ -906,9 +911,9 @@ static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
}
// TODO(slavarnway): eliminate bsize and subsize in future commits
-static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
- int mi_row, int mi_col, vpx_reader *r,
- BLOCK_SIZE bsize, int n4x4_l2) {
+static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int n4x4_l2) {
VP9_COMMON *const cm = &pbi->common;
const int n8x8_l2 = n4x4_l2 - 1;
const int num_8x8_wh = 1 << n8x8_l2;
@@ -917,39 +922,39 @@ static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
BLOCK_SIZE subsize;
const int has_rows = (mi_row + hbs) < cm->mi_rows;
const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ MACROBLOCKD *const xd = &twd->xd;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
- partition =
- read_partition(xd, mi_row, mi_col, r, has_rows, has_cols, n8x8_l2);
+ partition = read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2);
subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition);
if (!hbs) {
// calculate bmode block dimensions (log 2)
xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
- decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
+ decode_block(twd, pbi, mi_row, mi_col, subsize, 1, 1);
} else {
switch (partition) {
case PARTITION_NONE:
- decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
+ decode_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2);
break;
case PARTITION_HORZ:
- decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
+ decode_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2);
if (has_rows)
- decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
+ decode_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2,
n8x8_l2);
break;
case PARTITION_VERT:
- decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
+ decode_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2);
if (has_cols)
- decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
+ decode_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
n4x4_l2);
break;
case PARTITION_SPLIT:
- decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
- decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
- decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
- decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
+ decode_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2);
+ decode_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2);
+ decode_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2);
+ decode_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize,
n8x8_l2);
break;
default: assert(0 && "Invalid partition type");
@@ -959,7 +964,7 @@ static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
// update partition context
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
- dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+ dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
}
static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
@@ -1442,8 +1447,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
- &tile_data->bit_reader, BLOCK_64X64, 4);
+ decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
}
pbi->mb.corrupted |= tile_data->xd.corrupted;
if (pbi->mb.corrupted)
@@ -1532,8 +1536,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
- &tile_data->bit_reader, BLOCK_64X64, 4);
+ decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
}
}
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 7358c9a39..4372ba037 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -241,7 +241,7 @@ static int read_mv_component(vpx_reader *r, const nmv_component *mvcomp,
// Integer part
if (class0) {
- d = vpx_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
+ d = vpx_read(r, mvcomp->class0[0]);
mag = 0;
} else {
int i;
@@ -826,8 +826,10 @@ static INLINE void copy_ref_frame_pair(MV_REFERENCE_FRAME *dst,
memcpy(dst, src, sizeof(*dst) * 2);
}
-void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
- int mi_col, vpx_reader *r, int x_mis, int y_mis) {
+void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
+ int mi_col, int x_mis, int y_mis) {
+ vpx_reader *r = &twd->bit_reader;
+ MACROBLOCKD *const xd = &twd->xd;
VP9_COMMON *const cm = &pbi->common;
MODE_INFO *const mi = xd->mi[0];
MV_REF *frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index 4e11c2fc0..b460cb8fb 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -19,8 +19,8 @@
extern "C" {
#endif
-void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
- int mi_col, vpx_reader *r, int x_mis, int y_mis);
+void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
+ int mi_col, int x_mis, int y_mis);
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index cc01909ff..7048fb1ca 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -29,9 +29,45 @@
if (counts) ++coef_counts[band][ctx][token]; \
} while (0)
-static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
+static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value,
+ int *count, unsigned int *range) {
+ const unsigned int split = (*range * prob + (256 - prob)) >> CHAR_BIT;
+ const BD_VALUE bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
+
+ if (*count < 0) {
+ r->value = *value;
+ r->count = *count;
+ vpx_reader_fill(r);
+ *value = r->value;
+ *count = r->count;
+ }
+
+ if (*value >= bigsplit) {
+ *range = *range - split;
+ *value = *value - bigsplit;
+ {
+ const int shift = vpx_norm[*range];
+ *range <<= shift;
+ *value <<= shift;
+ *count -= shift;
+ }
+ return 1;
+ }
+ *range = split;
+ {
+ const int shift = vpx_norm[*range];
+ *range <<= shift;
+ *value <<= shift;
+ *count -= shift;
+ }
+ return 0;
+}
+
+static INLINE int read_coeff(vpx_reader *r, const vpx_prob *probs, int n,
+ BD_VALUE *value, int *count, unsigned int *range) {
int i, val = 0;
- for (i = 0; i < n; ++i) val = (val << 1) | vpx_read(r, probs[i]);
+ for (i = 0; i < n; ++i)
+ val = (val << 1) | read_bool(r, probs[i], value, count, range);
return val;
}
@@ -52,7 +88,7 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
uint8_t token_cache[32 * 32];
const uint8_t *band_translate = get_band_translate(tx_size);
const int dq_shift = (tx_size == TX_32X32);
- int v, token;
+ int v;
int16_t dqv = dq[0];
const uint8_t *const cat6_prob =
#if CONFIG_VP9_HIGHBITDEPTH
@@ -66,6 +102,11 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
(xd->bd == VPX_BITS_12) ? 18 : (xd->bd == VPX_BITS_10) ? 16 :
#endif // CONFIG_VP9_HIGHBITDEPTH
14;
+ // Keep value, range, and count as locals. The compiler produces better
+ // results with the locals than using r directly.
+ BD_VALUE value = r->value;
+ unsigned int range = r->range;
+ int count = r->count;
if (counts) {
coef_counts = counts->coef[tx_size][type][ref];
@@ -77,70 +118,98 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
band = *band_translate++;
prob = coef_probs[band][ctx];
if (counts) ++eob_branch_count[band][ctx];
- if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) {
+ if (!read_bool(r, prob[EOB_CONTEXT_NODE], &value, &count, &range)) {
INCREMENT_COUNT(EOB_MODEL_TOKEN);
break;
}
- while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) {
+ while (!read_bool(r, prob[ZERO_CONTEXT_NODE], &value, &count, &range)) {
INCREMENT_COUNT(ZERO_TOKEN);
dqv = dq[1];
token_cache[scan[c]] = 0;
++c;
- if (c >= max_eob) return c; // zero tokens at the end (no eob token)
+ if (c >= max_eob) {
+ r->value = value;
+ r->range = range;
+ r->count = count;
+ return c; // zero tokens at the end (no eob token)
+ }
ctx = get_coef_context(nb, token_cache, c);
band = *band_translate++;
prob = coef_probs[band][ctx];
}
- if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) {
- INCREMENT_COUNT(ONE_TOKEN);
- token = ONE_TOKEN;
- val = 1;
- } else {
+ if (read_bool(r, prob[ONE_CONTEXT_NODE], &value, &count, &range)) {
+ const vpx_prob *p = vp9_pareto8_full[prob[PIVOT_NODE] - 1];
INCREMENT_COUNT(TWO_TOKEN);
- token = vpx_read_tree(r, vp9_coef_con_tree,
- vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
- switch (token) {
- case TWO_TOKEN:
- case THREE_TOKEN:
- case FOUR_TOKEN: val = token; break;
- case CATEGORY1_TOKEN:
- val = CAT1_MIN_VAL + read_coeff(vp9_cat1_prob, 1, r);
- break;
- case CATEGORY2_TOKEN:
- val = CAT2_MIN_VAL + read_coeff(vp9_cat2_prob, 2, r);
- break;
- case CATEGORY3_TOKEN:
- val = CAT3_MIN_VAL + read_coeff(vp9_cat3_prob, 3, r);
- break;
- case CATEGORY4_TOKEN:
- val = CAT4_MIN_VAL + read_coeff(vp9_cat4_prob, 4, r);
- break;
- case CATEGORY5_TOKEN:
- val = CAT5_MIN_VAL + read_coeff(vp9_cat5_prob, 5, r);
- break;
- case CATEGORY6_TOKEN:
- val = CAT6_MIN_VAL + read_coeff(cat6_prob, cat6_bits, r);
- break;
+ if (read_bool(r, p[0], &value, &count, &range)) {
+ if (read_bool(r, p[3], &value, &count, &range)) {
+ token_cache[scan[c]] = 5;
+ if (read_bool(r, p[5], &value, &count, &range)) {
+ if (read_bool(r, p[7], &value, &count, &range)) {
+ val = CAT6_MIN_VAL +
+ read_coeff(r, cat6_prob, cat6_bits, &value, &count, &range);
+ } else {
+ val = CAT5_MIN_VAL +
+ read_coeff(r, vp9_cat5_prob, 5, &value, &count, &range);
+ }
+ } else if (read_bool(r, p[6], &value, &count, &range)) {
+ val = CAT4_MIN_VAL +
+ read_coeff(r, vp9_cat4_prob, 4, &value, &count, &range);
+ } else {
+ val = CAT3_MIN_VAL +
+ read_coeff(r, vp9_cat3_prob, 3, &value, &count, &range);
+ }
+ } else {
+ token_cache[scan[c]] = 4;
+ if (read_bool(r, p[4], &value, &count, &range)) {
+ val = CAT2_MIN_VAL +
+ read_coeff(r, vp9_cat2_prob, 2, &value, &count, &range);
+ } else {
+ val = CAT1_MIN_VAL +
+ read_coeff(r, vp9_cat1_prob, 1, &value, &count, &range);
+ }
+ }
+ v = (val * dqv) >> dq_shift;
+ } else {
+ if (read_bool(r, p[1], &value, &count, &range)) {
+ token_cache[scan[c]] = 3;
+ v = ((3 + read_bool(r, p[2], &value, &count, &range)) * dqv) >>
+ dq_shift;
+ } else {
+ token_cache[scan[c]] = 2;
+ v = (2 * dqv) >> dq_shift;
+ }
}
+ } else {
+ INCREMENT_COUNT(ONE_TOKEN);
+ token_cache[scan[c]] = 1;
+ v = dqv >> dq_shift;
}
- v = (val * dqv) >> dq_shift;
#if CONFIG_COEFFICIENT_RANGE_CHECKING
#if CONFIG_VP9_HIGHBITDEPTH
- dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v), xd->bd);
+ dqcoeff[scan[c]] =
+ highbd_check_range(read_bool(r, 128, &value, &count, &range) ? -v : v),
+ xd->bd);
#else
- dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v);
+ dqcoeff[scan[c]] =
+ check_range(read_bool(r, 128, &value, &count, &range) ? -v : v);
#endif // CONFIG_VP9_HIGHBITDEPTH
#else
- dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v;
+ if (read_bool(r, 128, &value, &count, &range)) {
+ dqcoeff[scan[c]] = -v;
+ } else {
+ dqcoeff[scan[c]] = v;
+ }
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- token_cache[scan[c]] = vp9_pt_energy_class[token];
++c;
ctx = get_coef_context(nb, token_cache, c);
dqv = dq[1];
}
+ r->value = value;
+ r->range = range;
+ r->count = count;
return c;
}
@@ -156,9 +225,11 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l,
}
}
-int vp9_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc,
- int x, int y, TX_SIZE tx_size, vpx_reader *r,
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
+ const scan_order *sc, int x, int y, TX_SIZE tx_size,
int seg_id) {
+ vpx_reader *r = &twd->bit_reader;
+ MACROBLOCKD *xd = &twd->xd;
struct macroblockd_plane *const pd = &xd->plane[plane];
const int16_t *const dequant = pd->seg_dequant[seg_id];
int eob;
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index aa2afb16a..7b0d87601 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -19,8 +19,8 @@
extern "C" {
#endif
-int vp9_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc,
- int x, int y, TX_SIZE tx_size, vpx_reader *r,
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
+ const scan_order *sc, int x, int y, TX_SIZE tx_size,
int seg_id);
#ifdef __cplusplus
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 8e76f72fe..874a8e4b9 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -21,12 +21,10 @@
static struct vp9_token mv_joint_encodings[MV_JOINTS];
static struct vp9_token mv_class_encodings[MV_CLASSES];
static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
-static struct vp9_token mv_class0_encodings[CLASS0_SIZE];
void vp9_entropy_mv_init(void) {
vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree);
vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree);
- vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree);
vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
}
@@ -51,8 +49,7 @@ static void encode_mv_component(vpx_writer *w, int comp,
// Integer bits
if (mv_class == MV_CLASS_0) {
- vp9_write_token(w, vp9_mv_class0_tree, mvcomp->class0,
- &mv_class0_encodings[d]);
+ vpx_write(w, d, mvcomp->class0[0]);
} else {
int i;
const int n = mv_class + CLASS0_BITS - 1; // number of bits
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 1583cc8ab..6fc7cd1e3 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -180,6 +180,10 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
}
#else
int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+ if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME)
+ filt_guess = 5 * filt_guess >> 3;
+
#endif // CONFIG_VP9_HIGHBITDEPTH
if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2fd42960e..5bfc0d359 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -45,7 +45,6 @@ VP9_COMMON_SRCS-yes += common/vp9_scale.h
VP9_COMMON_SRCS-yes += common/vp9_scale.c
VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
-VP9_COMMON_SRCS-yes += common/vp9_textblit.h
VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
@@ -55,7 +54,6 @@ VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
-VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.h
VP9_COMMON_SRCS-yes += common/vp9_scan.c
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 04b1dca29..3b5dc3dda 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -829,13 +829,6 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx,
#endif
}
-static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx,
- va_list args) {
- (void)ctx;
- (void)args;
- return VPX_CODEC_INCAPABLE;
-}
-
static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const update_info = va_arg(args, int *);
@@ -1014,10 +1007,6 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
// Setters
{ VP8_SET_REFERENCE, ctrl_set_reference },
{ VP8_SET_POSTPROC, ctrl_set_postproc },
- { VP8_SET_DBG_COLOR_REF_FRAME, ctrl_set_dbg_options },
- { VP8_SET_DBG_COLOR_MB_MODES, ctrl_set_dbg_options },
- { VP8_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options },
- { VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options },
{ VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order },
{ VPXD_SET_DECRYPTOR, ctrl_set_decryptor },
{ VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
diff --git a/vpx/vp8.h b/vpx/vp8.h
index c3eb5265a..059c9d0f6 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -47,11 +47,10 @@ enum vp8_com_control_id {
VP8_SET_REFERENCE = 1,
VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */
- VP8_SET_DBG_COLOR_REF_FRAME =
- 4, /**< set the reference frames to color for each macroblock */
- VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
- VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */
- VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */
+ VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< \deprecated */
+ VP8_SET_DBG_COLOR_MB_MODES = 5, /**< \deprecated */
+ VP8_SET_DBG_COLOR_B_MODES = 6, /**< \deprecated */
+ VP8_SET_DBG_DISPLAY_MV = 7, /**< \deprecated */
/* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
* for its control ids. These should be migrated to something like the
@@ -133,13 +132,13 @@ VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *)
#define VPX_CTRL_VP8_COPY_REFERENCE
VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *)
#define VPX_CTRL_VP8_SET_POSTPROC
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_REF_FRAME, int)
#define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int)
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_MB_MODES, int)
#define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int)
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_B_MODES, int)
#define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int)
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_DISPLAY_MV, int)
#define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV
VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *)
#define VPX_CTRL_VP9_GET_REFERENCE
diff --git a/vpx_dsp/arm/loopfilter_16_neon.c b/vpx_dsp/arm/loopfilter_16_neon.c
deleted file mode 100644
index 9607bb240..000000000
--- a/vpx_dsp/arm/loopfilter_16_neon.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit
- uint8x16_t qlimit, // limit
- uint8x16_t qthresh, // thresh
- uint8x16_t q3, // p3
- uint8x16_t q4, // p2
- uint8x16_t q5, // p1
- uint8x16_t q6, // p0
- uint8x16_t q7, // q0
- uint8x16_t q8, // q1
- uint8x16_t q9, // q2
- uint8x16_t q10, // q3
- uint8x16_t *q5r, // p1
- uint8x16_t *q6r, // p0
- uint8x16_t *q7r, // q0
- uint8x16_t *q8r) { // q1
- uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
- int16x8_t q2s16, q11s16;
- uint16x8_t q4u16;
- int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
- int8x8_t d2s8, d3s8;
-
- q11u8 = vabdq_u8(q3, q4);
- q12u8 = vabdq_u8(q4, q5);
- q13u8 = vabdq_u8(q5, q6);
- q14u8 = vabdq_u8(q8, q7);
- q3 = vabdq_u8(q9, q8);
- q4 = vabdq_u8(q10, q9);
-
- q11u8 = vmaxq_u8(q11u8, q12u8);
- q12u8 = vmaxq_u8(q13u8, q14u8);
- q3 = vmaxq_u8(q3, q4);
- q15u8 = vmaxq_u8(q11u8, q12u8);
-
- q9 = vabdq_u8(q6, q7);
-
- // vp8_hevmask
- q13u8 = vcgtq_u8(q13u8, qthresh);
- q14u8 = vcgtq_u8(q14u8, qthresh);
- q15u8 = vmaxq_u8(q15u8, q3);
-
- q2u8 = vabdq_u8(q5, q8);
- q9 = vqaddq_u8(q9, q9);
-
- q15u8 = vcgeq_u8(qlimit, q15u8);
-
- // vp8_filter() function
- // convert to signed
- q10 = vdupq_n_u8(0x80);
- q8 = veorq_u8(q8, q10);
- q7 = veorq_u8(q7, q10);
- q6 = veorq_u8(q6, q10);
- q5 = veorq_u8(q5, q10);
-
- q2u8 = vshrq_n_u8(q2u8, 1);
- q9 = vqaddq_u8(q9, q2u8);
-
- q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
- vget_low_s8(vreinterpretq_s8_u8(q6)));
- q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
- vget_high_s8(vreinterpretq_s8_u8(q6)));
-
- q9 = vcgeq_u8(qblimit, q9);
-
- q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
-
- q14u8 = vorrq_u8(q13u8, q14u8);
-
- q4u16 = vdupq_n_u16(3);
- q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
- q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
-
- q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
- q15u8 = vandq_u8(q15u8, q9);
-
- q1s8 = vreinterpretq_s8_u8(q1u8);
- q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
- q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
-
- q4 = vdupq_n_u8(3);
- q9 = vdupq_n_u8(4);
- // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
- d2s8 = vqmovn_s16(q2s16);
- d3s8 = vqmovn_s16(q11s16);
- q1s8 = vcombine_s8(d2s8, d3s8);
- q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
- q1s8 = vreinterpretq_s8_u8(q1u8);
-
- q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
- q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
- q2s8 = vshrq_n_s8(q2s8, 3);
- q1s8 = vshrq_n_s8(q1s8, 3);
-
- q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
- q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
-
- q1s8 = vrshrq_n_s8(q1s8, 1);
- q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
-
- q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
- q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
-
- *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
- *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
- *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
- *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
- return;
-}
-
-void vpx_lpf_horizontal_4_dual_neon(
- uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
- const uint8_t *limit1, const uint8_t *thresh1) {
- uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
- uint8x16_t qblimit, qlimit, qthresh;
- uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
-
- dblimit0 = vld1_u8(blimit0);
- dlimit0 = vld1_u8(limit0);
- dthresh0 = vld1_u8(thresh0);
- dblimit1 = vld1_u8(blimit1);
- dlimit1 = vld1_u8(limit1);
- dthresh1 = vld1_u8(thresh1);
- qblimit = vcombine_u8(dblimit0, dblimit1);
- qlimit = vcombine_u8(dlimit0, dlimit1);
- qthresh = vcombine_u8(dthresh0, dthresh1);
-
- s -= (p << 2);
-
- q3u8 = vld1q_u8(s);
- s += p;
- q4u8 = vld1q_u8(s);
- s += p;
- q5u8 = vld1q_u8(s);
- s += p;
- q6u8 = vld1q_u8(s);
- s += p;
- q7u8 = vld1q_u8(s);
- s += p;
- q8u8 = vld1q_u8(s);
- s += p;
- q9u8 = vld1q_u8(s);
- s += p;
- q10u8 = vld1q_u8(s);
-
- loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
- q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
-
- s -= (p * 5);
- vst1q_u8(s, q5u8);
- s += p;
- vst1q_u8(s, q6u8);
- s += p;
- vst1q_u8(s, q7u8);
- s += p;
- vst1q_u8(s, q8u8);
- return;
-}
diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c
deleted file mode 100644
index 1c1e80e00..000000000
--- a/vpx_dsp/arm/loopfilter_4_neon.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-
-static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p3
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d4ru8, // p1
- uint8x8_t *d5ru8, // p0
- uint8x8_t *d6ru8, // q0
- uint8x8_t *d7ru8) { // q1
- uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
- int16x8_t q12s16;
- int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
-
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d3u8 = vabd_u8(d17u8, d16u8);
- d4u8 = vabd_u8(d18u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
- d3u8 = vmax_u8(d3u8, d4u8);
- d23u8 = vmax_u8(d19u8, d20u8);
-
- d17u8 = vabd_u8(d6u8, d7u8);
-
- d21u8 = vcgt_u8(d21u8, dthresh);
- d22u8 = vcgt_u8(d22u8, dthresh);
- d23u8 = vmax_u8(d23u8, d3u8);
-
- d28u8 = vabd_u8(d5u8, d16u8);
- d17u8 = vqadd_u8(d17u8, d17u8);
-
- d23u8 = vcge_u8(dlimit, d23u8);
-
- d18u8 = vdup_n_u8(0x80);
- d5u8 = veor_u8(d5u8, d18u8);
- d6u8 = veor_u8(d6u8, d18u8);
- d7u8 = veor_u8(d7u8, d18u8);
- d16u8 = veor_u8(d16u8, d18u8);
-
- d28u8 = vshr_n_u8(d28u8, 1);
- d17u8 = vqadd_u8(d17u8, d28u8);
-
- d19u8 = vdup_n_u8(3);
-
- d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
-
- d17u8 = vcge_u8(dblimit, d17u8);
-
- d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
-
- d22u8 = vorr_u8(d21u8, d22u8);
-
- q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
- d23u8 = vand_u8(d23u8, d17u8);
-
- q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
-
- d17u8 = vdup_n_u8(4);
-
- d27s8 = vqmovn_s16(q12s16);
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
- d27s8 = vreinterpret_s8_u8(d27u8);
-
- d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
- d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
- d28s8 = vshr_n_s8(d28s8, 3);
- d27s8 = vshr_n_s8(d27s8, 3);
-
- d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
-
- d27s8 = vrshr_n_s8(d27s8, 1);
- d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
-
- d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
- d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
-
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
- *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
- *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
- *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
- return;
-}
-
-void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < 1; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
- d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
-
- s -= (pitch * 5);
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- s += pitch;
- vst1_u8(s, d6u8);
- s += pitch;
- vst1_u8(s, d7u8);
- }
- return;
-}
-
-void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i, pitch8;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- pitch8 = pitch * 8;
- for (i = 0; i < 1; i++, src += pitch8) {
- s = src - (i + 1) * 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
- d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
-
- d4Result.val[0] = d4u8;
- d4Result.val[1] = d5u8;
- d4Result.val[2] = d6u8;
- d4Result.val[3] = d7u8;
-
- src -= 2;
- vst4_lane_u8(src, d4Result, 0);
- src += pitch;
- vst4_lane_u8(src, d4Result, 1);
- src += pitch;
- vst4_lane_u8(src, d4Result, 2);
- src += pitch;
- vst4_lane_u8(src, d4Result, 3);
- src += pitch;
- vst4_lane_u8(src, d4Result, 4);
- src += pitch;
- vst4_lane_u8(src, d4Result, 5);
- src += pitch;
- vst4_lane_u8(src, d4Result, 6);
- src += pitch;
- vst4_lane_u8(src, d4Result, 7);
- }
- return;
-}
diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
deleted file mode 100644
index 8641541b0..000000000
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ /dev/null
@@ -1,445 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-
-static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p2
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d0ru8, // p1
- uint8x8_t *d1ru8, // p1
- uint8x8_t *d2ru8, // p0
- uint8x8_t *d3ru8, // q0
- uint8x8_t *d4ru8, // q1
- uint8x8_t *d5ru8) { // q1
- uint32_t flat;
- uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
- uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
- int16x8_t q15s16;
- uint16x8_t q10u16, q14u16;
- int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
-
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d23u8 = vabd_u8(d17u8, d16u8);
- d24u8 = vabd_u8(d18u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
-
- d25u8 = vabd_u8(d6u8, d4u8);
-
- d23u8 = vmax_u8(d23u8, d24u8);
-
- d26u8 = vabd_u8(d7u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
-
- d24u8 = vabd_u8(d6u8, d7u8);
- d27u8 = vabd_u8(d3u8, d6u8);
- d28u8 = vabd_u8(d18u8, d7u8);
-
- d19u8 = vmax_u8(d19u8, d23u8);
-
- d23u8 = vabd_u8(d5u8, d16u8);
- d24u8 = vqadd_u8(d24u8, d24u8);
-
- d19u8 = vcge_u8(dlimit, d19u8);
-
- d25u8 = vmax_u8(d25u8, d26u8);
- d26u8 = vmax_u8(d27u8, d28u8);
-
- d23u8 = vshr_n_u8(d23u8, 1);
-
- d25u8 = vmax_u8(d25u8, d26u8);
-
- d24u8 = vqadd_u8(d24u8, d23u8);
-
- d20u8 = vmax_u8(d20u8, d25u8);
-
- d23u8 = vdup_n_u8(1);
- d24u8 = vcge_u8(dblimit, d24u8);
-
- d21u8 = vcgt_u8(d21u8, dthresh);
-
- d20u8 = vcge_u8(d23u8, d20u8);
-
- d19u8 = vand_u8(d19u8, d24u8);
-
- d23u8 = vcgt_u8(d22u8, dthresh);
-
- d20u8 = vand_u8(d20u8, d19u8);
-
- d22u8 = vdup_n_u8(0x80);
-
- d23u8 = vorr_u8(d21u8, d23u8);
-
- q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
-
- d30u8 = vshrn_n_u16(q10u16, 4);
- flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
-
- if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
- d27u8 = vdup_n_u8(3);
- d21u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- *d0ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- *d1ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- *d2ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d3ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d4ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d5ru8 = vqrshrn_n_u16(q14u16, 3);
- } else {
- d21u8 = veor_u8(d7u8, d22u8);
- d24u8 = veor_u8(d6u8, d22u8);
- d25u8 = veor_u8(d5u8, d22u8);
- d26u8 = veor_u8(d16u8, d22u8);
-
- d27u8 = vdup_n_u8(3);
-
- d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
- d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
-
- q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
-
- d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
- q15s16 = vaddw_s8(q15s16, d29s8);
-
- d29u8 = vdup_n_u8(4);
-
- d28s8 = vqmovn_s16(q15s16);
-
- d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
- d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
- d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
- d30s8 = vshr_n_s8(d30s8, 3);
- d29s8 = vshr_n_s8(d29s8, 3);
-
- d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
- d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
-
- d29s8 = vrshr_n_s8(d29s8, 1);
- d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
- d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
-
- if (flat == 0) { // filter_branch_only
- *d0ru8 = d4u8;
- *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
- *d5ru8 = d17u8;
- return;
- }
-
- d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-
- d23u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
-
- d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
-
- q14u16 = vaddw_u8(q14u16, d5u8);
-
- d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
-
- d30u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
-
- d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
-
- d31u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
-
- *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
-
- d23u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
-
- *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
-
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
-
- d22u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
-
- d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
-
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
-
- d6u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
-
- d7u8 = vqrshrn_n_u16(q14u16, 3);
-
- *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
- *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
- *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
- }
- return;
-}
-
-void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < 1; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
- d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
- &d5u8);
-
- s -= (pitch * 6);
- vst1_u8(s, d0u8);
- s += pitch;
- vst1_u8(s, d1u8);
- s += pitch;
- vst1_u8(s, d2u8);
- s += pitch;
- vst1_u8(s, d3u8);
- s += pitch;
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- }
- return;
-}
-
-void vpx_lpf_horizontal_8_dual_neon(
- uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
- const uint8_t *limit1, const uint8_t *thresh1) {
- vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
- vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
- uint8x8x2_t d2Result;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- for (i = 0; i < 1; i++) {
- s = src + (i * (pitch << 3)) - 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
- d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
- &d5u8);
-
- d4Result.val[0] = d0u8;
- d4Result.val[1] = d1u8;
- d4Result.val[2] = d2u8;
- d4Result.val[3] = d3u8;
-
- d2Result.val[0] = d4u8;
- d2Result.val[1] = d5u8;
-
- s = src - 3;
- vst4_lane_u8(s, d4Result, 0);
- s += pitch;
- vst4_lane_u8(s, d4Result, 1);
- s += pitch;
- vst4_lane_u8(s, d4Result, 2);
- s += pitch;
- vst4_lane_u8(s, d4Result, 3);
- s += pitch;
- vst4_lane_u8(s, d4Result, 4);
- s += pitch;
- vst4_lane_u8(s, d4Result, 5);
- s += pitch;
- vst4_lane_u8(s, d4Result, 6);
- s += pitch;
- vst4_lane_u8(s, d4Result, 7);
-
- s = src + 1;
- vst2_lane_u8(s, d2Result, 0);
- s += pitch;
- vst2_lane_u8(s, d2Result, 1);
- s += pitch;
- vst2_lane_u8(s, d2Result, 2);
- s += pitch;
- vst2_lane_u8(s, d2Result, 3);
- s += pitch;
- vst2_lane_u8(s, d2Result, 4);
- s += pitch;
- vst2_lane_u8(s, d2Result, 5);
- s += pitch;
- vst2_lane_u8(s, d2Result, 6);
- s += pitch;
- vst2_lane_u8(s, d2Result, 7);
- }
- return;
-}
-
-void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
- vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
diff --git a/vpx_dsp/arm/loopfilter_mb_neon.c b/vpx_dsp/arm/loopfilter_mb_neon.c
index aa61220d3..f95267472 100644
--- a/vpx_dsp/arm/loopfilter_mb_neon.c
+++ b/vpx_dsp/arm/loopfilter_mb_neon.c
@@ -31,6 +31,15 @@ FUN_LOAD_THRESH(8, _) // load_thresh_8
FUN_LOAD_THRESH(16, q_) // load_thresh_16
#undef FUN_LOAD_THRESH
+static INLINE void load_thresh_8_dual(
+ const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
+ uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
+ *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1));
+ *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1));
+ *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1));
+}
+
// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a
// pixel. When used to control filter branches, we only detect whether it is all
// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
@@ -56,33 +65,51 @@ static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
return calc_flat_status_8(flat_4bit);
}
-#define FUN_FILTER_FLAT_HEV_MASK(w, r) \
- static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \
+#define FUN_FILTER_HEV_MASK4(w, r) \
+ static INLINE uint8x##w##_t filter_hev_mask4_##w( \
const uint8x##w##_t limit, const uint8x##w##_t blimit, \
const uint8x##w##_t thresh, const uint8x##w##_t p3, \
const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
- const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \
- uint8x##w##_t *hev) { \
- uint8x##w##_t t0, t1, mask; \
+ const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) { \
+ uint8x##w##_t max, t0, t1; \
\
- mask = vabd##r##u8(p1, p0); \
- mask = vmax##r##u8(mask, vabd##r##u8(q1, q0)); \
- *hev = vcgt##r##u8(mask, thresh); \
- *flat = vmax##r##u8(mask, vabd##r##u8(p2, p0)); \
- mask = vmax##r##u8(mask, vabd##r##u8(p3, p2)); \
- mask = vmax##r##u8(mask, vabd##r##u8(p2, p1)); \
- mask = vmax##r##u8(mask, vabd##r##u8(q2, q1)); \
- mask = vmax##r##u8(mask, vabd##r##u8(q3, q2)); \
+ max = vabd##r##u8(p1, p0); \
+ max = vmax##r##u8(max, vabd##r##u8(q1, q0)); \
+ *hev = vcgt##r##u8(max, thresh); \
+ *mask = vmax##r##u8(max, vabd##r##u8(p3, p2)); \
+ *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1)); \
+ *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1)); \
+ *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2)); \
t0 = vabd##r##u8(p0, q0); \
t1 = vabd##r##u8(p1, q1); \
t0 = vqadd##r##u8(t0, t0); \
t1 = vshr##r##n_u8(t1, 1); \
t0 = vqadd##r##u8(t0, t1); \
- mask = vcle##r##u8(mask, limit); \
+ *mask = vcle##r##u8(*mask, limit); \
t0 = vcle##r##u8(t0, blimit); \
- mask = vand##r##u8(mask, t0); \
+ *mask = vand##r##u8(*mask, t0); \
+ \
+ return max; \
+ }
+
+FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8
+FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16
+#undef FUN_FILTER_HEV_MASK4
+
+#define FUN_FILTER_FLAT_HEV_MASK(w, r) \
+ static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \
+ const uint8x##w##_t limit, const uint8x##w##_t blimit, \
+ const uint8x##w##_t thresh, const uint8x##w##_t p3, \
+ const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
+ const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
+ const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \
+ uint8x##w##_t *hev) { \
+ uint8x##w##_t max, mask; \
\
+ max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
+ q2, q3, hev, &mask); \
+ *flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \
*flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \
*flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \
*flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \
@@ -420,6 +447,33 @@ FUN_FILTER4(8, _) // filter4_8
FUN_FILTER4(16, q_) // filter4_16
#undef FUN_FILTER4
+#define FUN_FILTER8(w) \
+ static INLINE void filter8_##w( \
+ const uint8x##w##_t mask, const uint8x##w##_t flat, \
+ const uint32_t flat_status, const uint8x##w##_t hev, \
+ const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \
+ const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \
+ const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \
+ uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \
+ uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \
+ if (flat_status != (uint32_t)-2) { \
+ filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \
+ *op2 = p2; \
+ *oq2 = q2; \
+ if (flat_status) { \
+ apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
+ op0, oq0, oq1, oq2); \
+ } \
+ } else { \
+ calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \
+ oq0, oq1, oq2); \
+ } \
+ }
+
+FUN_FILTER8(8) // filter8_8
+FUN_FILTER8(16) // filter8_16
+#undef FUN_FILTER8
+
#define FUN_FILTER16(w) \
static INLINE void filter16_##w( \
const uint8x##w##_t mask, const uint8x##w##_t flat, \
@@ -481,6 +535,7 @@ FUN_FILTER16(16) // filter16_16
*q3 = vld1##r##u8(s); \
}
+FUN_LOAD8(8, _) // load_8x8
FUN_LOAD8(16, q_) // load_16x8
#undef FUN_LOAD8
@@ -529,6 +584,71 @@ FUN_LOAD16(8, _) // load_8x16
FUN_LOAD16(16, q_) // load_16x16
#undef FUN_LOAD16
+#define FUN_STORE4(w, r) \
+ static INLINE void store_##w##x4( \
+ uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+ const uint8x##w##_t s2, const uint8x##w##_t s3) { \
+ vst1##r##u8(s, s0); \
+ s += p; \
+ vst1##r##u8(s, s1); \
+ s += p; \
+ vst1##r##u8(s, s2); \
+ s += p; \
+ vst1##r##u8(s, s3); \
+ }
+
+FUN_STORE4(8, _) // store_8x4
+FUN_STORE4(16, q_) // store_16x4
+#undef FUN_STORE4
+
+#define FUN_STORE6(w, r) \
+ static INLINE void store_##w##x6( \
+ uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+ const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \
+ const uint8x##w##_t s5) { \
+ vst1##r##u8(s, s0); \
+ s += p; \
+ vst1##r##u8(s, s1); \
+ s += p; \
+ vst1##r##u8(s, s2); \
+ s += p; \
+ vst1##r##u8(s, s3); \
+ s += p; \
+ vst1##r##u8(s, s4); \
+ s += p; \
+ vst1##r##u8(s, s5); \
+ }
+
+FUN_STORE6(8, _) // store_8x6
+FUN_STORE6(16, q_) // store_16x6
+#undef FUN_STORE6
+
+static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
+ const uint8x8_t p0, const uint8x8_t q0,
+ const uint8x8_t q1) {
+ uint8x8x4_t o;
+
+ o.val[0] = p1;
+ o.val[1] = p0;
+ o.val[2] = q0;
+ o.val[3] = q1;
+ vst4_lane_u8(s, o, 0);
+ s += p;
+ vst4_lane_u8(s, o, 1);
+ s += p;
+ vst4_lane_u8(s, o, 2);
+ s += p;
+ vst4_lane_u8(s, o, 3);
+ s += p;
+ vst4_lane_u8(s, o, 4);
+ s += p;
+ vst4_lane_u8(s, o, 5);
+ s += p;
+ vst4_lane_u8(s, o, 6);
+ s += p;
+ vst4_lane_u8(s, o, 7);
+}
+
static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
const uint8x8_t s1, const uint8x8_t s2,
const uint8x8_t s3, const uint8x8_t s4,
@@ -566,53 +686,64 @@ static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
vst3_lane_u8(s + 0, o1, 7);
}
-static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
- const uint8x8_t p0, const uint8x8_t q0,
- const uint8x8_t q1) {
- uint8x8x4_t o;
+#define FUN_STORE8(w, r) \
+ static INLINE void store_##w##x8( \
+ uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+ const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \
+ const uint8x##w##_t s5, const uint8x##w##_t s6, \
+ const uint8x##w##_t s7) { \
+ vst1##r##u8(s, s0); \
+ s += p; \
+ vst1##r##u8(s, s1); \
+ s += p; \
+ vst1##r##u8(s, s2); \
+ s += p; \
+ vst1##r##u8(s, s3); \
+ s += p; \
+ vst1##r##u8(s, s4); \
+ s += p; \
+ vst1##r##u8(s, s5); \
+ s += p; \
+ vst1##r##u8(s, s6); \
+ s += p; \
+ vst1##r##u8(s, s7); \
+ }
- o.val[0] = p1;
- o.val[1] = p0;
- o.val[2] = q0;
- o.val[3] = q1;
- vst4_lane_u8(s, o, 0);
- s += p;
- vst4_lane_u8(s, o, 1);
- s += p;
- vst4_lane_u8(s, o, 2);
- s += p;
- vst4_lane_u8(s, o, 3);
- s += p;
- vst4_lane_u8(s, o, 4);
- s += p;
- vst4_lane_u8(s, o, 5);
- s += p;
- vst4_lane_u8(s, o, 6);
- s += p;
- vst4_lane_u8(s, o, 7);
-}
+FUN_STORE8(8, _) // store_8x8
+FUN_STORE8(16, q_) // store_16x8
+#undef FUN_STORE8
-static INLINE void store_16x8(uint8_t *s, const int p, const uint8x16_t s0,
- const uint8x16_t s1, const uint8x16_t s2,
- const uint8x16_t s3, const uint8x16_t s4,
- const uint8x16_t s5, const uint8x16_t s6,
- const uint8x16_t s7) {
- vst1q_u8(s, s0);
- s += p;
- vst1q_u8(s, s1);
- s += p;
- vst1q_u8(s, s2);
- s += p;
- vst1q_u8(s, s3);
- s += p;
- vst1q_u8(s, s4);
- s += p;
- vst1q_u8(s, s5);
- s += p;
- vst1q_u8(s, s6);
- s += p;
- vst1q_u8(s, s7);
-}
+#define FUN_STORE14(w, r) \
+ static INLINE void store_##w##x14( \
+ uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
+ const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
+ const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
+ const uint32_t flat_status, const uint32_t flat2_status) { \
+ if (flat_status) { \
+ if (flat2_status) { \
+ vst1##r##u8(s - 7 * p, p6); \
+ vst1##r##u8(s - 6 * p, p5); \
+ vst1##r##u8(s - 5 * p, p4); \
+ vst1##r##u8(s - 4 * p, p3); \
+ vst1##r##u8(s + 3 * p, q3); \
+ vst1##r##u8(s + 4 * p, q4); \
+ vst1##r##u8(s + 5 * p, q5); \
+ vst1##r##u8(s + 6 * p, q6); \
+ } \
+ vst1##r##u8(s - 3 * p, p2); \
+ vst1##r##u8(s + 2 * p, q2); \
+ } \
+ vst1##r##u8(s - 2 * p, p1); \
+ vst1##r##u8(s - 1 * p, p0); \
+ vst1##r##u8(s + 0 * p, q0); \
+ vst1##r##u8(s + 1 * p, q1); \
+ }
+
+FUN_STORE14(8, _) // store_8x14
+FUN_STORE14(16, q_) // store_16x14
+#undef FUN_STORE14
static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
const uint8x16_t s1, const uint8x16_t s2,
@@ -656,37 +787,160 @@ static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
vst1q_u8(s, s15);
}
-#define FUN_STORE14(w, r) \
- static INLINE void store_##w##x14( \
- uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
- const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
- const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
- const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
- const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
- const uint32_t flat_status, const uint32_t flat2_status) { \
- if (flat_status) { \
- if (flat2_status) { \
- vst1##r##u8(s - 7 * p, p6); \
- vst1##r##u8(s - 6 * p, p5); \
- vst1##r##u8(s - 5 * p, p4); \
- vst1##r##u8(s - 4 * p, p3); \
- vst1##r##u8(s + 3 * p, q3); \
- vst1##r##u8(s + 4 * p, q4); \
- vst1##r##u8(s + 5 * p, q5); \
- vst1##r##u8(s + 6 * p, q6); \
- } \
- vst1##r##u8(s - 3 * p, p2); \
- vst1##r##u8(s + 2 * p, q2); \
- } \
- vst1##r##u8(s - 2 * p, p1); \
- vst1##r##u8(s - 1 * p, p0); \
- vst1##r##u8(s + 0 * p, q0); \
- vst1##r##u8(s + 1 * p, q1); \
+#define FUN_HOR_4_KERNEL(name, w) \
+ static INLINE void lpf_horizontal_4##name##kernel( \
+ uint8_t *s, const int p, const uint8x##w##_t blimit, \
+ const uint8x##w##_t limit, const uint8x##w##_t thresh) { \
+ uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \
+ \
+ load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \
+ filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \
+ q3, &hev, &mask); \
+ filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \
+ store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \
}
-FUN_STORE14(8, _) // store_8x14
-FUN_STORE14(16, q_) // store_16x14
-#undef FUN_STORE14
+FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel
+FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel
+#undef FUN_HOR_4_KERNEL
+
+void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
+}
+
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x16_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+ &blimit_vec, &limit_vec, &thresh_vec);
+ lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
+}
+
+void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
+ store_4x8(s - 2, p, p1, p0, q0, q1);
+}
+
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ s15;
+
+ load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+ &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
+ &s11, &s12, &s13, &s14, &s15);
+ transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
+ s -= 2;
+ store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
+ vget_low_u8(q1));
+ store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0),
+ vget_high_u8(q1));
+}
+
+void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+ p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+ filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2);
+ store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+ &blimit_vec, &limit_vec, &thresh_vec);
+ load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+ p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+ filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2);
+ store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+ p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+ filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2);
+  // Note: transpose + store_8x8() is faster than store_6x8().
+ transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
+ store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
+}
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ s15;
+ uint32_t flat_status;
+
+ load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+ &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
+ &s11, &s12, &s13, &s14, &s15);
+ transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+ p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+ filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2);
+  // Note: store_6x8() twice is faster than transpose + store_8x16().
+ store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
+ vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
+ store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
+ vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
+ vget_high_u8(oq2));
+}
#define FUN_LPF_16_KERNEL(name, w) \
static INLINE void lpf_16##name##kernel( \
@@ -784,7 +1038,9 @@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
&s6, &s7);
store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
} else {
- store_6x8(s + 8, p, op2, op1, op0, oq0, oq1, oq2);
+      // Note: transpose + store_8x8() is faster than store_6x8().
+ transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
+ store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
}
} else {
store_4x8(s + 6, p, op1, op0, oq0, oq1);
@@ -819,6 +1075,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
s13, s14, s15);
} else {
+    // Note: store_6x8() twice is faster than transpose + store_8x16().
s += 8;
store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 2318fb44b..3d0b41f93 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -39,28 +39,84 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
return b0;
}
+// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
+// 'q' registers here to save some instructions.
+static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
+ uint8x8_t *a6, uint8x8_t *a7) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+
+ const uint8x16x2_t b0 =
+ vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
+ const uint8x16x2_t b1 =
+ vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
int16x8_t *a2, int16x8_t *a3,
int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
- // a1: 08 09 10 11 12 13 14 15
- // a2: 16 17 18 19 20 21 22 23
- // a3: 24 25 26 27 28 29 30 31
- // a4: 32 33 34 35 36 37 38 39
- // a5: 40 41 42 43 44 45 46 47
- // a6: 48 49 50 51 52 53 54 55
- // a7: 56 57 58 59 60 61 62 63
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
// to:
- // b0.val[0]: 00 08 02 10 04 12 06 14
- // b0.val[1]: 01 09 03 11 05 13 07 15
- // b1.val[0]: 16 24 18 26 20 28 22 30
- // b1.val[1]: 17 25 19 27 21 29 23 31
- // b2.val[0]: 32 40 34 42 36 44 38 46
- // b2.val[1]: 33 41 35 43 37 45 39 47
- // b3.val[0]: 48 56 50 58 52 60 54 62
- // b3.val[1]: 49 57 51 59 53 61 55 63
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
@@ -68,14 +124,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
// Swap 32 bit elements resulting in:
- // c0.val[0]: 00 08 16 24 04 12 20 28
- // c0.val[1]: 02 10 18 26 06 14 22 30
- // c1.val[0]: 01 09 17 25 05 13 21 29
- // c1.val[1]: 03 11 19 27 07 15 23 31
- // c2.val[0]: 32 40 48 56 36 44 52 60
- // c2.val[1]: 34 42 50 58 38 46 54 62
- // c3.val[0]: 33 41 49 57 37 45 53 61
- // c3.val[1]: 35 43 51 59 39 47 55 63
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
vreinterpretq_s32_s16(b1.val[0]));
@@ -87,14 +143,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
vreinterpretq_s32_s16(b3.val[1]));
// Swap 64 bit elements resulting in:
- // d0.val[0]: 00 08 16 24 32 40 48 56
- // d0.val[1]: 04 12 20 28 36 44 52 60
- // d1.val[0]: 01 09 17 25 33 41 49 57
- // d1.val[1]: 05 13 21 29 37 45 53 61
- // d2.val[0]: 02 10 18 26 34 42 50 58
- // d2.val[1]: 06 14 22 30 38 46 54 62
- // d3.val[0]: 03 11 19 27 35 43 51 59
- // d3.val[1]: 07 15 23 31 39 47 55 63
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]);
const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]);
const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]);
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c
index f469afc4e..b6d7f86a4 100644
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -75,7 +75,7 @@ unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
- return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8
+ return *sse - ((sum * sum) >> 6);
}
unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
@@ -83,7 +83,7 @@ unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
- return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16
+ return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
}
unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
@@ -91,7 +91,7 @@ unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
- return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}
unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
@@ -104,7 +104,7 @@ unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
32, 32, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
- return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
+ return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
}
unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
@@ -117,7 +117,7 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
- return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
+ return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
}
unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
@@ -141,7 +141,7 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
b_stride, 64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
- return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
+ return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
}
unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index def9c8e1b..a78041ce7 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -136,8 +136,8 @@ DSP_SRCS-yes += loopfilter.c
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c
-DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c
ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/loopfilter_neon.c
DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM)
@@ -145,9 +145,6 @@ DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
DSP_SRCS-yes += arm/loopfilter_mb_neon.c
-DSP_SRCS-yes += arm/loopfilter_16_neon.c
-DSP_SRCS-yes += arm/loopfilter_8_neon.c
-DSP_SRCS-yes += arm/loopfilter_4_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 330ae8d6a..cb56ad078 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -3066,17 +3066,7 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
in[6] = load_input_data(input + 192);
in[7] = load_input_data(input + 224);
- for (i = 8; i < 32; ++i) {
- in[i] = _mm_setzero_si128();
- }
-
array_transpose_8x8(in, in);
- // TODO(hkuang): Following transposes are unnecessary. But remove them will
- // lead to performance drop on some devices.
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
IDCT32_34
// 1_D: Store 32 intermediate results for each 8x32 block.
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index 7bc2693cf..8428e0520 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -61,7 +61,7 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
vpx_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 9);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
}
unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
@@ -70,7 +70,7 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
vpx_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 10);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
}
unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
@@ -79,7 +79,7 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
vpx_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 12);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
}
unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
@@ -88,7 +88,7 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
vpx_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 11);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
}
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
@@ -115,7 +115,7 @@ unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
dst + 32, dst_stride, 64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
- return *sse - (((int64_t)se * se) >> 12);
+ return *sse - (uint32_t)(((int64_t)se * se) >> 12);
}
unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
@@ -125,7 +125,7 @@ unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
unsigned int *sse) {
const int se = vpx_sub_pixel_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
- return *sse - (((int64_t)se * se) >> 10);
+ return *sse - (uint32_t)(((int64_t)se * se) >> 10);
}
unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
@@ -142,7 +142,7 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
*sse = sse1 + sse2;
- return *sse - (((int64_t)se * se) >> 12);
+ return *sse - (uint32_t)(((int64_t)se * se) >> 12);
}
unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
@@ -151,5 +151,5 @@ unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
// Process 32 elements in parallel.
const int se = vpx_sub_pixel_avg_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
- return *sse - (((int64_t)se * se) >> 10);
+ return *sse - (uint32_t)(((int64_t)se * se) >> 10);
}
diff --git a/vpxdec.c b/vpxdec.c
index c1ff5a3f8..ab638ec6b 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -125,30 +125,11 @@ static const arg_def_t deblock =
ARG_DEF(NULL, "deblock", 0, "Enable VP8 deblocking");
static const arg_def_t demacroblock_level = ARG_DEF(
NULL, "demacroblock-level", 1, "Enable VP8 demacroblocking, w/ level");
-static const arg_def_t pp_debug_info =
- ARG_DEF(NULL, "pp-debug-info", 1, "Enable VP8 visible debug info");
-static const arg_def_t pp_disp_ref_frame =
- ARG_DEF(NULL, "pp-dbg-ref-frame", 1,
- "Display only selected reference frame per macro block");
-static const arg_def_t pp_disp_mb_modes = ARG_DEF(
- NULL, "pp-dbg-mb-modes", 1, "Display only selected macro block modes");
-static const arg_def_t pp_disp_b_modes =
- ARG_DEF(NULL, "pp-dbg-b-modes", 1, "Display only selected block modes");
-static const arg_def_t pp_disp_mvs =
- ARG_DEF(NULL, "pp-dbg-mvs", 1, "Draw only selected motion vectors");
static const arg_def_t mfqe =
ARG_DEF(NULL, "mfqe", 0, "Enable multiframe quality enhancement");
-static const arg_def_t *vp8_pp_args[] = { &addnoise_level,
- &deblock,
- &demacroblock_level,
- &pp_debug_info,
- &pp_disp_ref_frame,
- &pp_disp_mb_modes,
- &pp_disp_b_modes,
- &pp_disp_mvs,
- &mfqe,
- NULL };
+static const arg_def_t *vp8_pp_args[] = { &addnoise_level, &deblock,
+ &demacroblock_level, &mfqe, NULL };
#endif
#if CONFIG_LIBYUV
@@ -539,10 +520,6 @@ static int main_loop(int argc, const char **argv_) {
#endif
#if CONFIG_VP8_DECODER
vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 };
- int vp8_dbg_color_ref_frame = 0;
- int vp8_dbg_color_mb_modes = 0;
- int vp8_dbg_color_b_modes = 0;
- int vp8_dbg_display_mv = 0;
#endif
int frames_corrupted = 0;
int dec_flags = 0;
@@ -647,37 +624,6 @@ static int main_loop(int argc, const char **argv_) {
} else if (arg_match(&arg, &mfqe, argi)) {
postproc = 1;
vp8_pp_cfg.post_proc_flag |= VP8_MFQE;
- } else if (arg_match(&arg, &pp_debug_info, argi)) {
- unsigned int level = arg_parse_uint(&arg);
-
- postproc = 1;
- vp8_pp_cfg.post_proc_flag &= ~0x7;
-
- if (level) vp8_pp_cfg.post_proc_flag |= level;
- } else if (arg_match(&arg, &pp_disp_ref_frame, argi)) {
- unsigned int flags = arg_parse_int(&arg);
- if (flags) {
- postproc = 1;
- vp8_dbg_color_ref_frame = flags;
- }
- } else if (arg_match(&arg, &pp_disp_mb_modes, argi)) {
- unsigned int flags = arg_parse_int(&arg);
- if (flags) {
- postproc = 1;
- vp8_dbg_color_mb_modes = flags;
- }
- } else if (arg_match(&arg, &pp_disp_b_modes, argi)) {
- unsigned int flags = arg_parse_int(&arg);
- if (flags) {
- postproc = 1;
- vp8_dbg_color_b_modes = flags;
- }
- } else if (arg_match(&arg, &pp_disp_mvs, argi)) {
- unsigned int flags = arg_parse_int(&arg);
- if (flags) {
- postproc = 1;
- vp8_dbg_display_mv = flags;
- }
} else if (arg_match(&arg, &error_concealment, argi)) {
ec_enabled = 1;
}
@@ -789,37 +735,6 @@ static int main_loop(int argc, const char **argv_) {
vpx_codec_error(&decoder));
return EXIT_FAILURE;
}
-
- if (vp8_dbg_color_ref_frame &&
- vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME,
- vp8_dbg_color_ref_frame)) {
- fprintf(stderr, "Failed to configure reference block visualizer: %s\n",
- vpx_codec_error(&decoder));
- return EXIT_FAILURE;
- }
-
- if (vp8_dbg_color_mb_modes &&
- vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES,
- vp8_dbg_color_mb_modes)) {
- fprintf(stderr, "Failed to configure macro block visualizer: %s\n",
- vpx_codec_error(&decoder));
- return EXIT_FAILURE;
- }
-
- if (vp8_dbg_color_b_modes &&
- vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES,
- vp8_dbg_color_b_modes)) {
- fprintf(stderr, "Failed to configure block visualizer: %s\n",
- vpx_codec_error(&decoder));
- return EXIT_FAILURE;
- }
-
- if (vp8_dbg_display_mv &&
- vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv)) {
- fprintf(stderr, "Failed to configure motion vector visualizer: %s\n",
- vpx_codec_error(&decoder));
- return EXIT_FAILURE;
- }
#endif
if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);