55 files changed, 1515 insertions, 2302 deletions
diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c
index 29b8be941..e69e2a9f9 100644
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c
@@ -92,8 +92,8 @@ int main(int argc, char **argv) {
   if (!(outfile = fopen(argv[2], "wb")))
     die("Failed to open %s for writing.", argv[2]);
 
-  n = strtol(argv[3], &nptr, 0);
-  m = strtol(nptr + 1, NULL, 0);
+  n = (int)strtol(argv[3], &nptr, 0);
+  m = (int)strtol(nptr + 1, NULL, 0);
   is_range = (*nptr == '-');
   if (!n || !m || (*nptr != '-' && *nptr != '/'))
     die("Couldn't parse pattern %s.\n", argv[3]);
diff --git a/examples/set_maps.c b/examples/set_maps.c
index d128e7d9a..c0c7d10e7 100644
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -174,8 +174,8 @@ int main(int argc, char **argv) {
   }
   assert(encoder != NULL);
   info.codec_fourcc = encoder->fourcc;
-  info.frame_width = strtol(argv[2], NULL, 0);
-  info.frame_height = strtol(argv[3], NULL, 0);
+  info.frame_width = (int)strtol(argv[2], NULL, 0);
+  info.frame_height = (int)strtol(argv[3], NULL, 0);
   info.time_base.numerator = 1;
   info.time_base.denominator = fps;
 
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 8632f179b..dde6344f8 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -175,14 +175,14 @@ int main(int argc, char **argv) {
   infile_arg = argv[4];
   outfile_arg = argv[5];
   keyframe_interval_arg = argv[6];
-  max_frames = strtol(argv[8], NULL, 0);
+  max_frames = (int)strtol(argv[8], NULL, 0);
 
   encoder = get_vpx_encoder_by_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
   info.codec_fourcc = encoder->fourcc;
-  info.frame_width = strtol(width_arg, NULL, 0);
-  info.frame_height = strtol(height_arg, NULL, 0);
+  info.frame_width = (int)strtol(width_arg, NULL, 0);
+  info.frame_height = (int)strtol(height_arg, NULL, 0);
   info.time_base.numerator = 1;
   info.time_base.denominator = fps;
 
@@ -196,7 +196,7 @@ int main(int argc, char **argv) {
     die("Failed to allocate image.");
   }
 
-  keyframe_interval = strtol(keyframe_interval_arg, NULL, 0);
+  keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0);
   if (keyframe_interval < 0) die("Invalid keyframe interval value.");
 
   printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
@@ -209,7 +209,7 @@ int main(int argc, char **argv) {
   cfg.g_timebase.num = info.time_base.numerator;
   cfg.g_timebase.den = info.time_base.denominator;
   cfg.rc_target_bitrate = bitrate;
-  cfg.g_error_resilient = strtol(argv[7], NULL, 0);
+  cfg.g_error_resilient = (vpx_codec_er_flags_t)strtoul(argv[7], NULL, 0);
 
   writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
   if (!writer) die("Failed to open %s for writing.", outfile_arg);
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index 4c130ec18..4e63a7a6c 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -209,13 +209,13 @@ int main(int argc, char **argv) {
 
   if (argc != 7) die("Invalid number of arguments.");
 
-  max_frames = strtol(argv[6], NULL, 0);
+  max_frames = (int)strtol(argv[6], NULL, 0);
 
   encoder = get_vpx_encoder_by_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
-  w = strtol(width_arg, NULL, 0);
-  h = strtol(height_arg, NULL, 0);
+  w = (int)strtol(width_arg, NULL, 0);
+  h = (int)strtol(height_arg, NULL, 0);
 
   if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
     die("Invalid frame size: %dx%d", w, h);
diff --git a/examples/vp8cx_set_ref.c b/examples/vp8cx_set_ref.c
index fc7bdab39..846477c61 100644
--- a/examples/vp8cx_set_ref.c
+++ b/examples/vp8cx_set_ref.c
@@ -122,8 +122,8 @@ int main(int argc, char **argv) {
   if (!update_frame_num) die("Couldn't parse frame number '%s'\n", argv[5]);
 
   info.codec_fourcc = encoder->fourcc;
-  info.frame_width = strtol(argv[1], NULL, 0);
-  info.frame_height = strtol(argv[2], NULL, 0);
+  info.frame_width = (int)strtol(argv[1], NULL, 0);
+  info.frame_height = (int)strtol(argv[2], NULL, 0);
   info.time_base.numerator = 1;
   info.time_base.denominator = fps;
 
diff --git a/examples/vp9_lossless_encoder.c b/examples/vp9_lossless_encoder.c
index 5802186bf..cb5ca6bfe 100644
--- a/examples/vp9_lossless_encoder.c
+++ b/examples/vp9_lossless_encoder.c
@@ -78,8 +78,8 @@ int main(int argc, char **argv) {
   if (!encoder) die("Unsupported codec.");
 
   info.codec_fourcc = encoder->fourcc;
-  info.frame_width = strtol(argv[1], NULL, 0);
-  info.frame_height = strtol(argv[2], NULL, 0);
+  info.frame_width = (int)strtol(argv[1], NULL, 0);
+  info.frame_height = (int)strtol(argv[2], NULL, 0);
   info.time_base.numerator = 1;
   info.time_base.denominator = fps;
 
diff --git a/examples/vp9cx_set_ref.c b/examples/vp9cx_set_ref.c
index e0bb795f7..798d7e3f2 100644
--- a/examples/vp9cx_set_ref.c
+++ b/examples/vp9cx_set_ref.c
@@ -335,8 +335,8 @@ int main(int argc, char **argv) {
   }
 
   info.codec_fourcc = encoder->fourcc;
-  info.frame_width = strtol(width_arg, NULL, 0);
-  info.frame_height = strtol(height_arg, NULL, 0);
+  info.frame_width = (int)strtol(width_arg, NULL, 0);
+  info.frame_height = (int)strtol(height_arg, NULL, 0);
   info.time_base.numerator = 1;
   info.time_base.denominator = fps;
 
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index 4a3387787..309a2fe2e 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -547,13 +547,13 @@ int main(int argc, char **argv) {
 
   printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
 
-  width = strtol(argv[4], NULL, 0);
-  height = strtol(argv[5], NULL, 0);
+  width = (unsigned int)strtoul(argv[4], NULL, 0);
+  height = (unsigned int)strtoul(argv[5], NULL, 0);
   if (width < 16 || width % 2 || height < 16 || height % 2) {
     die("Invalid resolution: %d x %d", width, height);
   }
 
-  layering_mode = strtol(argv[10], NULL, 0);
+  layering_mode = (int)strtol(argv[10], NULL, 0);
   if (layering_mode < 0 || layering_mode > 13) {
     die("Invalid layering mode (0..12) %s", argv[10]);
   }
@@ -609,17 +609,17 @@ int main(int argc, char **argv) {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Timebase format e.g. 30fps: numerator=1, demoninator = 30.
-  cfg.g_timebase.num = strtol(argv[6], NULL, 0);
-  cfg.g_timebase.den = strtol(argv[7], NULL, 0);
+  cfg.g_timebase.num = (int)strtol(argv[6], NULL, 0);
+  cfg.g_timebase.den = (int)strtol(argv[7], NULL, 0);
 
-  speed = strtol(argv[8], NULL, 0);
+  speed = (int)strtol(argv[8], NULL, 0);
   if (speed < 0) {
     die("Invalid speed setting: must be positive");
   }
 
   for (i = min_args_base;
        (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) {
-    rc.layer_target_bitrate[i - 11] = strtol(argv[i], NULL, 0);
+    rc.layer_target_bitrate[i - 11] = (int)strtol(argv[i], NULL, 0);
     if (strncmp(encoder->name, "vp8", 3) == 0)
       cfg.ts_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11];
     else if (strncmp(encoder->name, "vp9", 3) == 0)
@@ -627,7 +627,7 @@ int main(int argc, char **argv) {
   }
 
   // Real time parameters.
-  cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0);
+  cfg.rc_dropframe_thresh = (unsigned int)strtoul(argv[9], NULL, 0);
   cfg.rc_end_usage = VPX_CBR;
   cfg.rc_min_quantizer = 2;
   cfg.rc_max_quantizer = 56;
diff --git a/test/idct_test.cc b/test/idct_test.cc
index f54f2c005..700da77e3 100644
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -115,6 +115,10 @@ TEST_P(IDCTTest, TestWithData) {
 }
 
 INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_neon));
+#endif
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
                         ::testing::Values(vp8_short_idct4x4llm_mmx));
diff --git a/test/predict_test.cc b/test/predict_test.cc
new file mode 100644
index 000000000..f06e4dbb2
--- /dev/null
+++ b/test/predict_test.cc
@@ -0,0 +1,381 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+using libvpx_test::ACMRandom;
+using std::tr1::make_tuple;
+
+typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line,
+                            int xoffset, int yoffset, uint8_t *dst_ptr,
+                            int dst_pitch);
+
+typedef std::tr1::tuple<int, int, PredictFunc> PredictParam;
+
+class PredictTestBase : public ::testing::TestWithParam<PredictParam> {
+ public:
+  PredictTestBase()
+      : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)),
+        src_(NULL), padded_dst_(NULL), dst_(NULL), dst_c_(NULL) {}
+
+  virtual void SetUp() {
+    src_ = new uint8_t[kSrcSize];
+    ASSERT_TRUE(src_ != NULL);
+
+    // padded_dst_ provides a buffer of kBorderSize around the destination
+    // memory to facilitate detecting out of bounds writes.
+    dst_stride_ = kBorderSize + width_ + kBorderSize;
+    padded_dst_size_ = dst_stride_ * (kBorderSize + height_ + kBorderSize);
+    padded_dst_ =
+        reinterpret_cast<uint8_t *>(vpx_memalign(16, padded_dst_size_));
+    ASSERT_TRUE(padded_dst_ != NULL);
+    dst_ = padded_dst_ + (kBorderSize * dst_stride_) + kBorderSize;
+
+    dst_c_ = new uint8_t[16 * 16];
+    ASSERT_TRUE(dst_c_ != NULL);
+
+    memset(src_, 0, kSrcSize);
+    memset(padded_dst_, 128, padded_dst_size_);
+    memset(dst_c_, 0, 16 * 16);
+  }
+
+  virtual void TearDown() {
+    delete[] src_;
+    src_ = NULL;
+    vpx_free(padded_dst_);
+    padded_dst_ = NULL;
+    dst_ = NULL;
+    delete[] dst_c_;
+    dst_c_ = NULL;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Make reference arrays big enough for 16x16 functions. Six-tap filters need
+  // 5 extra pixels outside of the macroblock.
+  static const int kSrcStride = 21;
+  static const int kSrcSize = kSrcStride * kSrcStride;
+  static const int kBorderSize = 16;
+
+  int width_;
+  int height_;
+  PredictFunc predict_;
+  uint8_t *src_;
+  uint8_t *padded_dst_;
+  uint8_t *dst_;
+  int padded_dst_size_;
+  uint8_t *dst_c_;
+  int dst_stride_;
+
+  bool CompareBuffers(const uint8_t *a, int a_stride, const uint8_t *b,
+                      int b_stride) const {
+    for (int height = 0; height < height_; ++height) {
+      EXPECT_EQ(0, memcmp(a + height * a_stride, b + height * b_stride,
+                          sizeof(*a) * width_))
+          << "Row " << height << " does not match.";
+    }
+
+    return !HasFailure();
+  }
+
+  // Checks that every byte of 'a' ('a_size' bytes long) lying outside the
+  // 'b_width' x 'b_height' block at 'b' (row pitch 'b_stride') equals 'c'.
+  // Mismatches are reported via EXPECT_*; the result is !HasFailure().
+  bool CheckBorder(const uint8_t *a, int a_size, const uint8_t *b, int b_width,
+                   int b_height, int b_stride, uint8_t c) const {
+    const uint8_t *a_end = a + a_size;
+    // The block ends 'b_width' pixels into its last row, row (b_height - 1).
+    const int b_size = (b_stride * (b_height - 1)) + b_width;
+    const uint8_t *b_end = b + b_size;
+    const int left_border = (b_stride - b_width) / 2;
+    const int right_border = left_border + ((b_stride - b_width) % 2);
+
+    EXPECT_GE(b - left_border, a) << "'b' does not start within 'a'";
+    EXPECT_LE(b_end + right_border, a_end) << "'b' does not end within 'a'";
+
+    // Top border.
+    for (int pixel = 0; pixel < b - a - left_border; ++pixel) {
+      EXPECT_EQ(c, a[pixel]) << "Mismatch at " << pixel << " in top border.";
+    }
+
+    // Left border.
+    for (int height = 0; height < b_height; ++height) {
+      for (int width = left_border; width > 0; --width) {
+        EXPECT_EQ(c, b[height * b_stride - width])
+            << "Mismatch at row " << height << " column " << left_border - width
+            << " in left border.";
+      }
+    }
+
+    // Right border.
+    for (int height = 0; height < b_height; ++height) {
+      for (int width = b_width; width < b_width + right_border; ++width) {
+        EXPECT_EQ(c, b[height * b_stride + width])
+            << "Mismatch at row " << height << " column " << width - b_width
+            << " in right border.";
+      }
+    }
+
+    // Bottom border.
+    for (int pixel = static_cast<int>(b_end - a); pixel < a_size; ++pixel) {
+      EXPECT_EQ(c, a[pixel]) << "Mismatch at " << pixel << " in bottom border.";
+    }
+
+    return !HasFailure();
+  }
+
+  void TestWithRandomData(PredictFunc reference) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    // Run tests for almost all possible offsets.
+    for (int xoffset = 0; xoffset < 8; ++xoffset) {
+      for (int yoffset = 0; yoffset < 8; ++yoffset) {
+        if (xoffset == 0 && yoffset == 0) {
+          // This represents a copy which is not required to be handled by this
+          // module.
+          continue;
+        }
+
+        for (int i = 0; i < kSrcSize; ++i) {
+          src_[i] = rnd.Rand8();
+        }
+        reference(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset,
+                  dst_c_, 16);
+
+        ASM_REGISTER_STATE_CHECK(predict_(&src_[kSrcStride * 2 + 2], kSrcStride,
+                                          xoffset, yoffset, dst_, dst_stride_));
+
+        ASSERT_TRUE(CompareBuffers(dst_c_, 16, dst_, dst_stride_));
+        ASSERT_TRUE(CheckBorder(padded_dst_, padded_dst_size_, dst_, width_,
+                                height_, dst_stride_, 128));
+      }
+    }
+  }
+
+  void TestWithUnalignedDst(PredictFunc reference) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    // Only the 4x4 functions need to handle unaligned writes.
+    if (width_ == 4 && height_ == 4) {
+      for (int xoffset = 0; xoffset < 8; ++xoffset) {
+        for (int yoffset = 0; yoffset < 8; ++yoffset) {
+          if (xoffset == 0 && yoffset == 0) {
+            continue;
+          }
+          for (int i = 0; i < kSrcSize; ++i) {
+            src_[i] = rnd.Rand8();
+          }
+          reference(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset,
+                    dst_c_, 16);
+
+          for (int i = 1; i < 4; ++i) {
+            memset(padded_dst_, 128, padded_dst_size_);
+
+            ASM_REGISTER_STATE_CHECK(predict_(&src_[kSrcStride * 2 + 2],
+                                              kSrcStride, xoffset, yoffset,
+                                              dst_ + i, dst_stride_ + i));
+
+            ASSERT_TRUE(CompareBuffers(dst_c_, 16, dst_ + i, dst_stride_ + i));
+            ASSERT_TRUE(CheckBorder(padded_dst_, padded_dst_size_, dst_ + i,
+                                    width_, height_, dst_stride_ + i, 128));
+          }
+        }
+      }
+    }
+  }
+};
+
+class SixtapPredictTest : public PredictTestBase {};
+
+TEST_P(SixtapPredictTest, TestWithRandomData) {
+  TestWithRandomData(vp8_sixtap_predict16x16_c);
+}
+TEST_P(SixtapPredictTest, TestWithUnalignedDst) {
+  TestWithUnalignedDst(vp8_sixtap_predict16x16_c);
+}
+
+TEST_P(SixtapPredictTest, TestWithPresetData) {
+  // Test input
+  static const uint8_t kTestData[kSrcSize] = {
+    184, 4,   191, 82,  92,  41,  0,   1,   226, 236, 172, 20,  182, 42,  226,
+    177, 79,  94,  77,  179, 203, 206, 198, 22,  192, 19,  75,  17,  192, 44,
+    233, 120, 48,  168, 203, 141, 210, 203, 143, 180, 184, 59,  201, 110, 102,
+    171, 32,  182, 10,  109, 105, 213, 60,  47,  236, 253, 67,  55,  14,  3,
+    99,  247, 124, 148, 159, 71,  34,  114, 19,  177, 38,  203, 237, 239, 58,
+    83,  155, 91,  10,  166, 201, 115, 124, 5,   163, 104, 2,   231, 160, 16,
+    234, 4,   8,   103, 153, 167, 174, 187, 26,  193, 109, 64,  141, 90,  48,
+    200, 174, 204, 36,  184, 114, 237, 43,  238, 242, 207, 86,  245, 182, 247,
+    6,   161, 251, 14,  8,   148, 182, 182, 79,  208, 120, 188, 17,  6,   23,
+    65,  206, 197, 13,  242, 126, 128, 224, 170, 110, 211, 121, 197, 200, 47,
+    188, 207, 208, 184, 221, 216, 76,  148, 143, 156, 100, 8,   89,  117, 14,
+    112, 183, 221, 54,  197, 208, 180, 69,  176, 94,  180, 131, 215, 121, 76,
+    7,   54,  28,  216, 238, 249, 176, 58,  142, 64,  215, 242, 72,  49,  104,
+    87,  161, 32,  52,  216, 230, 4,   141, 44,  181, 235, 224, 57,  195, 89,
+    134, 203, 144, 162, 163, 126, 156, 84,  185, 42,  148, 145, 29,  221, 194,
+    134, 52,  100, 166, 105, 60,  140, 110, 201, 184, 35,  181, 153, 93,  121,
+    243, 227, 68,  131, 134, 232, 2,   35,  60,  187, 77,  209, 76,  106, 174,
+    15,  241, 227, 115, 151, 77,  175, 36,  187, 121, 221, 223, 47,  118, 61,
+    168, 105, 32,  237, 236, 167, 213, 238, 202, 17,  170, 24,  226, 247, 131,
+    145, 6,   116, 117, 121, 11,  194, 41,  48,  126, 162, 13,  93,  209, 131,
+    154, 122, 237, 187, 103, 217, 99,  60,  200, 45,  78,  115, 69,  49,  106,
+    200, 194, 112, 60,  56,  234, 72,  251, 19,  120, 121, 182, 134, 215, 135,
+    10,  114, 2,   247, 46,  105, 209, 145, 165, 153, 191, 243, 12,  5,   36,
+    119, 206, 231, 231, 11,  32,  209, 83,  27,  229, 204, 149, 155, 83,  109,
+    35,  93,  223, 37,  84,  14,  142, 37,  160, 52,  191, 96,  40,  204, 101,
+    77,  67,  52,  53,  43,  63,  85,  253, 147, 113, 226, 96,  6,   125, 179,
+    115, 161, 17,  83,  198, 101, 98,  85,  139, 3,   137, 75,  99,  178, 23,
+    201, 255, 91,  253, 52,  134, 60,  138, 131, 208, 251, 101, 48,  2,   227,
+    228, 118, 132, 245, 202, 75,  91,  44,  160, 231, 47,  41,  50,  147, 220,
+    74,  92,  219, 165, 89,  16
+  };
+
+  // Expected results for xoffset = 2 and yoffset = 2.
+  static const int kExpectedDstStride = 16;
+  static const uint8_t kExpectedDst[256] = {
+    117, 102, 74,  135, 42,  98,  175, 206, 70,  73,  222, 197, 50,  24,  39,
+    49,  38,  105, 90,  47,  169, 40,  171, 215, 200, 73,  109, 141, 53,  85,
+    177, 164, 79,  208, 124, 89,  212, 18,  81,  145, 151, 164, 217, 153, 91,
+    154, 102, 102, 159, 75,  164, 152, 136, 51,  213, 219, 186, 116, 193, 224,
+    186, 36,  231, 208, 84,  211, 155, 167, 35,  59,  42,  76,  216, 149, 73,
+    201, 78,  149, 184, 100, 96,  196, 189, 198, 188, 235, 195, 117, 129, 120,
+    129, 49,  25,  133, 113, 69,  221, 114, 70,  143, 99,  157, 108, 189, 140,
+    78,  6,   55,  65,  240, 255, 245, 184, 72,  90,  100, 116, 131, 39,  60,
+    234, 167, 33,  160, 88,  185, 200, 157, 159, 176, 127, 151, 138, 102, 168,
+    106, 170, 86,  82,  219, 189, 76,  33,  115, 197, 106, 96,  198, 136, 97,
+    141, 237, 151, 98,  137, 191, 185, 2,   57,  95,  142, 91,  255, 185, 97,
+    137, 76,  162, 94,  173, 131, 193, 161, 81,  106, 72,  135, 222, 234, 137,
+    66,  137, 106, 243, 210, 147, 95,  15,  137, 110, 85,  66,  16,  96,  167,
+    147, 150, 173, 203, 140, 118, 196, 84,  147, 160, 19,  95,  101, 123, 74,
+    132, 202, 82,  166, 12,  131, 166, 189, 170, 159, 85,  79,  66,  57,  152,
+    132, 203, 194, 0,   1,   56,  146, 180, 224, 156, 28,  83,  181, 79,  76,
+    80,  46,  160, 175, 59,  106, 43,  87,  75,  136, 85,  189, 46,  71,  200,
+    90
+  };
+
+  ASM_REGISTER_STATE_CHECK(
+      predict_(const_cast<uint8_t *>(kTestData) + kSrcStride * 2 + 2,
+               kSrcStride, 2, 2, dst_, dst_stride_));
+
+  ASSERT_TRUE(
+      CompareBuffers(kExpectedDst, kExpectedDstStride, dst_, dst_stride_));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    C, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_c),
+                      make_tuple(8, 4, &vp8_sixtap_predict8x4_c),
+                      make_tuple(4, 4, &vp8_sixtap_predict4x4_c)));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_neon),
+                      make_tuple(8, 4, &vp8_sixtap_predict8x4_neon),
+                      make_tuple(4, 4, &vp8_sixtap_predict4x4_neon)));
+#endif
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(
+    MMX, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx),
+                      make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx),
+                      make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
+#endif
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2),
+                      make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2)));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3),
+                      make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3),
+                      make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3)));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_msa),
+                      make_tuple(8, 4, &vp8_sixtap_predict8x4_msa),
+                      make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
+#endif
+
+class BilinearPredictTest : public PredictTestBase {};
+
+TEST_P(BilinearPredictTest, TestWithRandomData) {
+  TestWithRandomData(vp8_bilinear_predict16x16_c);
+}
+TEST_P(BilinearPredictTest, TestWithUnalignedDst) {
+  TestWithUnalignedDst(vp8_bilinear_predict16x16_c);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    C, BilinearPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_c),
+                      make_tuple(8, 8, &vp8_bilinear_predict8x8_c),
+                      make_tuple(8, 4, &vp8_bilinear_predict8x4_c),
+                      make_tuple(4, 4, &vp8_bilinear_predict4x4_c)));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, BilinearPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_neon),
+                      make_tuple(8, 8, &vp8_bilinear_predict8x8_neon),
+                      make_tuple(8, 4, &vp8_bilinear_predict8x4_neon),
+                      make_tuple(4, 4, &vp8_bilinear_predict4x4_neon)));
+#endif
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(
+    MMX, BilinearPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_mmx),
+                      make_tuple(8, 8, &vp8_bilinear_predict8x8_mmx),
+                      make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx),
+                      make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
+#endif
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, BilinearPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2),
+                      make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2)));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, BilinearPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_ssse3),
+                      make_tuple(8, 8, &vp8_bilinear_predict8x8_ssse3)));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, BilinearPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_msa),
+                      make_tuple(8, 8, &vp8_bilinear_predict8x8_msa),
+                      make_tuple(8, 4, &vp8_bilinear_predict8x4_msa),
+                      make_tuple(4, 4, &vp8_bilinear_predict4x4_msa)));
+#endif
+}  // namespace
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
deleted file mode 100644
index 31a604417..000000000
--- a/test/sixtap_predict_test.cc
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_mem/vpx_mem.h"
-
-namespace {
-
-typedef void (*SixtapPredictFunc)(uint8_t *src_ptr, int src_pixels_per_line,
-                                  int xoffset, int yoffset, uint8_t *dst_ptr,
-                                  int dst_pitch);
-
-typedef std::tr1::tuple<int, int, SixtapPredictFunc> SixtapPredictParam;
-
-class SixtapPredictTest : public ::testing::TestWithParam<SixtapPredictParam> {
- public:
-  static void SetUpTestCase() {
-    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(kDataAlignment, kSrcSize));
-    dst_ = reinterpret_cast<uint8_t *>(vpx_memalign(kDataAlignment, kDstSize));
-    dst_c_ =
-        reinterpret_cast<uint8_t *>(vpx_memalign(kDataAlignment, kDstSize));
-  }
-
-  static void TearDownTestCase() {
-    vpx_free(src_);
-    src_ = NULL;
-    vpx_free(dst_);
-    dst_ = NULL;
-    vpx_free(dst_c_);
-    dst_c_ = NULL;
-  }
-
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  // Make test arrays big enough for 16x16 functions. Six-tap filters
-  // need 5 extra pixels outside of the macroblock.
-  static const int kSrcStride = 21;
-  static const int kDstStride = 16;
-  static const int kDataAlignment = 16;
-  static const int kSrcSize = kSrcStride * kSrcStride + 1;
-  static const int kDstSize = kDstStride * kDstStride;
-
-  virtual void SetUp() {
-    width_ = GET_PARAM(0);
-    height_ = GET_PARAM(1);
-    sixtap_predict_ = GET_PARAM(2);
-    memset(src_, 0, kSrcSize);
-    memset(dst_, 0, kDstSize);
-    memset(dst_c_, 0, kDstSize);
-  }
-
-  int width_;
-  int height_;
-  SixtapPredictFunc sixtap_predict_;
-  // The src stores the macroblock we will filter on, and makes it 1 byte larger
-  // in order to test unaligned access. The result is stored in dst and dst_c(c
-  // reference code result).
-  static uint8_t *src_;
-  static uint8_t *dst_;
-  static uint8_t *dst_c_;
-};
-
-uint8_t *SixtapPredictTest::src_ = NULL;
-uint8_t *SixtapPredictTest::dst_ = NULL;
-uint8_t *SixtapPredictTest::dst_c_ = NULL;
-
-TEST_P(SixtapPredictTest, TestWithPresetData) {
-  // Test input
-  static const uint8_t test_data[kSrcSize] = {
-    216, 184, 4,   191, 82,  92,  41,  0,   1,   226, 236, 172, 20,  182, 42,
-    226, 177, 79,  94,  77,  179, 203, 206, 198, 22,  192, 19,  75,  17,  192,
-    44,  233, 120, 48,  168, 203, 141, 210, 203, 143, 180, 184, 59,  201, 110,
-    102, 171, 32,  182, 10,  109, 105, 213, 60,  47,  236, 253, 67,  55,  14,
-    3,   99,  247, 124, 148, 159, 71,  34,  114, 19,  177, 38,  203, 237, 239,
-    58,  83,  155, 91,  10,  166, 201, 115, 124, 5,   163, 104, 2,   231, 160,
-    16,  234, 4,   8,   103, 153, 167, 174, 187, 26,  193, 109, 64,  141, 90,
-    48,  200, 174, 204, 36,  184, 114, 237, 43,  238, 242, 207, 86,  245, 182,
-    247, 6,   161, 251, 14,  8,   148, 182, 182, 79,  208, 120, 188, 17,  6,
-    23,  65,  206, 197, 13,  242, 126, 128, 224, 170, 110, 211, 121, 197, 200,
-    47,  188, 207, 208, 184, 221, 216, 76,  148, 143, 156, 100, 8,   89,  117,
-    14,  112, 183, 221, 54,  197, 208, 180, 69,  176, 94,  180, 131, 215, 121,
-    76,  7,   54,  28,  216, 238, 249, 176, 58,  142, 64,  215, 242, 72,  49,
-    104, 87,  161, 32,  52,  216, 230, 4,   141, 44,  181, 235, 224, 57,  195,
-    89,  134, 203, 144, 162, 163, 126, 156, 84,  185, 42,  148, 145, 29,  221,
-    194, 134, 52,  100, 166, 105, 60,  140, 110, 201, 184, 35,  181, 153, 93,
-    121, 243, 227, 68,  131, 134, 232, 2,   35,  60,  187, 77,  209, 76,  106,
-    174, 15,  241, 227, 115, 151, 77,  175, 36,  187, 121, 221, 223, 47,  118,
-    61,  168, 105, 32,  237, 236, 167, 213, 238, 202, 17,  170, 24,  226, 247,
-    131, 145, 6,   116, 117, 121, 11,  194, 41,  48,  126, 162, 13,  93,  209,
-    131, 154, 122, 237, 187, 103, 217, 99,  60,  200, 45,  78,  115, 69,  49,
-    106, 200, 194, 112, 60,  56,  234, 72,  251, 19,  120, 121, 182, 134, 215,
-    135, 10,  114, 2,   247, 46,  105, 209, 145, 165, 153, 191, 243, 12,  5,
-    36,  119, 206, 231, 231, 11,  32,  209, 83,  27,  229, 204, 149, 155, 83,
-    109, 35,  93,  223, 37,  84,  14,  142, 37,  160, 52,  191, 96,  40,  204,
-    101, 77,  67,  52,  53,  43,  63,  85,  253, 147, 113, 226, 96,  6,   125,
-    179, 115, 161, 17,  83,  198, 101, 98,  85,  139, 3,   137, 75,  99,  178,
-    23,  201, 255, 91,  253, 52,  134, 60,  138, 131, 208, 251, 101, 48,  2,
-    227, 228, 118, 132, 245, 202, 75,  91,  44,  160, 231, 47,  41,  50,  147,
-    220, 74,  92,  219, 165, 89,  16
-  };
-
-  // Expected result
-  static const uint8_t expected_dst[kDstSize] = {
-    117, 102, 74,  135, 42,  98,  175, 206, 70,  73,  222, 197, 50,  24,  39,
-    49,  38,  105, 90,  47,  169, 40,  171, 215, 200, 73,  109, 141, 53,  85,
-    177, 164, 79,  208, 124, 89,  212, 18,  81,  145, 151, 164, 217, 153, 91,
-    154, 102, 102, 159, 75,  164, 152, 136, 51,  213, 219, 186, 116, 193, 224,
-    186, 36,  231, 208, 84,  211, 155, 167, 35,  59,  42,  76,  216, 149, 73,
-    201, 78,  149, 184, 100, 96,  196, 189, 198, 188, 235, 195, 117, 129, 120,
-    129, 49,  25,  133, 113, 69,  221, 114, 70,  143, 99,  157, 108, 189, 140,
-    78,  6,   55,  65,  240, 255, 245, 184, 72,  90,  100, 116, 131, 39,  60,
-    234, 167, 33,  160, 88,  185, 200, 157, 159, 176, 127, 151, 138, 102, 168,
-    106, 170, 86,  82,  219, 189, 76,  33,  115, 197, 106, 96,  198, 136, 97,
-    141, 237, 151, 98,  137, 191, 185, 2,   57,  95,  142, 91,  255, 185, 97,
-    137, 76,  162, 94,  173, 131, 193, 161, 81,  106, 72,  135, 222, 234, 137,
-    66,  137, 106, 243, 210, 147, 95,  15,  137, 110, 85,  66,  16,  96,  167,
-    147, 150, 173, 203, 140, 118, 196, 84,  147, 160, 19,  95,  101, 123, 74,
-    132, 202, 82,  166, 12,  131, 166, 189, 170, 159, 85,  79,  66,  57,  152,
-    132, 203, 194, 0,   1,   56,  146, 180, 224, 156, 28,  83,  181, 79,  76,
-    80,  46,  160, 175, 59,  106, 43,  87,  75,  136, 85,  189, 46,  71,  200,
-    90
-  };
-
-  uint8_t *src = const_cast<uint8_t *>(test_data);
-
-  ASM_REGISTER_STATE_CHECK(sixtap_predict_(&src[kSrcStride * 2 + 2 + 1],
-                                           kSrcStride, 2, 2, dst_, kDstStride));
-
-  for (int i = 0; i < height_; ++i) {
-    for (int j = 0; j < width_; ++j)
-      ASSERT_EQ(expected_dst[i * kDstStride + j], dst_[i * kDstStride + j])
-          << "i==" << (i * width_ + j);
-  }
-}
-
-using libvpx_test::ACMRandom;
-
-TEST_P(SixtapPredictTest, TestWithRandomData) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  for (int i = 0; i < kSrcSize; ++i) src_[i] = rnd.Rand8();
-
-  // Run tests for all possible offsets.
-  for (int xoffset = 0; xoffset < 8; ++xoffset) {
-    for (int yoffset = 0; yoffset < 8; ++yoffset) {
-      // Call c reference function.
-      // Move start point to next pixel to test if the function reads
-      // unaligned data correctly.
-      vp8_sixtap_predict16x16_c(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
-                                xoffset, yoffset, dst_c_, kDstStride);
-
-      // Run test.
-      ASM_REGISTER_STATE_CHECK(sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1],
-                                               kSrcStride, xoffset, yoffset,
-                                               dst_, kDstStride));
-
-      for (int i = 0; i < height_; ++i) {
-        for (int j = 0; j < width_; ++j)
-          ASSERT_EQ(dst_c_[i * kDstStride + j], dst_[i * kDstStride + j])
-              << "i==" << (i * width_ + j);
-      }
-    }
-  }
-}
-
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(
-    C, SixtapPredictTest,
-    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c),
-                      make_tuple(8, 8, &vp8_sixtap_predict8x8_c),
-                      make_tuple(8, 4, &vp8_sixtap_predict8x4_c),
-                      make_tuple(4, 4, &vp8_sixtap_predict4x4_c)));
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
-    NEON, SixtapPredictTest,
-    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),
-                      make_tuple(8, 8, &vp8_sixtap_predict8x8_neon),
-                      make_tuple(8, 4, &vp8_sixtap_predict8x4_neon)));
-#endif
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(
-    MMX, SixtapPredictTest,
-    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx),
-                      make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx),
-                      make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx),
-                      make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
-#endif
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, SixtapPredictTest,
-    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2),
-                      make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2),
-                      make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2)));
-#endif
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, SixtapPredictTest,
-    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3),
-                      make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3),
-                      make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3),
-                      make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3)));
-#endif
-#if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(
-    MSA, SixtapPredictTest,
-    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa),
-                      make_tuple(8, 8, &vp8_sixtap_predict8x8_msa),
-                      make_tuple(8, 4, &vp8_sixtap_predict8x4_msa),
-                      make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
-#endif
-}  // namespace
diff --git a/test/test.mk b/test/test.mk
index aad264531..60218a780 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -119,7 +119,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
-LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
+LIBVPX_TEST_SRCS-yes                   += predict_test.cc
 LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.cc
 
 ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes)
diff --git a/vp8/common/arm/neon/dequant_idct_neon.c b/vp8/common/arm/neon/dequant_idct_neon.c
index ff5981eaa..753051c77 100644
--- a/vp8/common/arm/neon/dequant_idct_neon.c
+++ b/vp8/common/arm/neon/dequant_idct_neon.c
@@ -11,7 +11,11 @@
 #include <arm_neon.h>
 
 static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
 
 void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
                                int stride) {
@@ -60,10 +64,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
   q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
   q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
 
-  q3 = vshrq_n_s16(q3, 1);
   q4 = vshrq_n_s16(q4, 1);
 
-  q3 = vqaddq_s16(q3, q2);
   q4 = vqaddq_s16(q4, q2);
 
   d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
@@ -90,10 +92,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
   d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
   d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
 
-  q3 = vshrq_n_s16(q3, 1);
   q4 = vshrq_n_s16(q4, 1);
 
-  q3 = vqaddq_s16(q3, q2);
   q4 = vqaddq_s16(q4, q2);
 
   d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.c b/vp8/common/arm/neon/shortidct4x4llm_neon.c
index a36c0c1ca..1adb1c317 100644
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.c
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -11,7 +11,11 @@
 #include <arm_neon.h>
 
 static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
 
 void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
                                int pred_stride, unsigned char *dst_ptr,
@@ -40,10 +44,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
   d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
   d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
 
-  q3s16 = vshrq_n_s16(q3s16, 1);
   q4s16 = vshrq_n_s16(q4s16, 1);
 
-  q3s16 = vqaddq_s16(q3s16, q2s16);
   q4s16 = vqaddq_s16(q4s16, q2s16);
 
   d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
@@ -71,10 +73,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
   d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
   d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
 
-  q3s16 = vshrq_n_s16(q3s16, 1);
   q4s16 = vshrq_n_s16(q4s16, 1);
 
-  q3s16 = vqaddq_s16(q3s16, q2s16);
   q4s16 = vqaddq_s16(q4s16, q2s16);
 
   d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c
index 622baa3c5..fbb552ebe 100644
--- a/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
@@ -9,6 +9,8 @@
  */
 
 #include <arm_neon.h>
+#include <string.h>
+#include "./vpx_config.h"
 #include "vpx_ports/mem.h"
 
 static const int8_t vp8_sub_pel_filters[8][8] = {
@@ -22,6 +24,396 @@ static const int8_t vp8_sub_pel_filters[8][8] = {
   { 0, -1, 12, 123, -6, 0, 0, 0 },
 };
 
+// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
+// Apply abs() to all the values. Elements 0, 2, 3, and 5 are never negative.
+// Elements 1 and 4 are either 0 or negative. The code accounts for this with
+// multiply/accumulates which either add or subtract as needed. The other
+// functions will be updated to use this table later.
+// It is also expanded to 8 elements to allow loading into 64 bit neon
+// registers.
+static const uint8_t abs_filters[8][8] = {
+  { 0, 0, 128, 0, 0, 0, 0, 0 },   { 0, 6, 123, 12, 1, 0, 0, 0 },
+  { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
+  { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
+  { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
+};
+
+static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
+  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
+}
+
+static INLINE void store4x4(unsigned char *dst, int dst_stride,
+                            const uint8x8_t a0, const uint8x8_t a1) {
+  if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
+    dst += dst_stride;
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
+    dst += dst_stride;
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
+    dst += dst_stride;
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
+  } else {
+    // Store to the aligned local buffer and memcpy instead of vget_lane_u8
+    // which is really really slow.
+    uint32_t output_buffer[4];
+    vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
+    vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
+    vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
+    vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);
+
+    memcpy(dst, output_buffer, 4);
+    dst += dst_stride;
+    memcpy(dst, output_buffer + 1, 4);
+    dst += dst_stride;
+    memcpy(dst, output_buffer + 2, 4);
+    dst += dst_stride;
+    memcpy(dst, output_buffer + 3, 4);
+  }
+}
+
+static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
+                                         const uint8x8_t filter, uint16x8_t *c,
+                                         uint16x8_t *d) {
+  const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
+                                       vreinterpret_u32_u8(vget_high_u8(a)));
+  const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
+                                       vreinterpret_u32_u8(vget_high_u8(b)));
+  *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
+  *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
+}
+
+static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
+                                         const uint8x8_t filter, uint16x8_t *c,
+                                         uint16x8_t *d) {
+  const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
+                                       vreinterpret_u32_u8(vget_high_u8(a)));
+  const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
+                                       vreinterpret_u32_u8(vget_high_u8(b)));
+  *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
+  *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
+}
+
+static INLINE void yonly4x4(const unsigned char *src, int src_stride,
+                            int filter_offset, unsigned char *dst,
+                            int dst_stride) {
+  uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
+  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
+  uint16x8_t c0, c1, c2, c3;
+  int16x8_t d0, d1;
+  uint8x8_t e0, e1;
+
+  const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
+  const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
+  const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
+  const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
+  const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
+  const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
+  const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
+
+  src -= src_stride * 2;
+  // Shift the even rows to allow using 'vext' to combine the vectors. armv8
+  // has vcopy_lane which would be interesting. This started as just a
+  // horrible workaround for clang adding alignment hints to 32bit loads:
+  // https://llvm.org/bugs/show_bug.cgi?id=24421
+  // But it turns out it is almost identical to casting the loads.
+  a0 = load_and_shift(src);
+  src += src_stride;
+  a1 = vld1_u8(src);
+  src += src_stride;
+  a2 = load_and_shift(src);
+  src += src_stride;
+  a3 = vld1_u8(src);
+  src += src_stride;
+  a4 = load_and_shift(src);
+  src += src_stride;
+  a5 = vld1_u8(src);
+  src += src_stride;
+  a6 = load_and_shift(src);
+  src += src_stride;
+  a7 = vld1_u8(src);
+  src += src_stride;
+  a8 = vld1_u8(src);
+
+  // Combine the rows so we can operate on 8 at a time.
+  b0 = vext_u8(a0, a1, 4);
+  b2 = vext_u8(a2, a3, 4);
+  b4 = vext_u8(a4, a5, 4);
+  b6 = vext_u8(a6, a7, 4);
+  b8 = a8;
+
+  // To keep with the 8-at-a-time theme, combine *alternate* rows. This
+  // allows combining the odd rows with the even.
+  b1 = vext_u8(b0, b2, 4);
+  b3 = vext_u8(b2, b4, 4);
+  b5 = vext_u8(b4, b6, 4);
+  b7 = vext_u8(b6, b8, 4);
+
+  // Multiply and expand to 16 bits.
+  c0 = vmull_u8(b0, filter0);
+  c1 = vmull_u8(b2, filter0);
+  c2 = vmull_u8(b5, filter5);
+  c3 = vmull_u8(b7, filter5);
+
+  // Multiply, subtract and accumulate for filters 1 and 4 (the negative
+  // ones).
+  c0 = vmlsl_u8(c0, b4, filter4);
+  c1 = vmlsl_u8(c1, b6, filter4);
+  c2 = vmlsl_u8(c2, b1, filter1);
+  c3 = vmlsl_u8(c3, b3, filter1);
+
+  // Add more positive ones. vmlal should really return a signed type.
+  // It's doing signed math internally, as evidenced by the fact we can do
+  // subtractions followed by more additions. Ideally we could use
+  // vqmlal/sl but that instruction doesn't exist. Might be able to
+  // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
+  c0 = vmlal_u8(c0, b2, filter2);
+  c1 = vmlal_u8(c1, b4, filter2);
+  c2 = vmlal_u8(c2, b3, filter3);
+  c3 = vmlal_u8(c3, b5, filter3);
+
+  // Use signed saturation math because vmlsl may have left some negative
+  // numbers in there.
+  d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
+  d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
+
+  // Use signed again because numbers like -200 need to be saturated to 0.
+  e0 = vqrshrun_n_s16(d0, 7);
+  e1 = vqrshrun_n_s16(d1, 7);
+
+  store4x4(dst, dst_stride, e0, e1);
+}
+
+void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
+                                int xoffset, int yoffset,
+                                unsigned char *dst_ptr, int dst_pitch) {
+  uint8x16_t s0, s1, s2, s3, s4;
+  uint64x2_t s01, s23;
+  // Variables to hold src[] elements for the given filter[]
+  uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
+  uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
+  uint8x16_t s01_f0, s23_f0;
+  uint64x2_t s01_f3, s23_f3;
+  uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
+  // Accumulator variables.
+  uint16x8_t d0123, d4567, d89;
+  uint16x8_t d0123_a, d4567_a, d89_a;
+  int16x8_t e0123, e4567, e89;
+  // Second pass intermediates.
+  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
+  uint16x8_t c0, c1, c2, c3;
+  int16x8_t d0, d1;
+  uint8x8_t e0, e1;
+  uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
+
+  if (xoffset == 0) {  // Second pass only.
+    yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
+    return;
+  }
+
+  if (yoffset == 0) {  // First pass only.
+    src_ptr -= 2;
+  } else {  // Add context for the second pass. 2 extra lines on top.
+    src_ptr -= 2 + (src_pixels_per_line * 2);
+  }
+
+  filter = vld1_u8(abs_filters[xoffset]);
+  filter0 = vdup_lane_u8(filter, 0);
+  filter1 = vdup_lane_u8(filter, 1);
+  filter2 = vdup_lane_u8(filter, 2);
+  filter3 = vdup_lane_u8(filter, 3);
+  filter4 = vdup_lane_u8(filter, 4);
+  filter5 = vdup_lane_u8(filter, 5);
+
+  // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
+  // garbage. So much effort for that last single bit.
+  // The low values of each pair are for filter0.
+  s0 = vld1q_u8(src_ptr);
+  src_ptr += src_pixels_per_line;
+  s1 = vld1q_u8(src_ptr);
+  src_ptr += src_pixels_per_line;
+  s2 = vld1q_u8(src_ptr);
+  src_ptr += src_pixels_per_line;
+  s3 = vld1q_u8(src_ptr);
+  src_ptr += src_pixels_per_line;
+
+  // Shift to extract values for filter[5]
+  // If src[] is 0, this puts:
+  // 3 4 5 6 7 8 9 10 in s0_f5
+  // Can't use vshr.u64 because it crosses the double word boundary.
+  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
+  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
+  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
+  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
+
+  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
+  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
+
+  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
+  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
+  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
+  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
+
+  // Keep original src data as 64 bits to simplify shifting and extracting.
+  s01 = vreinterpretq_u64_u8(s01_f0);
+  s23 = vreinterpretq_u64_u8(s23_f0);
+
+  // -2 -1 0 1 * filter0
+  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
+
+  // Shift over one to use -1, 0, 1, 2 for filter1
+  // -1 0 1 2 * filter1
+  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
+                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
+                        &d0123, &d4567);
+
+  // 2 3 4 5 * filter4
+  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
+                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
+                        &d0123, &d4567);
+
+  // 0 1 2 3 * filter2
+  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
+                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
+                        &d0123, &d4567);
+
+  // 1 2 3 4 * filter3
+  s01_f3 = vshrq_n_u64(s01, 24);
+  s23_f3 = vshrq_n_u64(s23, 24);
+  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
+                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
+  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
+                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
+  // Accumulate into different registers so it can use saturated addition.
+  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
+  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
+
+  e0123 =
+      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
+  e4567 =
+      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
+
+  // Shift and narrow.
+  b0 = vqrshrun_n_s16(e0123, 7);
+  b2 = vqrshrun_n_s16(e4567, 7);
+
+  if (yoffset == 0) {  // firstpass_filter4x4_only
+    store4x4(dst_ptr, dst_pitch, b0, b2);
+    return;
+  }
+
+  // Load additional context when doing both filters.
+  s0 = vld1q_u8(src_ptr);
+  src_ptr += src_pixels_per_line;
+  s1 = vld1q_u8(src_ptr);
+  src_ptr += src_pixels_per_line;
+  s2 = vld1q_u8(src_ptr);
+  src_ptr += src_pixels_per_line;
+  s3 = vld1q_u8(src_ptr);
+  src_ptr += src_pixels_per_line;
+  s4 = vld1q_u8(src_ptr);
+
+  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
+  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
+  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
+  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
+  s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
+
+  // -2 -1 0 1 * filter0
+  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
+  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
+
+  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
+  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
+  // But this time instead of 16 pixels to filter, there are 20. So an extra
+  // run with a doubleword register.
+  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
+  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
+  d89 = vmull_u8(s4_f5, filter5);
+
+  // Save a copy as u64 for shifting.
+  s01 = vreinterpretq_u64_u8(s01_f0);
+  s23 = vreinterpretq_u64_u8(s23_f0);
+
+  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
+  d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
+
+  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
+                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
+                        &d0123, &d4567);
+  s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
+  d89 = vmlsl_u8(d89, s4_f1, filter1);
+
+  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
+                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
+                        &d0123, &d4567);
+  s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
+  d89 = vmlsl_u8(d89, s4_f4, filter4);
+
+  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
+                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
+                        &d0123, &d4567);
+  s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
+  d89 = vmlal_u8(d89, s4_f2, filter2);
+
+  s01_f3 = vshrq_n_u64(s01, 24);
+  s23_f3 = vshrq_n_u64(s23, 24);
+  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
+                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
+  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
+                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
+  s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
+  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
+  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
+  d89_a = vmull_u8(s4_f3, filter3);
+
+  e0123 =
+      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
+  e4567 =
+      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
+  e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
+
+  b4 = vqrshrun_n_s16(e0123, 7);
+  b6 = vqrshrun_n_s16(e4567, 7);
+  b8 = vqrshrun_n_s16(e89, 7);
+
+  // Second pass: 4x4
+  filter = vld1_u8(abs_filters[yoffset]);
+  filter0 = vdup_lane_u8(filter, 0);
+  filter1 = vdup_lane_u8(filter, 1);
+  filter2 = vdup_lane_u8(filter, 2);
+  filter3 = vdup_lane_u8(filter, 3);
+  filter4 = vdup_lane_u8(filter, 4);
+  filter5 = vdup_lane_u8(filter, 5);
+
+  b1 = vext_u8(b0, b2, 4);
+  b3 = vext_u8(b2, b4, 4);
+  b5 = vext_u8(b4, b6, 4);
+  b7 = vext_u8(b6, b8, 4);
+
+  c0 = vmull_u8(b0, filter0);
+  c1 = vmull_u8(b2, filter0);
+  c2 = vmull_u8(b5, filter5);
+  c3 = vmull_u8(b7, filter5);
+
+  c0 = vmlsl_u8(c0, b4, filter4);
+  c1 = vmlsl_u8(c1, b6, filter4);
+  c2 = vmlsl_u8(c2, b1, filter1);
+  c3 = vmlsl_u8(c3, b3, filter1);
+
+  c0 = vmlal_u8(c0, b2, filter2);
+  c1 = vmlal_u8(c1, b4, filter2);
+  c2 = vmlal_u8(c2, b3, filter3);
+  c3 = vmlal_u8(c3, b5, filter3);
+
+  d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
+  d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
+
+  e0 = vqrshrun_n_s16(d0, 7);
+  e1 = vqrshrun_n_s16(d1, 7);
+
+  store4x4(dst_ptr, dst_pitch, e0, e1);
+}
+
 void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                 int xoffset, int yoffset,
                                 unsigned char *dst_ptr, int dst_pitch) {
diff --git a/vp8/common/filter.c b/vp8/common/filter.c
index a312efb6c..267498335 100644
--- a/vp8/common/filter.c
+++ b/vp8/common/filter.c
@@ -8,8 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "filter.h"
+#include <assert.h>
 #include "./vp8_rtcd.h"
+#include "vp8/common/filter.h"
 
 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = {
   { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
@@ -324,27 +325,11 @@ void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line,
   const short *HFilter;
   const short *VFilter;
 
+  // Zero offsets are a plain copy, which optimizations need not handle.
+  assert((xoffset | yoffset) != 0);
+
   HFilter = vp8_bilinear_filters[xoffset];
   VFilter = vp8_bilinear_filters[yoffset];
-#if 0
-    {
-        int i;
-        unsigned char temp1[16];
-        unsigned char temp2[16];
-
-        bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-        filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
-
-        for (i = 0; i < 16; ++i)
-        {
-            if (temp1[i] != temp2[i])
-            {
-                bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-                filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
-            }
-        }
-    }
-#endif
   filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
                      VFilter, 4, 4);
 }
@@ -355,6 +340,8 @@ void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line,
   const short *HFilter;
   const short *VFilter;
 
+  assert((xoffset | yoffset) != 0);
+
   HFilter = vp8_bilinear_filters[xoffset];
   VFilter = vp8_bilinear_filters[yoffset];
 
@@ -368,6 +355,8 @@ void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line,
   const short *HFilter;
   const short *VFilter;
 
+  assert((xoffset | yoffset) != 0);
+
   HFilter = vp8_bilinear_filters[xoffset];
   VFilter = vp8_bilinear_filters[yoffset];
 
@@ -382,6 +371,8 @@ void vp8_bilinear_predict16x16_c(unsigned char *src_ptr,
   const short *HFilter;
   const short *VFilter;
 
+  assert((xoffset | yoffset) != 0);
+
   HFilter = vp8_bilinear_filters[xoffset];
   VFilter = vp8_bilinear_filters[yoffset];
 
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index eb68246b2..43e3c29b5 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -251,7 +251,7 @@ int vp8_receive_raw_frame(struct VP8_COMP *comp, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time_stamp);
 int vp8_get_compressed_data(struct VP8_COMP *comp, unsigned int *frame_flags,
-                            unsigned long *size, unsigned char *dest,
+                            size_t *size, unsigned char *dest,
                             unsigned char *dest_end, int64_t *time_stamp,
                             int64_t *time_end, int flush);
 int vp8_get_preview_raw_frame(struct VP8_COMP *comp, YV12_BUFFER_CONFIG *dest,
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 732656f2f..9a12c7fb6 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -160,10 +160,6 @@ typedef struct VP8Common {
 #ifdef PACKET_TESTING
   VP8_HEADER oh;
 #endif
-#if CONFIG_POSTPROC_VISUALIZER
-  double bitrate;
-  double framerate;
-#endif
 
 #if CONFIG_MULTITHREAD
   int processor_core_count;
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 1c4e042c8..8b8c1701a 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -37,46 +37,6 @@
                   (0.071 * (float)(t & 0xff)) + 128)
 /* clang-format on */
 
-/* global constants */
-#if CONFIG_POSTPROC_VISUALIZER
-static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
-  { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
-  { RGB_TO_YUV(0x00FF00) }, /* Green */
-  { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */
-  { RGB_TO_YUV(0x228B22) }, /* ForestGreen */
-  { RGB_TO_YUV(0x006400) }, /* DarkGreen */
-  { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */
-  { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */
-  { RGB_TO_YUV(0x00008B) }, /* Dark blue */
-  { RGB_TO_YUV(0x551A8B) }, /* Purple */
-  { RGB_TO_YUV(0xFF0000) }  /* Red */
-};
-
-static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
-  { RGB_TO_YUV(0x6633ff) }, /* Purple */
-  { RGB_TO_YUV(0xcc33ff) }, /* Magenta */
-  { RGB_TO_YUV(0xff33cc) }, /* Pink */
-  { RGB_TO_YUV(0xff3366) }, /* Coral */
-  { RGB_TO_YUV(0x3366ff) }, /* Blue */
-  { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
-  { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
-  { RGB_TO_YUV(0xff6633) }, /* Orange */
-  { RGB_TO_YUV(0x33ccff) }, /* Light Blue */
-  { RGB_TO_YUV(0x8ab800) }, /* Green */
-  { RGB_TO_YUV(0xffcc33) }, /* Light Orange */
-  { RGB_TO_YUV(0x33ffcc) }, /* Aqua */
-  { RGB_TO_YUV(0x66ff33) }, /* Light Green */
-  { RGB_TO_YUV(0xccff33) }, /* Yellow */
-};
-
-static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = {
-  { RGB_TO_YUV(0x00ff00) }, /* Blue */
-  { RGB_TO_YUV(0x0000ff) }, /* Green */
-  { RGB_TO_YUV(0xffff00) }, /* Yellow */
-  { RGB_TO_YUV(0xff0000) }, /* Red */
-};
-#endif
-
 extern void vp8_blit_text(const char *msg, unsigned char *address,
                           const int pitch);
 extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
@@ -308,43 +268,6 @@ void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
   }
 }
 
-#if CONFIG_POSTPROC_VISUALIZER
-static void constrain_line(int x_0, int *x_1, int y_0, int *y_1, int width,
-                           int height) {
-  int dx;
-  int dy;
-
-  if (*x_1 > width) {
-    dx = *x_1 - x_0;
-    dy = *y_1 - y_0;
-
-    *x_1 = width;
-    if (dx) *y_1 = ((width - x_0) * dy) / dx + y_0;
-  }
-  if (*x_1 < 0) {
-    dx = *x_1 - x_0;
-    dy = *y_1 - y_0;
-
-    *x_1 = 0;
-    if (dx) *y_1 = ((0 - x_0) * dy) / dx + y_0;
-  }
-  if (*y_1 > height) {
-    dx = *x_1 - x_0;
-    dy = *y_1 - y_0;
-
-    *y_1 = height;
-    if (dy) *x_1 = ((height - y_0) * dx) / dy + x_0;
-  }
-  if (*y_1 < 0) {
-    dx = *x_1 - x_0;
-    dy = *y_1 - y_0;
-
-    *y_1 = 0;
-    if (dy) *x_1 = ((0 - y_0) * dx) / dy + x_0;
-  }
-}
-#endif  // CONFIG_POSTPROC_VISUALIZER
-
 #if CONFIG_POSTPROC
 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
                         vp8_ppflags_t *ppflags) {
@@ -455,331 +378,6 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
         oci->post_proc_buffer.y_stride);
   }
 
-#if CONFIG_POSTPROC_VISUALIZER
-  if (flags & VP8D_DEBUG_TXT_FRAME_INFO) {
-    char message[512];
-    sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
-            (oci->frame_type == KEY_FRAME), oci->refresh_golden_frame,
-            oci->base_qindex, oci->filter_level, flags, oci->mb_cols,
-            oci->mb_rows);
-    vp8_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
-  }
-
-  if (flags & VP8D_DEBUG_TXT_MBLK_MODES) {
-    int i, j;
-    unsigned char *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width >> 4;
-    int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
-
-    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-    /* vp8_filter each macro block */
-    for (i = 0; i < mb_rows; ++i) {
-      for (j = 0; j < mb_cols; ++j) {
-        char zz[4];
-
-        sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
-
-        vp8_blit_text(zz, y_ptr, post->y_stride);
-        mb_index++;
-        y_ptr += 16;
-      }
-
-      mb_index++; /* border */
-      y_ptr += post->y_stride * 16 - post->y_width;
-    }
-  }
-
-  if (flags & VP8D_DEBUG_TXT_DC_DIFF) {
-    int i, j;
-    unsigned char *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width >> 4;
-    int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
-
-    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-    /* vp8_filter each macro block */
-    for (i = 0; i < mb_rows; ++i) {
-      for (j = 0; j < mb_cols; ++j) {
-        char zz[4];
-        int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
-                        mi[mb_index].mbmi.mode != SPLITMV &&
-                        mi[mb_index].mbmi.mb_skip_coeff);
-
-        if (oci->frame_type == KEY_FRAME)
-          sprintf(zz, "a");
-        else
-          sprintf(zz, "%c", dc_diff + '0');
-
-        vp8_blit_text(zz, y_ptr, post->y_stride);
-        mb_index++;
-        y_ptr += 16;
-      }
-
-      mb_index++; /* border */
-      y_ptr += post->y_stride * 16 - post->y_width;
-    }
-  }
-
-  if (flags & VP8D_DEBUG_TXT_RATE_INFO) {
-    char message[512];
-    sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate,
-            oci->framerate);
-    vp8_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
-  }
-
-  /* Draw motion vectors */
-  if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int width = post->y_width;
-    int height = post->y_height;
-    unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
-    int x0, y0;
-
-    for (y0 = 0; y0 < height; y0 += 16) {
-      for (x0 = 0; x0 < width; x0 += 16) {
-        int x1, y1;
-
-        if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
-          mi++;
-          continue;
-        }
-
-        if (mi->mbmi.mode == SPLITMV) {
-          switch (mi->mbmi.partitioning) {
-            case 0: /* mv_top_bottom */
-            {
-              union b_mode_info *bmi = &mi->bmi[0];
-              MV *mv = &bmi->mv.as_mv;
-
-              x1 = x0 + 8 + (mv->col >> 3);
-              y1 = y0 + 4 + (mv->row >> 3);
-
-              constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
-              vp8_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[8];
-
-              x1 = x0 + 8 + (mv->col >> 3);
-              y1 = y0 + 12 + (mv->row >> 3);
-
-              constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
-              vp8_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride);
-
-              break;
-            }
-            case 1: /* mv_left_right */
-            {
-              union b_mode_info *bmi = &mi->bmi[0];
-              MV *mv = &bmi->mv.as_mv;
-
-              x1 = x0 + 4 + (mv->col >> 3);
-              y1 = y0 + 8 + (mv->row >> 3);
-
-              constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
-              vp8_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[2];
-
-              x1 = x0 + 12 + (mv->col >> 3);
-              y1 = y0 + 8 + (mv->row >> 3);
-
-              constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
-              vp8_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride);
-
-              break;
-            }
-            case 2: /* mv_quarters   */
-            {
-              union b_mode_info *bmi = &mi->bmi[0];
-              MV *mv = &bmi->mv.as_mv;
-
-              x1 = x0 + 4 + (mv->col >> 3);
-              y1 = y0 + 4 + (mv->row >> 3);
-
-              constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
-              vp8_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[2];
-
-              x1 = x0 + 12 + (mv->col >> 3);
-              y1 = y0 + 4 + (mv->row >> 3);
-
-              constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
-              vp8_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[8];
-
-              x1 = x0 + 4 + (mv->col >> 3);
-              y1 = y0 + 12 + (mv->row >> 3);
-
-              constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
-              vp8_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[10];
-
-              x1 = x0 + 12 + (mv->col >> 3);
-              y1 = y0 + 12 + (mv->row >> 3);
-
-              constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
-              vp8_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride);
-              break;
-            }
-            default: {
-              union b_mode_info *bmi = mi->bmi;
-              int bx0, by0;
-
-              for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
-                for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
-                  MV *mv = &bmi->mv.as_mv;
-
-                  x1 = bx0 + 2 + (mv->col >> 3);
-                  y1 = by0 + 2 + (mv->row >> 3);
-
-                  constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
-                  vp8_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride);
-
-                  bmi++;
-                }
-              }
-            }
-          }
-        } else if (mi->mbmi.mode >= NEARESTMV) {
-          MV *mv = &mi->mbmi.mv.as_mv;
-          const int lx0 = x0 + 8;
-          const int ly0 = y0 + 8;
-
-          x1 = lx0 + (mv->col >> 3);
-          y1 = ly0 + (mv->row >> 3);
-
-          if (x1 != lx0 && y1 != ly0) {
-            constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
-            vp8_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride);
-
-            constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
-            vp8_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride);
-          } else
-            vp8_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride);
-        }
-
-        mi++;
-      }
-      mi++;
-    }
-  }
-
-  /* Color in block modes */
-  if ((flags & VP8D_DEBUG_CLR_BLK_MODES) &&
-      (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
-    int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int width = post->y_width;
-    int height = post->y_height;
-    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
-    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
-    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
-
-    for (y = 0; y < height; y += 16) {
-      for (x = 0; x < width; x += 16) {
-        int Y = 0, U = 0, V = 0;
-
-        if (mi->mbmi.mode == B_PRED &&
-            ((ppflags->display_mb_modes_flag & B_PRED) ||
-             ppflags->display_b_modes_flag)) {
-          int by, bx;
-          unsigned char *yl, *ul, *vl;
-          union b_mode_info *bmi = mi->bmi;
-
-          yl = y_ptr + x;
-          ul = u_ptr + (x >> 1);
-          vl = v_ptr + (x >> 1);
-
-          for (by = 0; by < 16; by += 4) {
-            for (bx = 0; bx < 16; bx += 4) {
-              if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) ||
-                  (ppflags->display_mb_modes_flag & B_PRED)) {
-                Y = B_PREDICTION_MODE_colors[bmi->as_mode][0];
-                U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
-                V = B_PREDICTION_MODE_colors[bmi->as_mode][2];
-
-                vp8_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V,
-                            0xc000, y_stride);
-              }
-              bmi++;
-            }
-
-            yl += y_stride * 4;
-            ul += y_stride * 1;
-            vl += y_stride * 1;
-          }
-        } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) {
-          Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
-          U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
-          V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
-
-          vp8_blend_mb_inner(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), Y,
-                             U, V, 0xc000, y_stride);
-        }
-
-        mi++;
-      }
-      y_ptr += y_stride * 16;
-      u_ptr += y_stride * 4;
-      v_ptr += y_stride * 4;
-
-      mi++;
-    }
-  }
-
-  /* Color in frame reference blocks */
-  if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) &&
-      ppflags->display_ref_frame_flag) {
-    int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int width = post->y_width;
-    int height = post->y_height;
-    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
-    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
-    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
-
-    for (y = 0; y < height; y += 16) {
-      for (x = 0; x < width; x += 16) {
-        int Y = 0, U = 0, V = 0;
-
-        if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) {
-          Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
-          U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
-          V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
-
-          vp8_blend_mb_outer(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), Y,
-                             U, V, 0xc000, y_stride);
-        }
-
-        mi++;
-      }
-      y_ptr += y_stride * 16;
-      u_ptr += y_stride * 4;
-      v_ptr += y_stride * 4;
-
-      mi++;
-    }
-  }
-#endif
-
   *dest = oci->post_proc_buffer;
 
   /* handle problem with extending borders */
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 8dc36f731..5d8e4a78d 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -173,10 +173,8 @@ add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, in
 specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/;
 $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
 
-# TODO(johannkoenig): Add neon implementation
-# https://bugs.chromium.org/p/webm/issues/detail?id=1273
 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 dspr2 msa/;
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/;
 $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
 
 add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
diff --git a/vp8/common/textblit.c b/vp8/common/textblit.c
deleted file mode 100644
index e7c15c4e4..000000000
--- a/vp8/common/textblit.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-void vp8_blit_text(const char *msg, unsigned char *address, const int pitch) {
-  int letter_bitmap;
-  unsigned char *output_pos = address;
-  int colpos;
-  const int font[] = {
-    0x0,       0x5C00,    0x8020,    0xAFABEA,  0xD7EC0,   0x1111111, 0x1855740,
-    0x18000,   0x45C0,    0x74400,   0x51140,   0x23880,   0xC4000,   0x21080,
-    0x80000,   0x111110,  0xE9D72E,  0x87E40,   0x12AD732, 0xAAD62A,  0x4F94C4,
-    0x4D6B7,   0x456AA,   0x3E8423,  0xAAD6AA,  0xAAD6A2,  0x2800,    0x2A00,
-    0x8A880,   0x52940,   0x22A20,   0x15422,   0x6AD62E,  0x1E4A53E, 0xAAD6BF,
-    0x8C62E,   0xE8C63F,  0x118D6BF, 0x1094BF,  0xCAC62E,  0x1F2109F, 0x118FE31,
-    0xF8C628,  0x8A89F,   0x108421F, 0x1F1105F, 0x1F4105F, 0xE8C62E,  0x2294BF,
-    0x164C62E, 0x12694BF, 0x8AD6A2,  0x10FC21,  0x1F8421F, 0x744107,  0xF8220F,
-    0x1151151, 0x117041,  0x119D731, 0x47E0,    0x1041041, 0xFC400,   0x10440,
-    0x1084210, 0x820
-  };
-  colpos = 0;
-
-  while (msg[colpos] != 0) {
-    char letter = msg[colpos];
-    int fontcol, fontrow;
-
-    if (letter <= 'Z' && letter >= ' ')
-      letter_bitmap = font[letter - ' '];
-    else if (letter <= 'z' && letter >= 'a')
-      letter_bitmap = font[letter - 'a' + 'A' - ' '];
-    else
-      letter_bitmap = font[0];
-
-    for (fontcol = 6; fontcol >= 0; fontcol--)
-      for (fontrow = 0; fontrow < 5; ++fontrow)
-        output_pos[fontrow * pitch + fontcol] =
-            ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
-
-    output_pos += 7;
-    colpos++;
-  }
-}
-
-static void plot(const int x, const int y, unsigned char *image,
-                 const int pitch) {
-  image[x + y * pitch] ^= 255;
-}
-
-/* Bresenham line algorithm */
-void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
-                   const int pitch) {
-  int steep = abs(y1 - y0) > abs(x1 - x0);
-  int deltax, deltay;
-  int error, ystep, y, x;
-
-  if (steep) {
-    int t;
-    t = x0;
-    x0 = y0;
-    y0 = t;
-
-    t = x1;
-    x1 = y1;
-    y1 = t;
-  }
-
-  if (x0 > x1) {
-    int t;
-    t = x0;
-    x0 = x1;
-    x1 = t;
-
-    t = y0;
-    y0 = y1;
-    y1 = t;
-  }
-
-  deltax = x1 - x0;
-  deltay = abs(y1 - y0);
-  error = deltax / 2;
-
-  y = y0;
-
-  if (y0 < y1)
-    ystep = 1;
-  else
-    ystep = -1;
-
-  if (steep) {
-    for (x = x0; x <= x1; ++x) {
-      plot(y, x, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  } else {
-    for (x = x0; x <= x1; ++x) {
-      plot(x, y, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  }
-}
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index 69f8d103c..ca00583ca 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -181,8 +181,12 @@ sym(vp8_filter_block1d16_h6_sse2):
         movq        xmm3,       MMWORD PTR [rsi - 2]
         movq        xmm1,       MMWORD PTR [rsi + 6]
 
-        movq        xmm2,       MMWORD PTR [rsi +14]
-        pslldq      xmm2,       8
+        ; Load from rsi+11 (not rsi+14) so the 8-byte read stays in bounds.
+        movq        xmm2,       MMWORD PTR [rsi +11]
+        ; The lower bits are not cleared before 'or'ing with xmm1,
+        ; but that is OK because the values in the overlapping positions
+        ; are already equal to the ones in xmm1.
+        pslldq      xmm2,       5
 
         por         xmm2,       xmm1
         prefetcht2  [rsi+rax-2]
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index c06f24556..1f6cbd1d1 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -1291,6 +1291,8 @@ sym(vp8_bilinear_predict8x8_ssse3):
         movq        xmm7,       XMMWORD PTR [rsp+96]
         punpcklbw   xmm5,       xmm6
 
+        ; Because the source register (xmm0) is always treated as signed by
+        ; pmaddubsw, the constant '128' is treated as '-128'.
         pmaddubsw   xmm1,       xmm0
         pmaddubsw   xmm2,       xmm0
 
@@ -1319,6 +1321,10 @@ sym(vp8_bilinear_predict8x8_ssse3):
         psraw       xmm5,       VP8_FILTER_SHIFT
 
         psraw       xmm6,       VP8_FILTER_SHIFT
+
+        ; Having multiplied everything by '-128' and obtained negative
+        ; numbers, the unsigned saturation truncates those values to 0,
+        ; resulting in incorrect handling of xoffset == 0 && yoffset == 0
         packuswb    xmm1,       xmm1
 
         packuswb    xmm2,       xmm2
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 06c2f624f..1b100cfe8 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1056,7 +1056,7 @@ static void put_delta_q(vp8_writer *bc, int delta_q) {
 }
 
 void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
-                        unsigned char *dest_end, unsigned long *size) {
+                        unsigned char *dest_end, size_t *size) {
   int i, j;
   VP8_HEADER oh;
   VP8_COMMON *const pc = &cpi->common;
@@ -1347,7 +1347,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
 
   *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos;
 
-  cpi->partition_sz[0] = *size;
+  cpi->partition_sz[0] = (unsigned int)*size;
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
   {
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index f61cfbe90..6ebf233ed 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2746,7 +2746,7 @@ static int decide_key_frame(VP8_COMP *cpi) {
   return code_key_frame;
 }
 
-static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
+static void Pass1Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest,
                         unsigned int *frame_flags) {
   (void)size;
   (void)dest;
@@ -3185,7 +3185,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
   vp8_yv12_extend_frame_borders(cm->frame_to_show);
 }
 
-static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size,
+static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
                                       unsigned char *dest,
                                       unsigned char *dest_end,
                                       unsigned int *frame_flags) {
@@ -4384,7 +4384,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size,
 
   /* Update rate control heuristics */
   cpi->total_byte_count += (*size);
-  cpi->projected_frame_size = (*size) << 3;
+  cpi->projected_frame_size = (int)(*size) << 3;
 
   if (cpi->oxcf.number_of_layers > 1) {
     unsigned int i;
@@ -4711,7 +4711,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size,
   /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
 }
 #if !CONFIG_REALTIME_ONLY
-static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
+static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest,
                         unsigned char *dest_end, unsigned int *frame_flags) {
   if (!cpi->common.refresh_alt_ref_frame) vp8_second_pass(cpi);
 
@@ -4764,7 +4764,7 @@ static int frame_is_reference(const VP8_COMP *cpi) {
 }
 
 int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
-                            unsigned long *size, unsigned char *dest,
+                            size_t *size, unsigned char *dest,
                             unsigned char *dest_end, int64_t *time_stamp,
                             int64_t *time_end, int flush) {
   VP8_COMMON *cm;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 32080eff7..59ad5773a 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -687,7 +687,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate);
 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
 
 void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
-                        unsigned char *dest_end, unsigned long *size);
+                        unsigned char *dest_end, size_t *size);
 
 void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **);
 
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index f0050d201..7b68d35f5 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -570,7 +570,7 @@ static int evaluate_inter_mode(unsigned int *sse, int rate2, int *distortion2,
     // No adjustment if block is considered to be skin area.
     if (x->is_skin) rd_adj = 100;
 
-    this_rd = ((int64_t)this_rd) * rd_adj / 100;
+    this_rd = (int)(((int64_t)this_rd) * rd_adj / 100);
   }
 
   check_for_encode_breakout(*sse, x);
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index d863a0a26..886b127d6 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -68,7 +68,6 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
 
 
 
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
 VP8_COMMON_SRCS-yes += common/treecoder.c
 
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 0ec6902e7..fac237eec 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -824,7 +824,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
     unsigned int lib_flags;
     YV12_BUFFER_CONFIG sd;
     int64_t dst_time_stamp, dst_end_time_stamp;
-    unsigned long size, cx_data_sz;
+    size_t size, cx_data_sz;
     unsigned char *cx_data;
     unsigned char *cx_data_end;
     int comp_data_state = 0;
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index cab0a9997..b1f8340d6 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -46,13 +46,6 @@ struct vpx_codec_alg_priv {
   int decoder_init;
   int postproc_cfg_set;
   vp8_postproc_cfg_t postproc_cfg;
-#if CONFIG_POSTPROC_VISUALIZER
-  unsigned int dbg_postproc_flag;
-  int dbg_color_ref_frame_flag;
-  int dbg_color_mb_modes_flag;
-  int dbg_color_b_modes_flag;
-  int dbg_display_mv_flag;
-#endif
   vpx_decrypt_cb decrypt_cb;
   void *decrypt_state;
   vpx_image_t img;
@@ -478,22 +471,8 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
 
     if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
       flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag;
-#if CONFIG_POSTPROC_VISUALIZER
-      flags.post_proc_flag |=
-          ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS
-                                                : 0) |
-          ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0) |
-          ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0) |
-          ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0);
-#endif
       flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
       flags.noise_level = ctx->postproc_cfg.noise_level;
-#if CONFIG_POSTPROC_VISUALIZER
-      flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
-      flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
-      flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
-      flags.display_mv_flag = ctx->dbg_display_mv_flag;
-#endif
     }
 
     if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd,
@@ -589,54 +568,6 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
 #endif
 }
 
-static vpx_codec_err_t vp8_set_dbg_color_ref_frame(vpx_codec_alg_priv_t *ctx,
-                                                   va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
-  ctx->dbg_color_ref_frame_flag = va_arg(args, int);
-  return VPX_CODEC_OK;
-#else
-  (void)ctx;
-  (void)args;
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_set_dbg_color_mb_modes(vpx_codec_alg_priv_t *ctx,
-                                                  va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
-  ctx->dbg_color_mb_modes_flag = va_arg(args, int);
-  return VPX_CODEC_OK;
-#else
-  (void)ctx;
-  (void)args;
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_set_dbg_color_b_modes(vpx_codec_alg_priv_t *ctx,
-                                                 va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
-  ctx->dbg_color_b_modes_flag = va_arg(args, int);
-  return VPX_CODEC_OK;
-#else
-  (void)ctx;
-  (void)args;
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_set_dbg_display_mv(vpx_codec_alg_priv_t *ctx,
-                                              va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
-  ctx->dbg_display_mv_flag = va_arg(args, int);
-  return VPX_CODEC_OK;
-#else
-  (void)ctx;
-  (void)args;
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
 static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
                                                 va_list args) {
   int *update_info = va_arg(args, int *);
@@ -706,10 +637,6 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = {
   { VP8_SET_REFERENCE, vp8_set_reference },
   { VP8_COPY_REFERENCE, vp8_get_reference },
   { VP8_SET_POSTPROC, vp8_set_postproc },
-  { VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_color_ref_frame },
-  { VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_color_mb_modes },
-  { VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_color_b_modes },
-  { VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_display_mv },
   { VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates },
   { VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted },
   { VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame },
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index ff7c1dd3f..dc2e03946 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -139,8 +139,6 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
 
   // The calculation can be simplified if there are not many non-zero dct
   // coefficients. Use eobs to decide what to do.
-  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
-  // Combine that with code here.
   if (eob == 1)
     // DC only DCT coefficient
     vpx_idct8x8_1_add(input, dest, stride);
@@ -204,6 +202,18 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
+
+// 12 signal input bits + 7 forward transform amplify bits + 1 bit
+// for contingency in rounding and quantizing.
+#define VALID_IHT_MAGNITUDE_RANGE (1 << 20)
+static INLINE int detect_invalid_iht_input(const tran_low_t *input, int size) {
+  const tran_low_t limit = VALID_IHT_MAGNITUDE_RANGE;
+  int i;
+  for (i = 0; i < size; ++i)  // compare directly: abs(INT_MIN) is undefined
+    if (input[i] <= -limit || input[i] >= limit) return 1;  // out of range
+  return 0;
+}
+
 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
   const highbd_transform_2d IHT_4[] = {
@@ -219,6 +229,13 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
   tran_low_t *outptr = out;
   tran_low_t temp_in[4], temp_out[4];
 
+  if (detect_invalid_iht_input(input, 16)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+    assert(0 && "invalid highbd iht input");
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+    return;
+  }
+
   // Inverse transform row vectors.
   for (i = 0; i < 4; ++i) {
     IHT_4[tx_type].rows(input, outptr, bd);
@@ -253,6 +270,13 @@ void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
   const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
+  if (detect_invalid_iht_input(input, 64)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+    assert(0 && "invalid highbd iht input");
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+    return;
+  }
+
   // Inverse transform row vectors.
   for (i = 0; i < 8; ++i) {
     ht.rows(input, outptr, bd);
@@ -287,6 +311,13 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
   const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
+  if (detect_invalid_iht_input(input, 256)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+    assert(0 && "invalid highbd iht input");
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+    return;
+  }
+
   // Rows
   for (i = 0; i < 16; ++i) {
     ht.rows(input, outptr, bd);
@@ -329,8 +360,6 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
 
   // The calculation can be simplified if there are not many non-zero dct
   // coefficients. Use eobs to decide what to do.
-  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
-  // Combine that with code here.
   // DC only DCT coefficient
   if (eob == 1) {
     vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index b6ae10b1b..b105e5d45 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -26,7 +26,6 @@
 
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_postproc.h"
-#include "vp9/common/vp9_textblit.h"
 
 #if CONFIG_VP9_POSTPROC
 
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index f315a3b85..37a867323 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -92,33 +92,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 # High bitdepth functions
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
-  # Sub Pixel Filters
-  #
-  add_proto qw/void vp9_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_highbd_convolve_copy/;
-
-  add_proto qw/void vp9_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_highbd_convolve_avg/;
-
-  add_proto qw/void vp9_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_highbd_convolve8/, "$sse2_x86_64";
-
-  add_proto qw/void vp9_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_highbd_convolve8_horiz/, "$sse2_x86_64";
-
-  add_proto qw/void vp9_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_highbd_convolve8_vert/, "$sse2_x86_64";
-
-  add_proto qw/void vp9_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_highbd_convolve8_avg/, "$sse2_x86_64";
-
-  add_proto qw/void vp9_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
-
-  add_proto qw/void vp9_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_highbd_convolve8_avg_vert/, "$sse2_x86_64";
-
-  #
   # post proc
   #
   if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
diff --git a/vp9/common/vp9_textblit.c b/vp9/common/vp9_textblit.c
deleted file mode 100644
index 9940137ca..000000000
--- a/vp9/common/vp9_textblit.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-#include "vp9/common/vp9_textblit.h"
-
-static const int font[] = {
-  0x0,       0x5C00,    0x8020,    0xAFABEA,  0xD7EC0,   0x1111111, 0x1855740,
-  0x18000,   0x45C0,    0x74400,   0x51140,   0x23880,   0xC4000,   0x21080,
-  0x80000,   0x111110,  0xE9D72E,  0x87E40,   0x12AD732, 0xAAD62A,  0x4F94C4,
-  0x4D6B7,   0x456AA,   0x3E8423,  0xAAD6AA,  0xAAD6A2,  0x2800,    0x2A00,
-  0x8A880,   0x52940,   0x22A20,   0x15422,   0x6AD62E,  0x1E4A53E, 0xAAD6BF,
-  0x8C62E,   0xE8C63F,  0x118D6BF, 0x1094BF,  0xCAC62E,  0x1F2109F, 0x118FE31,
-  0xF8C628,  0x8A89F,   0x108421F, 0x1F1105F, 0x1F4105F, 0xE8C62E,  0x2294BF,
-  0x164C62E, 0x12694BF, 0x8AD6A2,  0x10FC21,  0x1F8421F, 0x744107,  0xF8220F,
-  0x1151151, 0x117041,  0x119D731, 0x47E0,    0x1041041, 0xFC400,   0x10440,
-  0x1084210, 0x820
-};
-
-static void plot(int x, int y, unsigned char *image, int pitch) {
-  image[x + y * pitch] ^= 255;
-}
-
-void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
-  int letter_bitmap;
-  unsigned char *output_pos = address;
-  int colpos = 0;
-
-  while (msg[colpos] != 0) {
-    char letter = msg[colpos];
-    int fontcol, fontrow;
-
-    if (letter <= 'Z' && letter >= ' ')
-      letter_bitmap = font[letter - ' '];
-    else if (letter <= 'z' && letter >= 'a')
-      letter_bitmap = font[letter - 'a' + 'A' - ' '];
-    else
-      letter_bitmap = font[0];
-
-    for (fontcol = 6; fontcol >= 0; fontcol--)
-      for (fontrow = 0; fontrow < 5; fontrow++)
-        output_pos[fontrow * pitch + fontcol] =
-            ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
-
-    output_pos += 7;
-    colpos++;
-  }
-}
-
-/* Bresenham line algorithm */
-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
-                   int pitch) {
-  int steep = abs(y1 - y0) > abs(x1 - x0);
-  int deltax, deltay;
-  int error, ystep, y, x;
-
-  if (steep) {
-    int t;
-    t = x0;
-    x0 = y0;
-    y0 = t;
-
-    t = x1;
-    x1 = y1;
-    y1 = t;
-  }
-
-  if (x0 > x1) {
-    int t;
-    t = x0;
-    x0 = x1;
-    x1 = t;
-
-    t = y0;
-    y0 = y1;
-    y1 = t;
-  }
-
-  deltax = x1 - x0;
-  deltay = abs(y1 - y0);
-  error = deltax / 2;
-
-  y = y0;
-
-  if (y0 < y1)
-    ystep = 1;
-  else
-    ystep = -1;
-
-  if (steep) {
-    for (x = x0; x <= x1; x++) {
-      plot(y, x, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  } else {
-    for (x = x0; x <= x1; x++) {
-      plot(x, y, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  }
-}
diff --git a/vp9/common/vp9_textblit.h b/vp9/common/vp9_textblit.h
deleted file mode 100644
index 158ec1b37..000000000
--- a/vp9/common/vp9_textblit.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_TEXTBLIT_H_
-#define VP9_COMMON_VP9_TEXTBLIT_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void vp9_blit_text(const char *msg, unsigned char *address, int pitch);
-
-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
-                   int pitch);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_COMMON_VP9_TEXTBLIT_H_
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index af2c900e6..fde0b7e31 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -318,11 +318,11 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane,
   }
 }
 
-static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
-                                                vpx_reader *r,
+static void predict_and_reconstruct_intra_block(TileWorkerData *twd,
                                                 MODE_INFO *const mi, int plane,
                                                 int row, int col,
                                                 TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &twd->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode;
   uint8_t *dst;
@@ -340,7 +340,7 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
     const scan_order *sc = (plane || xd->lossless)
                                ? &vp9_default_scan_orders[tx_size]
                                : &vp9_scan_orders[tx_size][tx_type];
-    const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
+    const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
                                             mi->segment_id);
     if (eob > 0) {
       inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst,
@@ -349,12 +349,13 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
   }
 }
 
-static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
-                                   MODE_INFO *const mi, int plane, int row,
-                                   int col, TX_SIZE tx_size) {
+static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi,
+                                   int plane, int row, int col,
+                                   TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &twd->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const scan_order *sc = &vp9_default_scan_orders[tx_size];
-  const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
+  const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
                                           mi->segment_id);
 
   if (eob > 0) {
@@ -761,15 +762,16 @@ static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   return xd->mi[0];
 }
 
-static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
-                         int mi_row, int mi_col, vpx_reader *r,
-                         BLOCK_SIZE bsize, int bwl, int bhl) {
+static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
+                         int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) {
   VP9_COMMON *const cm = &pbi->common;
   const int less8x8 = bsize < BLOCK_8X8;
   const int bw = 1 << (bwl - 1);
   const int bh = 1 << (bhl - 1);
   const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
   const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+  vpx_reader *r = &twd->bit_reader;
+  MACROBLOCKD *const xd = &twd->xd;
 
   MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis,
                               y_mis, bwl, bhl);
@@ -782,7 +784,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                          "Invalid block size.");
   }
 
-  vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+  vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis);
 
   if (mi->skip) {
     dec_reset_skip_context(xd);
@@ -811,7 +813,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
 
       for (row = 0; row < max_blocks_high; row += step)
         for (col = 0; col < max_blocks_wide; col += step)
-          predict_and_reconstruct_intra_block(xd, r, mi, plane, row, col,
+          predict_and_reconstruct_intra_block(twd, mi, plane, row, col,
                                               tx_size);
     }
   } else {
@@ -845,7 +847,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
         for (row = 0; row < max_blocks_high; row += step)
           for (col = 0; col < max_blocks_wide; col += step)
             eobtotal +=
-                reconstruct_inter_block(xd, r, mi, plane, row, col, tx_size);
+                reconstruct_inter_block(twd, mi, plane, row, col, tx_size);
       }
 
       if (!less8x8 && eobtotal == 0) mi->skip = 1;  // skip loopfilter
@@ -859,10 +861,11 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
   }
 }
 
-static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd, int mi_row,
+static INLINE int dec_partition_plane_context(TileWorkerData *twd, int mi_row,
                                               int mi_col, int bsl) {
-  const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
-  const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  const PARTITION_CONTEXT *above_ctx = twd->xd.above_seg_context + mi_col;
+  const PARTITION_CONTEXT *left_ctx =
+      twd->xd.left_seg_context + (mi_row & MI_MASK);
   int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
 
   //  assert(bsl >= 0);
@@ -870,11 +873,12 @@ static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd, int mi_row,
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
-static INLINE void dec_update_partition_context(MACROBLOCKD *xd, int mi_row,
+static INLINE void dec_update_partition_context(TileWorkerData *twd, int mi_row,
                                                 int mi_col, BLOCK_SIZE subsize,
                                                 int bw) {
-  PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
-  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  PARTITION_CONTEXT *const above_ctx = twd->xd.above_seg_context + mi_col;
+  PARTITION_CONTEXT *const left_ctx =
+      twd->xd.left_seg_context + (mi_row & MI_MASK);
 
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
@@ -883,13 +887,14 @@ static INLINE void dec_update_partition_context(MACROBLOCKD *xd, int mi_row,
   memset(left_ctx, partition_context_lookup[subsize].left, bw);
 }
 
-static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                     vpx_reader *r, int has_rows, int has_cols,
+static PARTITION_TYPE read_partition(TileWorkerData *twd, int mi_row,
+                                     int mi_col, int has_rows, int has_cols,
                                      int bsl) {
-  const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
-  const vpx_prob *const probs = get_partition_probs(xd, ctx);
-  FRAME_COUNTS *counts = xd->counts;
+  const int ctx = dec_partition_plane_context(twd, mi_row, mi_col, bsl);
+  const vpx_prob *const probs = twd->xd.partition_probs[ctx];
+  FRAME_COUNTS *counts = twd->xd.counts;
   PARTITION_TYPE p;
+  vpx_reader *r = &twd->bit_reader;
 
   if (has_rows && has_cols)
     p = (PARTITION_TYPE)vpx_read_tree(r, vp9_partition_tree, probs);
@@ -906,9 +911,9 @@ static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
 }
 
 // TODO(slavarnway): eliminate bsize and subsize in future commits
-static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
-                             int mi_row, int mi_col, vpx_reader *r,
-                             BLOCK_SIZE bsize, int n4x4_l2) {
+static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi,
+                             int mi_row, int mi_col, BLOCK_SIZE bsize,
+                             int n4x4_l2) {
   VP9_COMMON *const cm = &pbi->common;
   const int n8x8_l2 = n4x4_l2 - 1;
   const int num_8x8_wh = 1 << n8x8_l2;
@@ -917,39 +922,39 @@ static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
   BLOCK_SIZE subsize;
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
+  MACROBLOCKD *const xd = &twd->xd;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-  partition =
-      read_partition(xd, mi_row, mi_col, r, has_rows, has_cols, n8x8_l2);
+  partition = read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2);
   subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
   if (!hbs) {
     // calculate bmode block dimensions (log 2)
     xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
     xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
-    decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
+    decode_block(twd, pbi, mi_row, mi_col, subsize, 1, 1);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
+        decode_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2);
         break;
       case PARTITION_HORZ:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
+        decode_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2);
         if (has_rows)
-          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
+          decode_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2,
                        n8x8_l2);
         break;
       case PARTITION_VERT:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
+        decode_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2);
         if (has_cols)
-          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
+          decode_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
                        n4x4_l2);
         break;
       case PARTITION_SPLIT:
-        decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
-        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
+        decode_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2);
+        decode_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2);
+        decode_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2);
+        decode_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize,
                          n8x8_l2);
         break;
       default: assert(0 && "Invalid partition type");
@@ -959,7 +964,7 @@ static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+    dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
 }
 
 static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
@@ -1442,8 +1447,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
         vp9_zero(tile_data->xd.left_seg_context);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
-          decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
-                           &tile_data->bit_reader, BLOCK_64X64, 4);
+          decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
         }
         pbi->mb.corrupted |= tile_data->xd.corrupted;
         if (pbi->mb.corrupted)
@@ -1532,8 +1536,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
       vp9_zero(tile_data->xd.left_seg_context);
       for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
            mi_col += MI_BLOCK_SIZE) {
-        decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
-                         &tile_data->bit_reader, BLOCK_64X64, 4);
+        decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
       }
     }
 
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 7358c9a39..4372ba037 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -241,7 +241,7 @@ static int read_mv_component(vpx_reader *r, const nmv_component *mvcomp,
 
   // Integer part
   if (class0) {
-    d = vpx_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
+    d = vpx_read(r, mvcomp->class0[0]);
     mag = 0;
   } else {
     int i;
@@ -826,8 +826,10 @@ static INLINE void copy_ref_frame_pair(MV_REFERENCE_FRAME *dst,
   memcpy(dst, src, sizeof(*dst) * 2);
 }
 
-void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
-                        int mi_col, vpx_reader *r, int x_mis, int y_mis) {
+void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
+                        int mi_col, int x_mis, int y_mis) {
+  vpx_reader *r = &twd->bit_reader;
+  MACROBLOCKD *const xd = &twd->xd;
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MV_REF *frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index 4e11c2fc0..b460cb8fb 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -19,8 +19,8 @@
 extern "C" {
 #endif
 
-void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
-                        int mi_col, vpx_reader *r, int x_mis, int y_mis);
+void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
+                        int mi_col, int x_mis, int y_mis);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index cc01909ff..7048fb1ca 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -29,9 +29,45 @@
     if (counts) ++coef_counts[band][ctx][token]; \
   } while (0)
 
-static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
+static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value,
+                            int *count, unsigned int *range) {
+  const unsigned int split = (*range * prob + (256 - prob)) >> CHAR_BIT;
+  const BD_VALUE bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
+
+  if (*count < 0) {
+    r->value = *value;
+    r->count = *count;
+    vpx_reader_fill(r);
+    *value = r->value;
+    *count = r->count;
+  }
+
+  if (*value >= bigsplit) {
+    *range = *range - split;
+    *value = *value - bigsplit;
+    {
+      const int shift = vpx_norm[*range];
+      *range <<= shift;
+      *value <<= shift;
+      *count -= shift;
+    }
+    return 1;
+  }
+  *range = split;
+  {
+    const int shift = vpx_norm[*range];
+    *range <<= shift;
+    *value <<= shift;
+    *count -= shift;
+  }
+  return 0;
+}
+
+static INLINE int read_coeff(vpx_reader *r, const vpx_prob *probs, int n,
+                             BD_VALUE *value, int *count, unsigned int *range) {
   int i, val = 0;
-  for (i = 0; i < n; ++i) val = (val << 1) | vpx_read(r, probs[i]);
+  for (i = 0; i < n; ++i)
+    val = (val << 1) | read_bool(r, probs[i], value, count, range);
   return val;
 }
 
@@ -52,7 +88,7 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
   uint8_t token_cache[32 * 32];
   const uint8_t *band_translate = get_band_translate(tx_size);
   const int dq_shift = (tx_size == TX_32X32);
-  int v, token;
+  int v;
   int16_t dqv = dq[0];
   const uint8_t *const cat6_prob =
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -66,6 +102,11 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
       (xd->bd == VPX_BITS_12) ? 18 : (xd->bd == VPX_BITS_10) ? 16 :
 #endif  // CONFIG_VP9_HIGHBITDEPTH
                                                              14;
+  // Keep value, range, and count as locals.  The compiler produces better
+  // results with the locals than using r directly.
+  BD_VALUE value = r->value;
+  unsigned int range = r->range;
+  int count = r->count;
 
   if (counts) {
     coef_counts = counts->coef[tx_size][type][ref];
@@ -77,70 +118,98 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
     band = *band_translate++;
     prob = coef_probs[band][ctx];
     if (counts) ++eob_branch_count[band][ctx];
-    if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) {
+    if (!read_bool(r, prob[EOB_CONTEXT_NODE], &value, &count, &range)) {
       INCREMENT_COUNT(EOB_MODEL_TOKEN);
       break;
     }
 
-    while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) {
+    while (!read_bool(r, prob[ZERO_CONTEXT_NODE], &value, &count, &range)) {
       INCREMENT_COUNT(ZERO_TOKEN);
       dqv = dq[1];
       token_cache[scan[c]] = 0;
       ++c;
-      if (c >= max_eob) return c;  // zero tokens at the end (no eob token)
+      if (c >= max_eob) {
+        r->value = value;
+        r->range = range;
+        r->count = count;
+        return c;  // zero tokens at the end (no eob token)
+      }
       ctx = get_coef_context(nb, token_cache, c);
       band = *band_translate++;
       prob = coef_probs[band][ctx];
     }
 
-    if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) {
-      INCREMENT_COUNT(ONE_TOKEN);
-      token = ONE_TOKEN;
-      val = 1;
-    } else {
+    if (read_bool(r, prob[ONE_CONTEXT_NODE], &value, &count, &range)) {
+      const vpx_prob *p = vp9_pareto8_full[prob[PIVOT_NODE] - 1];
       INCREMENT_COUNT(TWO_TOKEN);
-      token = vpx_read_tree(r, vp9_coef_con_tree,
-                            vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
-      switch (token) {
-        case TWO_TOKEN:
-        case THREE_TOKEN:
-        case FOUR_TOKEN: val = token; break;
-        case CATEGORY1_TOKEN:
-          val = CAT1_MIN_VAL + read_coeff(vp9_cat1_prob, 1, r);
-          break;
-        case CATEGORY2_TOKEN:
-          val = CAT2_MIN_VAL + read_coeff(vp9_cat2_prob, 2, r);
-          break;
-        case CATEGORY3_TOKEN:
-          val = CAT3_MIN_VAL + read_coeff(vp9_cat3_prob, 3, r);
-          break;
-        case CATEGORY4_TOKEN:
-          val = CAT4_MIN_VAL + read_coeff(vp9_cat4_prob, 4, r);
-          break;
-        case CATEGORY5_TOKEN:
-          val = CAT5_MIN_VAL + read_coeff(vp9_cat5_prob, 5, r);
-          break;
-        case CATEGORY6_TOKEN:
-          val = CAT6_MIN_VAL + read_coeff(cat6_prob, cat6_bits, r);
-          break;
+      if (read_bool(r, p[0], &value, &count, &range)) {
+        if (read_bool(r, p[3], &value, &count, &range)) {
+          token_cache[scan[c]] = 5;
+          if (read_bool(r, p[5], &value, &count, &range)) {
+            if (read_bool(r, p[7], &value, &count, &range)) {
+              val = CAT6_MIN_VAL +
+                    read_coeff(r, cat6_prob, cat6_bits, &value, &count, &range);
+            } else {
+              val = CAT5_MIN_VAL +
+                    read_coeff(r, vp9_cat5_prob, 5, &value, &count, &range);
+            }
+          } else if (read_bool(r, p[6], &value, &count, &range)) {
+            val = CAT4_MIN_VAL +
+                  read_coeff(r, vp9_cat4_prob, 4, &value, &count, &range);
+          } else {
+            val = CAT3_MIN_VAL +
+                  read_coeff(r, vp9_cat3_prob, 3, &value, &count, &range);
+          }
+        } else {
+          token_cache[scan[c]] = 4;
+          if (read_bool(r, p[4], &value, &count, &range)) {
+            val = CAT2_MIN_VAL +
+                  read_coeff(r, vp9_cat2_prob, 2, &value, &count, &range);
+          } else {
+            val = CAT1_MIN_VAL +
+                  read_coeff(r, vp9_cat1_prob, 1, &value, &count, &range);
+          }
+        }
+        v = (val * dqv) >> dq_shift;
+      } else {
+        if (read_bool(r, p[1], &value, &count, &range)) {
+          token_cache[scan[c]] = 3;
+          v = ((3 + read_bool(r, p[2], &value, &count, &range)) * dqv) >>
+              dq_shift;
+        } else {
+          token_cache[scan[c]] = 2;
+          v = (2 * dqv) >> dq_shift;
+        }
       }
+    } else {
+      INCREMENT_COUNT(ONE_TOKEN);
+      token_cache[scan[c]] = 1;
+      v = dqv >> dq_shift;
     }
-    v = (val * dqv) >> dq_shift;
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
 #if CONFIG_VP9_HIGHBITDEPTH
-    dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v), xd->bd);
+    dqcoeff[scan[c]] =
+        highbd_check_range((read_bool(r, 128, &value, &count, &range) ? -v : v),
+                           xd->bd);
 #else
-    dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v);
+    dqcoeff[scan[c]] =
+        check_range(read_bool(r, 128, &value, &count, &range) ? -v : v);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #else
-    dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v;
+    if (read_bool(r, 128, &value, &count, &range)) {
+      dqcoeff[scan[c]] = -v;
+    } else {
+      dqcoeff[scan[c]] = v;
+    }
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-    token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++c;
     ctx = get_coef_context(nb, token_cache, c);
     dqv = dq[1];
   }
 
+  r->value = value;
+  r->range = range;
+  r->count = count;
   return c;
 }
 
@@ -156,9 +225,11 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l,
   }
 }
 
-int vp9_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc,
-                            int x, int y, TX_SIZE tx_size, vpx_reader *r,
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
+                            const scan_order *sc, int x, int y, TX_SIZE tx_size,
                             int seg_id) {
+  vpx_reader *r = &twd->bit_reader;
+  MACROBLOCKD *xd = &twd->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int16_t *const dequant = pd->seg_dequant[seg_id];
   int eob;
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index aa2afb16a..7b0d87601 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -19,8 +19,8 @@
 extern "C" {
 #endif
 
-int vp9_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc,
-                            int x, int y, TX_SIZE tx_size, vpx_reader *r,
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
+                            const scan_order *sc, int x, int y, TX_SIZE tx_size,
                             int seg_id);
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 8e76f72fe..874a8e4b9 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -21,12 +21,10 @@
 static struct vp9_token mv_joint_encodings[MV_JOINTS];
 static struct vp9_token mv_class_encodings[MV_CLASSES];
 static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
-static struct vp9_token mv_class0_encodings[CLASS0_SIZE];
 
 void vp9_entropy_mv_init(void) {
   vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree);
   vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree);
-  vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree);
   vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
 }
 
@@ -51,8 +49,7 @@ static void encode_mv_component(vpx_writer *w, int comp,
 
   // Integer bits
   if (mv_class == MV_CLASS_0) {
-    vp9_write_token(w, vp9_mv_class0_tree, mvcomp->class0,
-                    &mv_class0_encodings[d]);
+    vpx_write(w, d, mvcomp->class0[0]);
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 1583cc8ab..6fc7cd1e3 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -180,6 +180,10 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
     }
 #else
     int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+    if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
+        cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME)
+      filt_guess = 5 * filt_guess >> 3;
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2fd42960e..5bfc0d359 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -45,7 +45,6 @@ VP9_COMMON_SRCS-yes += common/vp9_scale.h
 VP9_COMMON_SRCS-yes += common/vp9_scale.c
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
-VP9_COMMON_SRCS-yes += common/vp9_textblit.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
@@ -55,7 +54,6 @@ VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
-VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
 VP9_COMMON_SRCS-yes += common/vp9_common_data.c
 VP9_COMMON_SRCS-yes += common/vp9_common_data.h
 VP9_COMMON_SRCS-yes += common/vp9_scan.c
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 04b1dca29..3b5dc3dda 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -829,13 +829,6 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx,
 #endif
 }
 
-static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx,
-                                            va_list args) {
-  (void)ctx;
-  (void)args;
-  return VPX_CODEC_INCAPABLE;
-}
-
 static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
                                                  va_list args) {
   int *const update_info = va_arg(args, int *);
@@ -1014,10 +1007,6 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   // Setters
   { VP8_SET_REFERENCE, ctrl_set_reference },
   { VP8_SET_POSTPROC, ctrl_set_postproc },
-  { VP8_SET_DBG_COLOR_REF_FRAME, ctrl_set_dbg_options },
-  { VP8_SET_DBG_COLOR_MB_MODES, ctrl_set_dbg_options },
-  { VP8_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options },
-  { VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options },
   { VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order },
   { VPXD_SET_DECRYPTOR, ctrl_set_decryptor },
   { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
diff --git a/vpx/vp8.h b/vpx/vp8.h
index c3eb5265a..059c9d0f6 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -47,11 +47,10 @@ enum vp8_com_control_id {
   VP8_SET_REFERENCE = 1,
   VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
   VP8_SET_POSTPROC = 3,   /**< set the decoder's post processing settings  */
-  VP8_SET_DBG_COLOR_REF_FRAME =
-      4, /**< set the reference frames to color for each macroblock */
-  VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
-  VP8_SET_DBG_COLOR_B_MODES = 6,  /**< set which blocks modes to color */
-  VP8_SET_DBG_DISPLAY_MV = 7,     /**< set which motion vector modes to draw */
+  VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< \deprecated */
+  VP8_SET_DBG_COLOR_MB_MODES = 5,  /**< \deprecated */
+  VP8_SET_DBG_COLOR_B_MODES = 6,   /**< \deprecated */
+  VP8_SET_DBG_DISPLAY_MV = 7,      /**< \deprecated */
 
   /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
    * for its control ids. These should be migrated to something like the
@@ -133,13 +132,13 @@ VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *)
 #define VPX_CTRL_VP8_COPY_REFERENCE
 VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *)
 #define VPX_CTRL_VP8_SET_POSTPROC
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_REF_FRAME, int)
 #define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int)
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_MB_MODES, int)
 #define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int)
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_B_MODES, int)
 #define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int)
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_DISPLAY_MV, int)
 #define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV
 VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *)
 #define VPX_CTRL_VP9_GET_REFERENCE
diff --git a/vpx_dsp/arm/loopfilter_16_neon.c b/vpx_dsp/arm/loopfilter_16_neon.c
deleted file mode 100644
index 9607bb240..000000000
--- a/vpx_dsp/arm/loopfilter_16_neon.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
-                                       uint8x16_t qlimit,   // limit
-                                       uint8x16_t qthresh,  // thresh
-                                       uint8x16_t q3,       // p3
-                                       uint8x16_t q4,       // p2
-                                       uint8x16_t q5,       // p1
-                                       uint8x16_t q6,       // p0
-                                       uint8x16_t q7,       // q0
-                                       uint8x16_t q8,       // q1
-                                       uint8x16_t q9,       // q2
-                                       uint8x16_t q10,      // q3
-                                       uint8x16_t *q5r,     // p1
-                                       uint8x16_t *q6r,     // p0
-                                       uint8x16_t *q7r,     // q0
-                                       uint8x16_t *q8r) {   // q1
-  uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
-  int16x8_t q2s16, q11s16;
-  uint16x8_t q4u16;
-  int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
-  int8x8_t d2s8, d3s8;
-
-  q11u8 = vabdq_u8(q3, q4);
-  q12u8 = vabdq_u8(q4, q5);
-  q13u8 = vabdq_u8(q5, q6);
-  q14u8 = vabdq_u8(q8, q7);
-  q3 = vabdq_u8(q9, q8);
-  q4 = vabdq_u8(q10, q9);
-
-  q11u8 = vmaxq_u8(q11u8, q12u8);
-  q12u8 = vmaxq_u8(q13u8, q14u8);
-  q3 = vmaxq_u8(q3, q4);
-  q15u8 = vmaxq_u8(q11u8, q12u8);
-
-  q9 = vabdq_u8(q6, q7);
-
-  // vp8_hevmask
-  q13u8 = vcgtq_u8(q13u8, qthresh);
-  q14u8 = vcgtq_u8(q14u8, qthresh);
-  q15u8 = vmaxq_u8(q15u8, q3);
-
-  q2u8 = vabdq_u8(q5, q8);
-  q9 = vqaddq_u8(q9, q9);
-
-  q15u8 = vcgeq_u8(qlimit, q15u8);
-
-  // vp8_filter() function
-  // convert to signed
-  q10 = vdupq_n_u8(0x80);
-  q8 = veorq_u8(q8, q10);
-  q7 = veorq_u8(q7, q10);
-  q6 = veorq_u8(q6, q10);
-  q5 = veorq_u8(q5, q10);
-
-  q2u8 = vshrq_n_u8(q2u8, 1);
-  q9 = vqaddq_u8(q9, q2u8);
-
-  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
-                   vget_low_s8(vreinterpretq_s8_u8(q6)));
-  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
-                    vget_high_s8(vreinterpretq_s8_u8(q6)));
-
-  q9 = vcgeq_u8(qblimit, q9);
-
-  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
-
-  q14u8 = vorrq_u8(q13u8, q14u8);
-
-  q4u16 = vdupq_n_u16(3);
-  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
-  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
-
-  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
-  q15u8 = vandq_u8(q15u8, q9);
-
-  q1s8 = vreinterpretq_s8_u8(q1u8);
-  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
-  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
-
-  q4 = vdupq_n_u8(3);
-  q9 = vdupq_n_u8(4);
-  // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
-  d2s8 = vqmovn_s16(q2s16);
-  d3s8 = vqmovn_s16(q11s16);
-  q1s8 = vcombine_s8(d2s8, d3s8);
-  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
-  q1s8 = vreinterpretq_s8_u8(q1u8);
-
-  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
-  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
-  q2s8 = vshrq_n_s8(q2s8, 3);
-  q1s8 = vshrq_n_s8(q1s8, 3);
-
-  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
-  q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
-
-  q1s8 = vrshrq_n_s8(q1s8, 1);
-  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
-
-  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
-  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
-
-  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
-  *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
-  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
-  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
-  return;
-}
-
-void vpx_lpf_horizontal_4_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
-  uint8x16_t qblimit, qlimit, qthresh;
-  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
-
-  dblimit0 = vld1_u8(blimit0);
-  dlimit0 = vld1_u8(limit0);
-  dthresh0 = vld1_u8(thresh0);
-  dblimit1 = vld1_u8(blimit1);
-  dlimit1 = vld1_u8(limit1);
-  dthresh1 = vld1_u8(thresh1);
-  qblimit = vcombine_u8(dblimit0, dblimit1);
-  qlimit = vcombine_u8(dlimit0, dlimit1);
-  qthresh = vcombine_u8(dthresh0, dthresh1);
-
-  s -= (p << 2);
-
-  q3u8 = vld1q_u8(s);
-  s += p;
-  q4u8 = vld1q_u8(s);
-  s += p;
-  q5u8 = vld1q_u8(s);
-  s += p;
-  q6u8 = vld1q_u8(s);
-  s += p;
-  q7u8 = vld1q_u8(s);
-  s += p;
-  q8u8 = vld1q_u8(s);
-  s += p;
-  q9u8 = vld1q_u8(s);
-  s += p;
-  q10u8 = vld1q_u8(s);
-
-  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
-                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
-
-  s -= (p * 5);
-  vst1q_u8(s, q5u8);
-  s += p;
-  vst1q_u8(s, q6u8);
-  s += p;
-  vst1q_u8(s, q7u8);
-  s += p;
-  vst1q_u8(s, q8u8);
-  return;
-}
diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c
deleted file mode 100644
index 1c1e80e00..000000000
--- a/vpx_dsp/arm/loopfilter_4_neon.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-
-static INLINE void loop_filter_neon(uint8x8_t dblimit,   // flimit
-                                    uint8x8_t dlimit,    // limit
-                                    uint8x8_t dthresh,   // thresh
-                                    uint8x8_t d3u8,      // p3
-                                    uint8x8_t d4u8,      // p2
-                                    uint8x8_t d5u8,      // p1
-                                    uint8x8_t d6u8,      // p0
-                                    uint8x8_t d7u8,      // q0
-                                    uint8x8_t d16u8,     // q1
-                                    uint8x8_t d17u8,     // q2
-                                    uint8x8_t d18u8,     // q3
-                                    uint8x8_t *d4ru8,    // p1
-                                    uint8x8_t *d5ru8,    // p0
-                                    uint8x8_t *d6ru8,    // q0
-                                    uint8x8_t *d7ru8) {  // q1
-  uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
-  int16x8_t q12s16;
-  int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
-
-  d19u8 = vabd_u8(d3u8, d4u8);
-  d20u8 = vabd_u8(d4u8, d5u8);
-  d21u8 = vabd_u8(d5u8, d6u8);
-  d22u8 = vabd_u8(d16u8, d7u8);
-  d3u8 = vabd_u8(d17u8, d16u8);
-  d4u8 = vabd_u8(d18u8, d17u8);
-
-  d19u8 = vmax_u8(d19u8, d20u8);
-  d20u8 = vmax_u8(d21u8, d22u8);
-  d3u8 = vmax_u8(d3u8, d4u8);
-  d23u8 = vmax_u8(d19u8, d20u8);
-
-  d17u8 = vabd_u8(d6u8, d7u8);
-
-  d21u8 = vcgt_u8(d21u8, dthresh);
-  d22u8 = vcgt_u8(d22u8, dthresh);
-  d23u8 = vmax_u8(d23u8, d3u8);
-
-  d28u8 = vabd_u8(d5u8, d16u8);
-  d17u8 = vqadd_u8(d17u8, d17u8);
-
-  d23u8 = vcge_u8(dlimit, d23u8);
-
-  d18u8 = vdup_n_u8(0x80);
-  d5u8 = veor_u8(d5u8, d18u8);
-  d6u8 = veor_u8(d6u8, d18u8);
-  d7u8 = veor_u8(d7u8, d18u8);
-  d16u8 = veor_u8(d16u8, d18u8);
-
-  d28u8 = vshr_n_u8(d28u8, 1);
-  d17u8 = vqadd_u8(d17u8, d28u8);
-
-  d19u8 = vdup_n_u8(3);
-
-  d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
-
-  d17u8 = vcge_u8(dblimit, d17u8);
-
-  d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
-
-  d22u8 = vorr_u8(d21u8, d22u8);
-
-  q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
-  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
-  d23u8 = vand_u8(d23u8, d17u8);
-
-  q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
-
-  d17u8 = vdup_n_u8(4);
-
-  d27s8 = vqmovn_s16(q12s16);
-  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
-  d27s8 = vreinterpret_s8_u8(d27u8);
-
-  d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
-  d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
-  d28s8 = vshr_n_s8(d28s8, 3);
-  d27s8 = vshr_n_s8(d27s8, 3);
-
-  d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
-  d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
-
-  d27s8 = vrshr_n_s8(d27s8, 1);
-  d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
-
-  d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
-  d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
-
-  *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
-  *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
-  *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
-  *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
-  return;
-}
-
-void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  uint8_t *s, *psrc;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  psrc = src - (pitch << 2);
-  for (i = 0; i < 1; i++) {
-    s = psrc + i * 8;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
-
-    s -= (pitch * 5);
-    vst1_u8(s, d4u8);
-    s += pitch;
-    vst1_u8(s, d5u8);
-    s += pitch;
-    vst1_u8(s, d6u8);
-    s += pitch;
-    vst1_u8(s, d7u8);
-  }
-  return;
-}
-
-void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  int i, pitch8;
-  uint8_t *s;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
-  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
-  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
-  uint8x8x4_t d4Result;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  pitch8 = pitch * 8;
-  for (i = 0; i < 1; i++, src += pitch8) {
-    s = src - (i + 1) * 4;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
-    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
-    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
-    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
-
-    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
-                      vreinterpret_u16_u32(d2tmp2.val[0]));
-    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
-                      vreinterpret_u16_u32(d2tmp3.val[0]));
-    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
-                      vreinterpret_u16_u32(d2tmp2.val[1]));
-    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
-                      vreinterpret_u16_u32(d2tmp3.val[1]));
-
-    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
-                     vreinterpret_u8_u16(d2tmp5.val[0]));
-    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
-                     vreinterpret_u8_u16(d2tmp5.val[1]));
-    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
-                      vreinterpret_u8_u16(d2tmp7.val[0]));
-    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
-                      vreinterpret_u8_u16(d2tmp7.val[1]));
-
-    d3u8 = d2tmp8.val[0];
-    d4u8 = d2tmp8.val[1];
-    d5u8 = d2tmp9.val[0];
-    d6u8 = d2tmp9.val[1];
-    d7u8 = d2tmp10.val[0];
-    d16u8 = d2tmp10.val[1];
-    d17u8 = d2tmp11.val[0];
-    d18u8 = d2tmp11.val[1];
-
-    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
-
-    d4Result.val[0] = d4u8;
-    d4Result.val[1] = d5u8;
-    d4Result.val[2] = d6u8;
-    d4Result.val[3] = d7u8;
-
-    src -= 2;
-    vst4_lane_u8(src, d4Result, 0);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 1);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 2);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 3);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 4);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 5);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 6);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 7);
-  }
-  return;
-}
diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
deleted file mode 100644
index 8641541b0..000000000
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ /dev/null
@@ -1,445 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-
-static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
-                                      uint8x8_t dlimit,    // limit
-                                      uint8x8_t dthresh,   // thresh
-                                      uint8x8_t d3u8,      // p2
-                                      uint8x8_t d4u8,      // p2
-                                      uint8x8_t d5u8,      // p1
-                                      uint8x8_t d6u8,      // p0
-                                      uint8x8_t d7u8,      // q0
-                                      uint8x8_t d16u8,     // q1
-                                      uint8x8_t d17u8,     // q2
-                                      uint8x8_t d18u8,     // q3
-                                      uint8x8_t *d0ru8,    // p1
-                                      uint8x8_t *d1ru8,    // p1
-                                      uint8x8_t *d2ru8,    // p0
-                                      uint8x8_t *d3ru8,    // q0
-                                      uint8x8_t *d4ru8,    // q1
-                                      uint8x8_t *d5ru8) {  // q1
-  uint32_t flat;
-  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
-  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
-  int16x8_t q15s16;
-  uint16x8_t q10u16, q14u16;
-  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
-
-  d19u8 = vabd_u8(d3u8, d4u8);
-  d20u8 = vabd_u8(d4u8, d5u8);
-  d21u8 = vabd_u8(d5u8, d6u8);
-  d22u8 = vabd_u8(d16u8, d7u8);
-  d23u8 = vabd_u8(d17u8, d16u8);
-  d24u8 = vabd_u8(d18u8, d17u8);
-
-  d19u8 = vmax_u8(d19u8, d20u8);
-  d20u8 = vmax_u8(d21u8, d22u8);
-
-  d25u8 = vabd_u8(d6u8, d4u8);
-
-  d23u8 = vmax_u8(d23u8, d24u8);
-
-  d26u8 = vabd_u8(d7u8, d17u8);
-
-  d19u8 = vmax_u8(d19u8, d20u8);
-
-  d24u8 = vabd_u8(d6u8, d7u8);
-  d27u8 = vabd_u8(d3u8, d6u8);
-  d28u8 = vabd_u8(d18u8, d7u8);
-
-  d19u8 = vmax_u8(d19u8, d23u8);
-
-  d23u8 = vabd_u8(d5u8, d16u8);
-  d24u8 = vqadd_u8(d24u8, d24u8);
-
-  d19u8 = vcge_u8(dlimit, d19u8);
-
-  d25u8 = vmax_u8(d25u8, d26u8);
-  d26u8 = vmax_u8(d27u8, d28u8);
-
-  d23u8 = vshr_n_u8(d23u8, 1);
-
-  d25u8 = vmax_u8(d25u8, d26u8);
-
-  d24u8 = vqadd_u8(d24u8, d23u8);
-
-  d20u8 = vmax_u8(d20u8, d25u8);
-
-  d23u8 = vdup_n_u8(1);
-  d24u8 = vcge_u8(dblimit, d24u8);
-
-  d21u8 = vcgt_u8(d21u8, dthresh);
-
-  d20u8 = vcge_u8(d23u8, d20u8);
-
-  d19u8 = vand_u8(d19u8, d24u8);
-
-  d23u8 = vcgt_u8(d22u8, dthresh);
-
-  d20u8 = vand_u8(d20u8, d19u8);
-
-  d22u8 = vdup_n_u8(0x80);
-
-  d23u8 = vorr_u8(d21u8, d23u8);
-
-  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
-
-  d30u8 = vshrn_n_u16(q10u16, 4);
-  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
-
-  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
-    d27u8 = vdup_n_u8(3);
-    d21u8 = vdup_n_u8(2);
-    q14u16 = vaddl_u8(d6u8, d7u8);
-    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
-    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
-    q14u16 = vaddw_u8(q14u16, d5u8);
-    *d0ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vaddw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-    *d1ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-    *d2ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d7u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-    *d3ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vsubw_u8(q14u16, d7u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-    *d4ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vsubw_u8(q14u16, d16u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
-  } else {
-    d21u8 = veor_u8(d7u8, d22u8);
-    d24u8 = veor_u8(d6u8, d22u8);
-    d25u8 = veor_u8(d5u8, d22u8);
-    d26u8 = veor_u8(d16u8, d22u8);
-
-    d27u8 = vdup_n_u8(3);
-
-    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
-    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
-
-    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
-
-    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
-    q15s16 = vaddw_s8(q15s16, d29s8);
-
-    d29u8 = vdup_n_u8(4);
-
-    d28s8 = vqmovn_s16(q15s16);
-
-    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
-    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
-    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
-    d30s8 = vshr_n_s8(d30s8, 3);
-    d29s8 = vshr_n_s8(d29s8, 3);
-
-    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
-    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
-
-    d29s8 = vrshr_n_s8(d29s8, 1);
-    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
-    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
-    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
-
-    if (flat == 0) {  // filter_branch_only
-      *d0ru8 = d4u8;
-      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
-      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
-      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
-      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-      *d5ru8 = d17u8;
-      return;
-    }
-
-    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
-    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
-    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
-    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-
-    d23u8 = vdup_n_u8(2);
-    q14u16 = vaddl_u8(d6u8, d7u8);
-    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
-    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
-
-    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
-
-    q14u16 = vaddw_u8(q14u16, d5u8);
-
-    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
-
-    d30u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vaddw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-
-    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
-
-    d31u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-
-    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
-
-    d23u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d7u8);
-
-    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
-
-    q14u16 = vaddw_u8(q14u16, d18u8);
-
-    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
-
-    d22u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vsubw_u8(q14u16, d7u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-
-    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
-
-    q14u16 = vaddw_u8(q14u16, d18u8);
-
-    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
-
-    d6u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vsubw_u8(q14u16, d16u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-
-    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
-
-    d7u8 = vqrshrn_n_u16(q14u16, 3);
-
-    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
-    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
-    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
-  }
-  return;
-}
-
-void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  uint8_t *s, *psrc;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  uint8x8_t d16u8, d17u8, d18u8;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  psrc = src - (pitch << 2);
-  for (i = 0; i < 1; i++) {
-    s = psrc + i * 8;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
-                       &d5u8);
-
-    s -= (pitch * 6);
-    vst1_u8(s, d0u8);
-    s += pitch;
-    vst1_u8(s, d1u8);
-    s += pitch;
-    vst1_u8(s, d2u8);
-    s += pitch;
-    vst1_u8(s, d3u8);
-    s += pitch;
-    vst1_u8(s, d4u8);
-    s += pitch;
-    vst1_u8(s, d5u8);
-  }
-  return;
-}
-
-void vpx_lpf_horizontal_8_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  uint8_t *s;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  uint8x8_t d16u8, d17u8, d18u8;
-  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
-  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
-  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
-  uint8x8x4_t d4Result;
-  uint8x8x2_t d2Result;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  for (i = 0; i < 1; i++) {
-    s = src + (i * (pitch << 3)) - 4;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
-    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
-    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
-    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
-
-    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
-                      vreinterpret_u16_u32(d2tmp2.val[0]));
-    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
-                      vreinterpret_u16_u32(d2tmp3.val[0]));
-    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
-                      vreinterpret_u16_u32(d2tmp2.val[1]));
-    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
-                      vreinterpret_u16_u32(d2tmp3.val[1]));
-
-    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
-                     vreinterpret_u8_u16(d2tmp5.val[0]));
-    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
-                     vreinterpret_u8_u16(d2tmp5.val[1]));
-    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
-                      vreinterpret_u8_u16(d2tmp7.val[0]));
-    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
-                      vreinterpret_u8_u16(d2tmp7.val[1]));
-
-    d3u8 = d2tmp8.val[0];
-    d4u8 = d2tmp8.val[1];
-    d5u8 = d2tmp9.val[0];
-    d6u8 = d2tmp9.val[1];
-    d7u8 = d2tmp10.val[0];
-    d16u8 = d2tmp10.val[1];
-    d17u8 = d2tmp11.val[0];
-    d18u8 = d2tmp11.val[1];
-
-    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
-                       &d5u8);
-
-    d4Result.val[0] = d0u8;
-    d4Result.val[1] = d1u8;
-    d4Result.val[2] = d2u8;
-    d4Result.val[3] = d3u8;
-
-    d2Result.val[0] = d4u8;
-    d2Result.val[1] = d5u8;
-
-    s = src - 3;
-    vst4_lane_u8(s, d4Result, 0);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 1);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 2);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 3);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 4);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 5);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 6);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 7);
-
-    s = src + 1;
-    vst2_lane_u8(s, d2Result, 0);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 1);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 2);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 3);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 4);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 5);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 6);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 7);
-  }
-  return;
-}
-
-void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
diff --git a/vpx_dsp/arm/loopfilter_mb_neon.c b/vpx_dsp/arm/loopfilter_mb_neon.c
index aa61220d3..f95267472 100644
--- a/vpx_dsp/arm/loopfilter_mb_neon.c
+++ b/vpx_dsp/arm/loopfilter_mb_neon.c
@@ -31,6 +31,15 @@ FUN_LOAD_THRESH(8, _)    // load_thresh_8
 FUN_LOAD_THRESH(16, q_)  // load_thresh_16
 #undef FUN_LOAD_THRESH
 
+static INLINE void load_thresh_8_dual(
+    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
+    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
+    uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
+  *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1));
+  *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1));
+  *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1));
+}
+
 // Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a
 // pixel. When used to control filter branches, we only detect whether it is all
 // 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
@@ -56,33 +65,51 @@ static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
   return calc_flat_status_8(flat_4bit);
 }
 
-#define FUN_FILTER_FLAT_HEV_MASK(w, r)                                        \
-  static INLINE uint8x##w##_t filter_flat_hev_mask_##w(                       \
+#define FUN_FILTER_HEV_MASK4(w, r)                                            \
+  static INLINE uint8x##w##_t filter_hev_mask4_##w(                           \
       const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
       const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
       const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
       const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
-      const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status,     \
-      uint8x##w##_t *hev) {                                                   \
-    uint8x##w##_t t0, t1, mask;                                               \
+      const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) {      \
+    uint8x##w##_t max, t0, t1;                                                \
                                                                               \
-    mask = vabd##r##u8(p1, p0);                                               \
-    mask = vmax##r##u8(mask, vabd##r##u8(q1, q0));                            \
-    *hev = vcgt##r##u8(mask, thresh);                                         \
-    *flat = vmax##r##u8(mask, vabd##r##u8(p2, p0));                           \
-    mask = vmax##r##u8(mask, vabd##r##u8(p3, p2));                            \
-    mask = vmax##r##u8(mask, vabd##r##u8(p2, p1));                            \
-    mask = vmax##r##u8(mask, vabd##r##u8(q2, q1));                            \
-    mask = vmax##r##u8(mask, vabd##r##u8(q3, q2));                            \
+    max = vabd##r##u8(p1, p0);                                                \
+    max = vmax##r##u8(max, vabd##r##u8(q1, q0));                              \
+    *hev = vcgt##r##u8(max, thresh);                                          \
+    *mask = vmax##r##u8(max, vabd##r##u8(p3, p2));                            \
+    *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1));                          \
+    *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1));                          \
+    *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2));                          \
     t0 = vabd##r##u8(p0, q0);                                                 \
     t1 = vabd##r##u8(p1, q1);                                                 \
     t0 = vqadd##r##u8(t0, t0);                                                \
     t1 = vshr##r##n_u8(t1, 1);                                                \
     t0 = vqadd##r##u8(t0, t1);                                                \
-    mask = vcle##r##u8(mask, limit);                                          \
+    *mask = vcle##r##u8(*mask, limit);                                        \
     t0 = vcle##r##u8(t0, blimit);                                             \
-    mask = vand##r##u8(mask, t0);                                             \
+    *mask = vand##r##u8(*mask, t0);                                           \
+                                                                              \
+    return max;                                                               \
+  }
+
+FUN_FILTER_HEV_MASK4(8, _)    // filter_hev_mask4_8
+FUN_FILTER_HEV_MASK4(16, q_)  // filter_hev_mask4_16
+#undef FUN_FILTER_HEV_MASK4
+
+#define FUN_FILTER_FLAT_HEV_MASK(w, r)                                        \
+  static INLINE uint8x##w##_t filter_flat_hev_mask_##w(                       \
+      const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
+      const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
+      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
+      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
+      const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status,     \
+      uint8x##w##_t *hev) {                                                   \
+    uint8x##w##_t max, mask;                                                  \
                                                                               \
+    max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
+                               q2, q3, hev, &mask);                           \
+    *flat = vmax##r##u8(max, vabd##r##u8(p2, p0));                            \
     *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0));                          \
     *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0));                          \
     *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0));                          \
@@ -420,6 +447,33 @@ FUN_FILTER4(8, _)    // filter4_8
 FUN_FILTER4(16, q_)  // filter4_16
 #undef FUN_FILTER4
 
+#define FUN_FILTER8(w)                                                         \
+  static INLINE void filter8_##w(                                              \
+      const uint8x##w##_t mask, const uint8x##w##_t flat,                      \
+      const uint32_t flat_status, const uint8x##w##_t hev,                     \
+      const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1,  \
+      const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1,  \
+      const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2,      \
+      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
+      uint8x##w##_t *oq1, uint8x##w##_t *oq2) {                                \
+    if (flat_status != (uint32_t)-2) {                                         \
+      filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);              \
+      *op2 = p2;                                                               \
+      *oq2 = q2;                                                               \
+      if (flat_status) {                                                       \
+        apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
+                               op0, oq0, oq1, oq2);                            \
+      }                                                                        \
+    } else {                                                                   \
+      calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,     \
+                            oq0, oq1, oq2);                                    \
+    }                                                                          \
+  }
+
+FUN_FILTER8(8)   // filter8_8
+FUN_FILTER8(16)  // filter8_16
+#undef FUN_FILTER8
+
 #define FUN_FILTER16(w)                                                        \
   static INLINE void filter16_##w(                                             \
       const uint8x##w##_t mask, const uint8x##w##_t flat,                      \
@@ -481,6 +535,7 @@ FUN_FILTER16(16)  // filter16_16
     *q3 = vld1##r##u8(s);                                                  \
   }
 
+FUN_LOAD8(8, _)    // load_8x8
 FUN_LOAD8(16, q_)  // load_16x8
 #undef FUN_LOAD8
 
@@ -529,6 +584,71 @@ FUN_LOAD16(8, _)    // load_8x16
 FUN_LOAD16(16, q_)  // load_16x16
 #undef FUN_LOAD16
 
+#define FUN_STORE4(w, r)                                                       \
+  static INLINE void store_##w##x4(                                            \
+      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+      const uint8x##w##_t s2, const uint8x##w##_t s3) {                        \
+    vst1##r##u8(s, s0);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s1);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s2);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s3);                                                        \
+  }
+
+FUN_STORE4(8, _)    // store_8x4
+FUN_STORE4(16, q_)  // store_16x4
+#undef FUN_STORE4
+
+#define FUN_STORE6(w, r)                                                       \
+  static INLINE void store_##w##x6(                                            \
+      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+      const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
+      const uint8x##w##_t s5) {                                                \
+    vst1##r##u8(s, s0);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s1);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s2);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s3);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s4);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s5);                                                        \
+  }
+
+FUN_STORE6(8, _)    // store_8x6
+FUN_STORE6(16, q_)  // store_16x6
+#undef FUN_STORE6
+
+static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
+                             const uint8x8_t p0, const uint8x8_t q0,
+                             const uint8x8_t q1) {
+  uint8x8x4_t o;
+
+  o.val[0] = p1;
+  o.val[1] = p0;
+  o.val[2] = q0;
+  o.val[3] = q1;
+  vst4_lane_u8(s, o, 0);
+  s += p;
+  vst4_lane_u8(s, o, 1);
+  s += p;
+  vst4_lane_u8(s, o, 2);
+  s += p;
+  vst4_lane_u8(s, o, 3);
+  s += p;
+  vst4_lane_u8(s, o, 4);
+  s += p;
+  vst4_lane_u8(s, o, 5);
+  s += p;
+  vst4_lane_u8(s, o, 6);
+  s += p;
+  vst4_lane_u8(s, o, 7);
+}
+
 static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
                              const uint8x8_t s1, const uint8x8_t s2,
                              const uint8x8_t s3, const uint8x8_t s4,
@@ -566,53 +686,64 @@ static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
   vst3_lane_u8(s + 0, o1, 7);
 }
 
-static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
-                             const uint8x8_t p0, const uint8x8_t q0,
-                             const uint8x8_t q1) {
-  uint8x8x4_t o;
+#define FUN_STORE8(w, r)                                                       \
+  static INLINE void store_##w##x8(                                            \
+      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+      const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
+      const uint8x##w##_t s5, const uint8x##w##_t s6,                          \
+      const uint8x##w##_t s7) {                                                \
+    vst1##r##u8(s, s0);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s1);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s2);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s3);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s4);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s5);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s6);                                                        \
+    s += p;                                                                    \
+    vst1##r##u8(s, s7);                                                        \
+  }
 
-  o.val[0] = p1;
-  o.val[1] = p0;
-  o.val[2] = q0;
-  o.val[3] = q1;
-  vst4_lane_u8(s, o, 0);
-  s += p;
-  vst4_lane_u8(s, o, 1);
-  s += p;
-  vst4_lane_u8(s, o, 2);
-  s += p;
-  vst4_lane_u8(s, o, 3);
-  s += p;
-  vst4_lane_u8(s, o, 4);
-  s += p;
-  vst4_lane_u8(s, o, 5);
-  s += p;
-  vst4_lane_u8(s, o, 6);
-  s += p;
-  vst4_lane_u8(s, o, 7);
-}
+FUN_STORE8(8, _)    // store_8x8
+FUN_STORE8(16, q_)  // store_16x8
+#undef FUN_STORE8
 
-static INLINE void store_16x8(uint8_t *s, const int p, const uint8x16_t s0,
-                              const uint8x16_t s1, const uint8x16_t s2,
-                              const uint8x16_t s3, const uint8x16_t s4,
-                              const uint8x16_t s5, const uint8x16_t s6,
-                              const uint8x16_t s7) {
-  vst1q_u8(s, s0);
-  s += p;
-  vst1q_u8(s, s1);
-  s += p;
-  vst1q_u8(s, s2);
-  s += p;
-  vst1q_u8(s, s3);
-  s += p;
-  vst1q_u8(s, s4);
-  s += p;
-  vst1q_u8(s, s5);
-  s += p;
-  vst1q_u8(s, s6);
-  s += p;
-  vst1q_u8(s, s7);
-}
+#define FUN_STORE14(w, r)                                                      \
+  static INLINE void store_##w##x14(                                           \
+      uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
+      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
+      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
+      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
+      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
+      const uint32_t flat_status, const uint32_t flat2_status) {               \
+    if (flat_status) {                                                         \
+      if (flat2_status) {                                                      \
+        vst1##r##u8(s - 7 * p, p6);                                            \
+        vst1##r##u8(s - 6 * p, p5);                                            \
+        vst1##r##u8(s - 5 * p, p4);                                            \
+        vst1##r##u8(s - 4 * p, p3);                                            \
+        vst1##r##u8(s + 3 * p, q3);                                            \
+        vst1##r##u8(s + 4 * p, q4);                                            \
+        vst1##r##u8(s + 5 * p, q5);                                            \
+        vst1##r##u8(s + 6 * p, q6);                                            \
+      }                                                                        \
+      vst1##r##u8(s - 3 * p, p2);                                              \
+      vst1##r##u8(s + 2 * p, q2);                                              \
+    }                                                                          \
+    vst1##r##u8(s - 2 * p, p1);                                                \
+    vst1##r##u8(s - 1 * p, p0);                                                \
+    vst1##r##u8(s + 0 * p, q0);                                                \
+    vst1##r##u8(s + 1 * p, q1);                                                \
+  }
+
+FUN_STORE14(8, _)    // store_8x14
+FUN_STORE14(16, q_)  // store_16x14
+#undef FUN_STORE14
 
 static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
                                const uint8x16_t s1, const uint8x16_t s2,
@@ -656,37 +787,160 @@ static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
   vst1q_u8(s, s15);
 }
 
-#define FUN_STORE14(w, r)                                                      \
-  static INLINE void store_##w##x14(                                           \
-      uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
-      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
-      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
-      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
-      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
-      const uint32_t flat_status, const uint32_t flat2_status) {               \
-    if (flat_status) {                                                         \
-      if (flat2_status) {                                                      \
-        vst1##r##u8(s - 7 * p, p6);                                            \
-        vst1##r##u8(s - 6 * p, p5);                                            \
-        vst1##r##u8(s - 5 * p, p4);                                            \
-        vst1##r##u8(s - 4 * p, p3);                                            \
-        vst1##r##u8(s + 3 * p, q3);                                            \
-        vst1##r##u8(s + 4 * p, q4);                                            \
-        vst1##r##u8(s + 5 * p, q5);                                            \
-        vst1##r##u8(s + 6 * p, q6);                                            \
-      }                                                                        \
-      vst1##r##u8(s - 3 * p, p2);                                              \
-      vst1##r##u8(s + 2 * p, q2);                                              \
-    }                                                                          \
-    vst1##r##u8(s - 2 * p, p1);                                                \
-    vst1##r##u8(s - 1 * p, p0);                                                \
-    vst1##r##u8(s + 0 * p, q0);                                                \
-    vst1##r##u8(s + 1 * p, q1);                                                \
+#define FUN_HOR_4_KERNEL(name, w)                                           \
+  static INLINE void lpf_horizontal_4##name##kernel(                        \
+      uint8_t *s, const int p, const uint8x##w##_t blimit,                  \
+      const uint8x##w##_t limit, const uint8x##w##_t thresh) {              \
+    uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev;                \
+                                                                            \
+    load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);     \
+    filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \
+                         q3, &hev, &mask);                                  \
+    filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);             \
+    store_##w##x4(s - 2 * p, p, p1, p0, q0, q1);                            \
   }
 
-FUN_STORE14(8, _)    // store_8x14
-FUN_STORE14(16, q_)  // store_16x14
-#undef FUN_STORE14
+FUN_HOR_4_KERNEL(_, 8)        // lpf_horizontal_4_kernel
+FUN_HOR_4_KERNEL(_dual_, 16)  // lpf_horizontal_4_dual_kernel
+#undef FUN_HOR_4_KERNEL
+
+void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t blimit_vec, limit_vec, thresh_vec;
+  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+  lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
+}
+
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+  uint8x16_t blimit_vec, limit_vec, thresh_vec;
+  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+                     &blimit_vec, &limit_vec, &thresh_vec);
+  lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
+}
+
+void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      mask, hev;
+  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+                     q2, q3, &hev, &mask);
+  filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
+  store_4x8(s - 2, p, p1, p0, q0, q1);
+}
+
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      mask, hev;
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+      s15;
+
+  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+                     &blimit_vec, &limit_vec, &thresh_vec);
+  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
+            &s11, &s12, &s13, &s14, &s15);
+  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+                      q2, q3, &hev, &mask);
+  filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
+  s -= 2;
+  store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
+            vget_low_u8(q1));
+  store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0),
+            vget_high_u8(q1));
+}
+
+void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+  uint32_t flat_status;
+
+  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+                                p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+            &op1, &op0, &oq0, &oq1, &oq2);
+  store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+  uint32_t flat_status;
+
+  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+                     &blimit_vec, &limit_vec, &thresh_vec);
+  load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+             &op1, &op0, &oq0, &oq1, &oq2);
+  store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+  uint32_t flat_status;
+
+  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+                                p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+            &op1, &op0, &oq0, &oq1, &oq2);
+  // Note: transpose + store_8x8() is faster than store_6x8().
+  transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
+  store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
+}
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+      s15;
+  uint32_t flat_status;
+
+  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+                     &blimit_vec, &limit_vec, &thresh_vec);
+  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
+            &s11, &s12, &s13, &s14, &s15);
+  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+             &op1, &op0, &oq0, &oq1, &oq2);
+  // Note: store_6x8() twice is faster than transpose + store_8x16().
+  store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
+            vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
+  store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
+            vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
+            vget_high_u8(oq2));
+}
 
 #define FUN_LPF_16_KERNEL(name, w)                                             \
   static INLINE void lpf_16##name##kernel(                                     \
@@ -784,7 +1038,9 @@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                         &s6, &s7);
       store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
     } else {
-      store_6x8(s + 8, p, op2, op1, op0, oq0, oq1, oq2);
+      // Note: transpose + store_8x8() is faster than store_6x8().
+      transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
+      store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
     }
   } else {
     store_4x8(s + 6, p, op1, op0, oq0, oq1);
@@ -819,6 +1075,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
       store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                   s13, s14, s15);
     } else {
+      // Note: store_6x8() twice is faster than transpose + store_8x16().
       s += 8;
       store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
                 vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 2318fb44b..3d0b41f93 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -39,28 +39,84 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
   return b0;
 }
 
+// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
+// 'q' registers here to save some instructions.
+static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
+                                    uint8x8_t *a6, uint8x8_t *a7) {
+  // Swap 8 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
+  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
+  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
+  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
+
+  const uint8x16x2_t b0 =
+      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
+  const uint8x16x2_t b1 =
+      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
+  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
+  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
+  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
+
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+
+  // Unzip 32 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c1.val[0]));
+  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c1.val[1]));
+
+  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
 static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                      int16x8_t *a2, int16x8_t *a3,
                                      int16x8_t *a4, int16x8_t *a5,
                                      int16x8_t *a6, int16x8_t *a7) {
   // Swap 16 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07
-  // a1: 08 09 10 11 12 13 14 15
-  // a2: 16 17 18 19 20 21 22 23
-  // a3: 24 25 26 27 28 29 30 31
-  // a4: 32 33 34 35 36 37 38 39
-  // a5: 40 41 42 43 44 45 46 47
-  // a6: 48 49 50 51 52 53 54 55
-  // a7: 56 57 58 59 60 61 62 63
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
   // to:
-  // b0.val[0]: 00 08 02 10 04 12 06 14
-  // b0.val[1]: 01 09 03 11 05 13 07 15
-  // b1.val[0]: 16 24 18 26 20 28 22 30
-  // b1.val[1]: 17 25 19 27 21 29 23 31
-  // b2.val[0]: 32 40 34 42 36 44 38 46
-  // b2.val[1]: 33 41 35 43 37 45 39 47
-  // b3.val[0]: 48 56 50 58 52 60 54 62
-  // b3.val[1]: 49 57 51 59 53 61 55 63
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  // b2.val[0]: 40 50 42 52 44 54 46 56
+  // b2.val[1]: 41 51 43 53 45 55 47 57
+  // b3.val[0]: 60 70 62 72 64 74 66 76
+  // b3.val[1]: 61 71 63 73 65 75 67 77
 
   const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
   const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
@@ -68,14 +124,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
   const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
 
   // Swap 32 bit elements resulting in:
-  // c0.val[0]: 00 08 16 24 04 12 20 28
-  // c0.val[1]: 02 10 18 26 06 14 22 30
-  // c1.val[0]: 01 09 17 25 05 13 21 29
-  // c1.val[1]: 03 11 19 27 07 15 23 31
-  // c2.val[0]: 32 40 48 56 36 44 52 60
-  // c2.val[1]: 34 42 50 58 38 46 54 62
-  // c3.val[0]: 33 41 49 57 37 45 53 61
-  // c3.val[1]: 35 43 51 59 39 47 55 63
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  // c2.val[0]: 40 50 60 70 44 54 64 74
+  // c2.val[1]: 42 52 62 72 46 56 66 76
+  // c3.val[0]: 41 51 61 71 45 55 65 75
+  // c3.val[1]: 43 53 63 73 47 57 67 77
 
   const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                    vreinterpretq_s32_s16(b1.val[0]));
@@ -87,14 +143,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                    vreinterpretq_s32_s16(b3.val[1]));
 
   // Swap 64 bit elements resulting in:
-  // d0.val[0]: 00 08 16 24 32 40 48 56
-  // d0.val[1]: 04 12 20 28 36 44 52 60
-  // d1.val[0]: 01 09 17 25 33 41 49 57
-  // d1.val[1]: 05 13 21 29 37 45 53 61
-  // d2.val[0]: 02 10 18 26 34 42 50 58
-  // d2.val[1]: 06 14 22 30 38 46 54 62
-  // d3.val[0]: 03 11 19 27 35 43 51 59
-  // d3.val[1]: 07 15 23 31 39 47 55 63
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 04 14 24 34 44 54 64 74
+  // d1.val[0]: 01 11 21 31 41 51 61 71
+  // d1.val[1]: 05 15 25 35 45 55 65 75
+  // d2.val[0]: 02 12 22 32 42 52 62 72
+  // d2.val[1]: 06 16 26 36 46 56 66 76
+  // d3.val[0]: 03 13 23 33 43 53 63 73
+  // d3.val[1]: 07 17 27 37 47 57 67 77
   const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]);
   const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]);
   const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]);
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c
index f469afc4e..b6d7f86a4 100644
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -75,7 +75,7 @@ unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
                                   unsigned int *sse) {
   int sum;
   variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 6);  //  >> 6 = / 8 * 8
+  return *sse - ((sum * sum) >> 6);
 }
 
 unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
@@ -83,7 +83,7 @@ unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
                                     unsigned int *sse) {
   int sum;
   variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 8);  //  >> 8 = / 16 * 16
+  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
 }
 
 unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
@@ -91,7 +91,7 @@ unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
                                     unsigned int *sse) {
   int sum;
   variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / 32 * 32
+  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
 }
 
 unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
@@ -104,7 +104,7 @@ unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
                    32, 32, &sse2, &sum2);
   *sse = sse1 + sse2;
   sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
+  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
 }
 
 unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
@@ -117,7 +117,7 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
                    64, 16, &sse2, &sum2);
   *sse = sse1 + sse2;
   sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
+  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
 }
 
 unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
@@ -141,7 +141,7 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
                    b_stride, 64, 16, &sse2, &sum2);
   *sse = sse1 + sse2;
   sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
+  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
 }
 
 unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index def9c8e1b..a78041ce7 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -136,8 +136,8 @@ DSP_SRCS-yes += loopfilter.c
 DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
 DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
 
-DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
 ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/loopfilter_neon.c
 DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
 DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
 DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
@@ -145,9 +145,6 @@ DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
 else
 ifeq ($(HAVE_NEON),yes)
 DSP_SRCS-yes   += arm/loopfilter_mb_neon.c
-DSP_SRCS-yes   += arm/loopfilter_16_neon.c
-DSP_SRCS-yes   += arm/loopfilter_8_neon.c
-DSP_SRCS-yes   += arm/loopfilter_4_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 330ae8d6a..cb56ad078 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -3066,17 +3066,7 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
   in[6] = load_input_data(input + 192);
   in[7] = load_input_data(input + 224);
 
-  for (i = 8; i < 32; ++i) {
-    in[i] = _mm_setzero_si128();
-  }
-
   array_transpose_8x8(in, in);
-  // TODO(hkuang): Following transposes are unnecessary. But remove them will
-  // lead to performance drop on some devices.
-  array_transpose_8x8(in + 8, in + 8);
-  array_transpose_8x8(in + 16, in + 16);
-  array_transpose_8x8(in + 24, in + 24);
-
   IDCT32_34
 
   // 1_D: Store 32 intermediate results for each 8x32 block.
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index 7bc2693cf..8428e0520 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -61,7 +61,7 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
   int sum;
   variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
                 vpx_get32x32var_avx2, 32);
-  return *sse - (((int64_t)sum * sum) >> 9);
+  return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
 }
 
 unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
@@ -70,7 +70,7 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
   int sum;
   variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
                 vpx_get32x32var_avx2, 32);
-  return *sse - (((int64_t)sum * sum) >> 10);
+  return *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
 }
 
 unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
@@ -79,7 +79,7 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
   int sum;
   variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
                 vpx_get32x32var_avx2, 32);
-  return *sse - (((int64_t)sum * sum) >> 12);
+  return *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
 }
 
 unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
@@ -88,7 +88,7 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
   int sum;
   variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
                 vpx_get32x32var_avx2, 32);
-  return *sse - (((int64_t)sum * sum) >> 11);
+  return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
 }
 
 unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
@@ -115,7 +115,7 @@ unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                       dst + 32, dst_stride, 64, &sse2);
   const int se = se1 + se2;
   *sse = sse1 + sse2;
-  return *sse - (((int64_t)se * se) >> 12);
+  return *sse - (uint32_t)(((int64_t)se * se) >> 12);
 }
 
 unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
@@ -125,7 +125,7 @@ unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                               unsigned int *sse) {
   const int se = vpx_sub_pixel_variance32xh_avx2(
       src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
-  return *sse - (((int64_t)se * se) >> 10);
+  return *sse - (uint32_t)(((int64_t)se * se) >> 10);
 }
 
 unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
@@ -142,7 +142,7 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
 
   *sse = sse1 + sse2;
 
-  return *sse - (((int64_t)se * se) >> 12);
+  return *sse - (uint32_t)(((int64_t)se * se) >> 12);
 }
 
 unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
@@ -151,5 +151,5 @@ unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
   // Process 32 elements in parallel.
   const int se = vpx_sub_pixel_avg_variance32xh_avx2(
       src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
-  return *sse - (((int64_t)se * se) >> 10);
+  return *sse - (uint32_t)(((int64_t)se * se) >> 10);
 }
diff --git a/vpxdec.c b/vpxdec.c
index c1ff5a3f8..ab638ec6b 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -125,30 +125,11 @@ static const arg_def_t deblock =
     ARG_DEF(NULL, "deblock", 0, "Enable VP8 deblocking");
 static const arg_def_t demacroblock_level = ARG_DEF(
     NULL, "demacroblock-level", 1, "Enable VP8 demacroblocking, w/ level");
-static const arg_def_t pp_debug_info =
-    ARG_DEF(NULL, "pp-debug-info", 1, "Enable VP8 visible debug info");
-static const arg_def_t pp_disp_ref_frame =
-    ARG_DEF(NULL, "pp-dbg-ref-frame", 1,
-            "Display only selected reference frame per macro block");
-static const arg_def_t pp_disp_mb_modes = ARG_DEF(
-    NULL, "pp-dbg-mb-modes", 1, "Display only selected macro block modes");
-static const arg_def_t pp_disp_b_modes =
-    ARG_DEF(NULL, "pp-dbg-b-modes", 1, "Display only selected block modes");
-static const arg_def_t pp_disp_mvs =
-    ARG_DEF(NULL, "pp-dbg-mvs", 1, "Draw only selected motion vectors");
 static const arg_def_t mfqe =
     ARG_DEF(NULL, "mfqe", 0, "Enable multiframe quality enhancement");
 
-static const arg_def_t *vp8_pp_args[] = { &addnoise_level,
-                                          &deblock,
-                                          &demacroblock_level,
-                                          &pp_debug_info,
-                                          &pp_disp_ref_frame,
-                                          &pp_disp_mb_modes,
-                                          &pp_disp_b_modes,
-                                          &pp_disp_mvs,
-                                          &mfqe,
-                                          NULL };
+static const arg_def_t *vp8_pp_args[] = { &addnoise_level, &deblock,
+                                          &demacroblock_level, &mfqe, NULL };
 #endif
 
 #if CONFIG_LIBYUV
@@ -539,10 +520,6 @@ static int main_loop(int argc, const char **argv_) {
 #endif
 #if CONFIG_VP8_DECODER
   vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 };
-  int vp8_dbg_color_ref_frame = 0;
-  int vp8_dbg_color_mb_modes = 0;
-  int vp8_dbg_color_b_modes = 0;
-  int vp8_dbg_display_mv = 0;
 #endif
   int frames_corrupted = 0;
   int dec_flags = 0;
@@ -647,37 +624,6 @@ static int main_loop(int argc, const char **argv_) {
     } else if (arg_match(&arg, &mfqe, argi)) {
       postproc = 1;
       vp8_pp_cfg.post_proc_flag |= VP8_MFQE;
-    } else if (arg_match(&arg, &pp_debug_info, argi)) {
-      unsigned int level = arg_parse_uint(&arg);
-
-      postproc = 1;
-      vp8_pp_cfg.post_proc_flag &= ~0x7;
-
-      if (level) vp8_pp_cfg.post_proc_flag |= level;
-    } else if (arg_match(&arg, &pp_disp_ref_frame, argi)) {
-      unsigned int flags = arg_parse_int(&arg);
-      if (flags) {
-        postproc = 1;
-        vp8_dbg_color_ref_frame = flags;
-      }
-    } else if (arg_match(&arg, &pp_disp_mb_modes, argi)) {
-      unsigned int flags = arg_parse_int(&arg);
-      if (flags) {
-        postproc = 1;
-        vp8_dbg_color_mb_modes = flags;
-      }
-    } else if (arg_match(&arg, &pp_disp_b_modes, argi)) {
-      unsigned int flags = arg_parse_int(&arg);
-      if (flags) {
-        postproc = 1;
-        vp8_dbg_color_b_modes = flags;
-      }
-    } else if (arg_match(&arg, &pp_disp_mvs, argi)) {
-      unsigned int flags = arg_parse_int(&arg);
-      if (flags) {
-        postproc = 1;
-        vp8_dbg_display_mv = flags;
-      }
     } else if (arg_match(&arg, &error_concealment, argi)) {
       ec_enabled = 1;
     }
@@ -789,37 +735,6 @@ static int main_loop(int argc, const char **argv_) {
             vpx_codec_error(&decoder));
     return EXIT_FAILURE;
   }
-
-  if (vp8_dbg_color_ref_frame &&
-      vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME,
-                        vp8_dbg_color_ref_frame)) {
-    fprintf(stderr, "Failed to configure reference block visualizer: %s\n",
-            vpx_codec_error(&decoder));
-    return EXIT_FAILURE;
-  }
-
-  if (vp8_dbg_color_mb_modes &&
-      vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES,
-                        vp8_dbg_color_mb_modes)) {
-    fprintf(stderr, "Failed to configure macro block visualizer: %s\n",
-            vpx_codec_error(&decoder));
-    return EXIT_FAILURE;
-  }
-
-  if (vp8_dbg_color_b_modes &&
-      vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES,
-                        vp8_dbg_color_b_modes)) {
-    fprintf(stderr, "Failed to configure block visualizer: %s\n",
-            vpx_codec_error(&decoder));
-    return EXIT_FAILURE;
-  }
-
-  if (vp8_dbg_display_mv &&
-      vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv)) {
-    fprintf(stderr, "Failed to configure motion vector visualizer: %s\n",
-            vpx_codec_error(&decoder));
-    return EXIT_FAILURE;
-  }
 #endif
 
   if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);