72 files changed, 3006 insertions, 1816 deletions
diff --git a/configure b/configure
index 3ac00ff31..f5dd7fc13 100755
--- a/configure
+++ b/configure
@@ -36,6 +36,7 @@ Advanced options:
   ${toggle_codec_srcs}            in/exclude codec library source code
   ${toggle_debug_libs}            in/exclude debug version of libraries
   ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
+  ${toggle_vp9_highbitdepth}      use VP9 high bit depth (10/12) profiles
   ${toggle_vp8}                   VP8 codec support
   ${toggle_vp9}                   VP9 codec support
   ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
@@ -57,6 +58,7 @@ Advanced options:
   ${toggle_postproc_visualizer}   macro block / block level visualizers
   ${toggle_multi_res_encoding}    enable multiple-resolution encoding
   ${toggle_temporal_denoising}    enable temporal denoising and disable the spatial denoiser
+  ${toggle_vp9_highbitdepth}      enable 10/12 bit support in VP9
   ${toggle_vp9_temporal_denoising}
                                   enable vp9 temporal denoising
   ${toggle_webm_io}               enable input from and output to WebM container
diff --git a/libs.doxy_template b/libs.doxy_template
index 02e290242..5a8f84728 100644
--- a/libs.doxy_template
+++ b/libs.doxy_template
@@ -36,7 +36,7 @@ DOXYFILE_ENCODING      = UTF-8
 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded
 # by quotes) that should identify the project.
 
-PROJECT_NAME           = "WebM VP8 Codec SDK"
+PROJECT_NAME           = "WebM Codec SDK"
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
@@ -415,12 +415,6 @@ MAX_INITIALIZER_LINES  = 30
 
 SHOW_USED_FILES        = YES
 
-# If the sources in your project are distributed over multiple directories
-# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
-# in the documentation. The default is NO.
-
-SHOW_DIRECTORIES       = NO
-
 # The FILE_VERSION_FILTER tag can be used to specify a program or script that
 # doxygen should invoke to get the current version for each file (typically from the
 # version control system). Doxygen will invoke the program by executing (via
@@ -715,12 +709,6 @@ HTML_FOOTER            =
 
 HTML_STYLESHEET        =
 
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
-# files or namespaces will be aligned in HTML using tables. If set to
-# NO a bullet list will be used.
-
-HTML_ALIGN_MEMBERS     = YES
-
 # If the GENERATE_HTMLHELP tag is set to YES, additional index files
 # will be generated that can be used as input for tools like the
 # Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
diff --git a/mainpage.dox b/mainpage.dox
index e2ec28002..ec202fa4f 100644
--- a/mainpage.dox
+++ b/mainpage.dox
@@ -1,4 +1,4 @@
-/*!\mainpage WebM VP8 Codec SDK
+/*!\mainpage WebM Codec SDK
 
   \section main_contents Page Contents
   - \ref main_intro
@@ -6,11 +6,11 @@
   - \ref main_support
 
   \section main_intro Introduction
-  Welcome to the WebM VP8 Codec SDK. This SDK allows you to integrate your
-  applications with the VP8 video codec, a high quality, royalty free, open
-  source codec deployed on millions of computers and devices worldwide.
+  Welcome to the WebM Codec SDK. This SDK allows you to integrate your
+  applications with the VP8 and VP9 video codecs, high quality, royalty free,
+  open source codecs deployed on billions of computers and devices worldwide.
 
-  This distribution of the WebM VP8 Codec SDK includes the following support:
+  This distribution of the WebM Codec SDK includes the following support:
 
   \if vp8_encoder
   - \ref vp8_encoder
@@ -28,12 +28,12 @@
   - Read the \ref samples "sample code" for examples of how to interact with the
     codec.
   - \ref codec reference
-    \if encoder
-    - \ref encoder reference
-    \endif
-    \if decoder
-    - \ref decoder reference
-    \endif
+  \if encoder
+  - \ref encoder reference
+  \endif
+  \if decoder
+  - \ref decoder reference
+  \endif
 
   \section main_support Support Options & FAQ
   The WebM project is an open source project supported by its community. For
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index a76d806bf..e52934771 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -38,6 +38,8 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
     first_drop_ = 0;
     bits_total_ = 0;
     duration_ = 0.0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
   }
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
@@ -45,6 +47,17 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
     if (video->frame() == 1) {
       encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
     }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    }
+
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
     duration_ = 0;
@@ -124,6 +137,8 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
   double effective_datarate_;
   size_t bits_in_last_frame_;
   int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
 };
 
 #if CONFIG_TEMPORAL_DENOISING
@@ -155,6 +170,29 @@ TEST_P(DatarateTestLarge, DenoiserLevels) {
         << " The datarate for the file missed the target!";
   }
 }
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestLarge, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+      << " The datarate for the file missed the target!";
+}
 #endif  // CONFIG_TEMPORAL_DENOISING
 
 TEST_P(DatarateTestLarge, BasicBufferModel) {
@@ -246,6 +284,8 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
     for (int i = 0; i < 3; ++i) {
       bits_total_[i] = 0;
     }
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
   }
 
   //
@@ -313,10 +353,20 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 1)
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
-      encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
     }
+
+    encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
     if (cfg_.ts_number_layers > 1) {
       if (video->frame() == 1) {
         encoder->Control(VP9E_SET_SVC, 1);
@@ -398,6 +448,8 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
   vpx_codec_pts_t first_drop_;
   int num_drops_;
   int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
 };
 
 // Check basic rate targeting,
@@ -652,6 +704,38 @@ TEST_P(DatarateTestVP9Large, DenoiserLevels) {
   ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
       << " The datarate for the file is greater than target by too much!";
 }
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestVP9Large, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
 #endif  // CONFIG_VP9_TEMPORAL_DENOISING
 
 VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 33399e95d..c24d51701 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -176,7 +176,8 @@ class VP9NewEncodeDecodePerfTest :
 
     // Write frame header and data.
     ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz);
-    ASSERT_GT(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_), 0);
+    ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_),
+              pkt->data.frame.sz);
   }
 
   virtual bool DoDecode() { return false; }
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
index 769e14ae2..7e9f0d6c4 100644
--- a/test/encode_perf_test.cc
+++ b/test/encode_perf_test.cc
@@ -189,7 +189,7 @@ TEST_P(VP9EncodePerfTest, PerfTest) {
         printf("\t\"totalFrames\" : %u,\n", frames);
         printf("\t\"framesPerSecond\" : %f,\n", fps);
         printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
-        printf("\t\"speed\" : %d\n", kEncodePerfTestSpeeds[j]);
+        printf("\t\"speed\" : %d,\n", kEncodePerfTestSpeeds[j]);
         printf("\t\"threads\" : %d\n", kEncodePerfTestThreads[k]);
         printf("}\n");
       }
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 7a133643d..b03235b71 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -64,8 +64,7 @@ void Encoder::EncodeFrameInternal(const VideoSource &video,
 
   // Encode the frame
   API_REGISTER_STATE_CHECK(
-      res = vpx_codec_encode(&encoder_,
-                             video.img(), video.pts(), video.duration(),
+      res = vpx_codec_encode(&encoder_, img, video.pts(), video.duration(),
                              frame_flags, deadline_));
   ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
 }
@@ -115,6 +114,7 @@ void EncoderTest::SetMode(TestMode mode) {
 static bool compare_img(const vpx_image_t *img1,
                         const vpx_image_t *img2) {
   bool match = (img1->fmt == img2->fmt) &&
+               (img1->cs == img2->cs) &&
                (img1->d_w == img2->d_w) &&
                (img1->d_h == img2->d_h);
 
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 182547bdf..9a99a80f8 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -37,6 +37,7 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
   void Reset() {
     error_nframes_ = 0;
     droppable_nframes_ = 0;
+    pattern_switch_ = 0;
   }
 
   virtual void SetUp() {
@@ -62,19 +63,37 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
   //   1     3
   // 0    2     .....
   // LAST is updated on base/layer 0, GOLDEN  updated on layer 1.
-  int SetFrameFlags(int frame_num, int num_temp_layers) {
+  // Non-zero pattern_switch parameter means pattern will switch to
+  // not using LAST for frame_num >= pattern_switch.
+  int SetFrameFlags(int frame_num,
+                    int num_temp_layers,
+                    int pattern_switch) {
     int frame_flags = 0;
     if (num_temp_layers == 2) {
-      if (frame_num % 2 == 0) {
-        // Layer 0: predict from L and ARF, update L.
-        frame_flags = VP8_EFLAG_NO_REF_GF |
-                      VP8_EFLAG_NO_UPD_GF |
-                      VP8_EFLAG_NO_UPD_ARF;
-      } else {
-        // Layer 1: predict from L, GF, and ARF, and update GF.
-        frame_flags = VP8_EFLAG_NO_UPD_ARF |
-                      VP8_EFLAG_NO_UPD_LAST;
-      }
+        if (frame_num % 2 == 0) {
+          if (frame_num < pattern_switch || pattern_switch == 0) {
+            // Layer 0: predict from LAST and ARF, update LAST.
+            frame_flags = VP8_EFLAG_NO_REF_GF |
+                          VP8_EFLAG_NO_UPD_GF |
+                          VP8_EFLAG_NO_UPD_ARF;
+          } else {
+            // Layer 0: predict from GF and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_REF_LAST |
+                          VP8_EFLAG_NO_UPD_LAST |
+                          VP8_EFLAG_NO_UPD_ARF;
+          }
+        } else {
+          if (frame_num < pattern_switch || pattern_switch == 0) {
+            // Layer 1: predict from L, GF, and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_UPD_ARF |
+                          VP8_EFLAG_NO_UPD_LAST;
+          } else {
+            // Layer 1: predict from GF and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_REF_LAST |
+                          VP8_EFLAG_NO_UPD_LAST |
+                          VP8_EFLAG_NO_UPD_ARF;
+          }
+        }
     }
     return frame_flags;
   }
@@ -86,7 +105,9 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
                       VP8_EFLAG_NO_UPD_ARF);
     // For temporal layer case.
     if (cfg_.ts_number_layers > 1) {
-      frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      frame_flags_ = SetFrameFlags(video->frame(),
+                                   cfg_.ts_number_layers,
+                                   pattern_switch_);
       for (unsigned int i = 0; i < droppable_nframes_; ++i) {
         if (droppable_frames_[i] == video->frame()) {
           std::cout << "Encoding droppable frame: "
@@ -168,11 +189,16 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
     return mismatch_nframes_;
   }
 
+  void SetPatternSwitch(int frame_switch) {
+     pattern_switch_ = frame_switch;
+   }
+
  private:
   double psnr_;
   unsigned int nframes_;
   unsigned int error_nframes_;
   unsigned int droppable_nframes_;
+  unsigned int pattern_switch_;
   double mismatch_psnr_;
   unsigned int mismatch_nframes_;
   unsigned int error_frames_[kMaxErrorFrames];
@@ -299,6 +325,7 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
   // Error resilient mode ON.
   cfg_.g_error_resilient = 1;
   cfg_.kf_mode = VPX_KF_DISABLED;
+  SetPatternSwitch(0);
 
   // The odd frames are the enhancement layer for 2 layer pattern, so set
   // those frames as droppable. Drop the last 7 frames.
@@ -316,6 +343,45 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
   Reset();
 }
 
+// Check for successful decoding and no encoder/decoder mismatch
+// for a two layer temporal pattern, where at some point in the
+// sequence, the LAST ref is not used anymore.
+TEST_P(ErrorResilienceTestLarge, 2LayersNoRefLast) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = 0;
+
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 Temporal layers, no spatial layers, CBR mode.
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+  cfg_.ts_periodicity = 2;
+  cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 100);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  SetPatternSwitch(60);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Mismatch frames: "
+            << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+
+  // Reset previously set of error/droppable frames.
+  Reset();
+}
+
 class ErrorResilienceTestLargeCodecControls : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index cdc0a9895..c836facb3 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -107,6 +107,36 @@ void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
+#if HAVE_NEON_ASM
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else
+void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count) {
+  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count) {
+  vp9_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count) {
+  vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON_ASM
+
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
  public:
   virtual ~Loop8Test6Param() {}
@@ -594,33 +624,45 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
-#if HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH)
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else
 INSTANTIATE_TEST_CASE_P(
     NEON, Loop8Test6Param,
     ::testing::Values(
 #if HAVE_NEON_ASM
+// Using #if inside the macro is unsupported on MSVS but the tests are not
+// currently built for MSVS with ARM and NEON.
         make_tuple(&vp9_lpf_horizontal_16_neon,
                    &vp9_lpf_horizontal_16_c, 8),
+        make_tuple(&wrapper_vertical_16_neon,
+                   &wrapper_vertical_16_c, 8),
+        make_tuple(&wrapper_vertical_16_dual_neon,
+                   &wrapper_vertical_16_dual_c, 8),
+        make_tuple(&vp9_lpf_horizontal_8_neon,
+                   &vp9_lpf_horizontal_8_c, 8),
+        make_tuple(&vp9_lpf_vertical_8_neon,
+                   &vp9_lpf_vertical_8_c, 8),
 #endif  // HAVE_NEON_ASM
         make_tuple(&vp9_lpf_horizontal_4_neon,
                    &vp9_lpf_horizontal_4_c, 8),
-        make_tuple(&vp9_lpf_horizontal_8_neon,
-                   &vp9_lpf_horizontal_8_c, 8),
         make_tuple(&vp9_lpf_vertical_4_neon,
-                   &vp9_lpf_vertical_4_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_neon,
-                   &vp9_lpf_vertical_8_c, 8)));
+                   &vp9_lpf_vertical_4_c, 8)));
 INSTANTIATE_TEST_CASE_P(
     NEON, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_4_dual_neon,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
+#if HAVE_NEON_ASM
         make_tuple(&vp9_lpf_horizontal_8_dual_neon,
                    &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_neon,
-                   &vp9_lpf_vertical_4_dual_c, 8),
         make_tuple(&vp9_lpf_vertical_8_dual_neon,
-                   &vp9_lpf_vertical_8_dual_c, 8)));
-#endif  // HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH)
+                   &vp9_lpf_vertical_8_dual_c, 8),
+#endif  // HAVE_NEON_ASM
+        make_tuple(&vp9_lpf_horizontal_4_dual_neon,
+                   &vp9_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vp9_lpf_vertical_4_dual_neon,
+                   &vp9_lpf_vertical_4_dual_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON
 
 }  // namespace
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 4d279f686..a8dd7de13 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1914,11 +1914,17 @@ INSTANTIATE_TEST_CASE_P(
 const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon;
 const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
 const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon;
+const vp9_variance_fn_t variance32x64_neon = vp9_variance32x64_neon;
+const vp9_variance_fn_t variance64x32_neon = vp9_variance64x32_neon;
+const vp9_variance_fn_t variance64x64_neon = vp9_variance64x64_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP9VarianceTest,
     ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
                       make_tuple(4, 4, variance16x16_neon, 0),
-                      make_tuple(5, 5, variance32x32_neon, 0)));
+                      make_tuple(5, 5, variance32x32_neon, 0),
+                      make_tuple(5, 6, variance32x64_neon, 0),
+                      make_tuple(6, 5, variance64x32_neon, 0),
+                      make_tuple(6, 6, variance64x64_neon, 0)));
 
 const vp9_subpixvariance_fn_t subpel_variance8x8_neon =
     vp9_sub_pixel_variance8x8_neon;
@@ -1926,11 +1932,14 @@ const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
     vp9_sub_pixel_variance16x16_neon;
 const vp9_subpixvariance_fn_t subpel_variance32x32_neon =
     vp9_sub_pixel_variance32x32_neon;
+const vp9_subpixvariance_fn_t subpel_variance64x64_neon =
+    vp9_sub_pixel_variance64x64_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP9SubpelVarianceTest,
     ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0),
                       make_tuple(4, 4, subpel_variance16x16_neon, 0),
-                      make_tuple(5, 5, subpel_variance32x32_neon, 0)));
+                      make_tuple(5, 5, subpel_variance32x32_neon, 0),
+                      make_tuple(6, 6, subpel_variance64x64_neon, 0)));
 #endif  // HAVE_NEON
 #endif  // CONFIG_VP9_ENCODER
 
diff --git a/test/video_source.h b/test/video_source.h
index 84bfa8e53..b97e1550e 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -175,8 +175,8 @@ class DummyVideoSource : public VideoSource {
   void SetSize(unsigned int width, unsigned int height) {
     if (width != width_ || height != height_) {
       vpx_img_free(img_);
-      raw_sz_ = ((width + 31)&~31) * height * 3 / 2;
       img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 32);
+      raw_sz_ = ((img_->w + 31) & ~31) * img_->h * 3 / 2;
       width_ = width;
       height_ = height;
     }
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index fa04528a2..252ed4efa 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -165,4 +165,14 @@ INSTANTIATE_TEST_CASE_P(
 
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
+        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
+
+#endif
+
 }  // namespace
diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc
index 6c354fd38..34e7854a9 100644
--- a/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/test/vp9_encoder_parms_get_to_decoder.cc
@@ -65,14 +65,15 @@ struct EncodeParameters {
   int32_t lossless;
   int32_t error_resilient;
   int32_t frame_parallel;
+  vpx_color_space_t cs;
   // TODO(JBB): quantizers / bitrate
 };
 
 const EncodeParameters kVP9EncodeParameterSet[] = {
-    {0, 0, 0, 1, 0},
-    {0, 0, 0, 0, 0},
-    {0, 0, 1, 0, 0},
-    {0, 2, 0, 0, 1},
+    {0, 0, 0, 1, 0, VPX_CS_BT_601},
+    {0, 0, 0, 0, 0, VPX_CS_BT_709},
+    {0, 0, 1, 0, 0, VPX_CS_BT_2020},
+    {0, 2, 0, 0, 1, VPX_CS_UNKNOWN},
     // TODO(JBB): Test profiles (requires more work).
 };
 
@@ -109,6 +110,7 @@ class Vp9EncoderParmsGetToDecoder
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
+      encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
       encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
                        encode_parms.frame_parallel);
@@ -147,7 +149,7 @@ class Vp9EncoderParmsGetToDecoder
       EXPECT_EQ(common->frame_parallel_decoding_mode,
                 encode_parms.frame_parallel);
     }
-
+    EXPECT_EQ(common->color_space, encode_parms.cs);
     EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols);
     EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows);
 
diff --git a/usage.dox b/usage.dox
index 237b8dc42..6bfca3bc8 100644
--- a/usage.dox
+++ b/usage.dox
@@ -80,8 +80,11 @@
 
 
     The available initialization methods are:
-    \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif
-    \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif
+    \if encoder
+     - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver())
+     - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver())
+     .
+    \endif
     \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif
 
 
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index fc026aa9c..d02cd30b9 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -753,45 +753,46 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     int ref_frame_map[4];
     int sign_bias = 0;
     int dot_artifact_candidate = 0;
-    // For detecting dot artifact.
-    unsigned char* target = x->src.y_buffer;
-    unsigned char* target_u = x->block[16].src + *x->block[16].base_src;
-    unsigned char* target_v = x->block[20].src + *x->block[20].base_src;
-    int stride = x->src.y_stride;
-    int stride_uv = x->block[16].src_stride;
+    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
+
+    // If the current frame is using LAST as a reference, check for
+    // biasing the mode selection for dot artifacts.
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+      unsigned char* target_y = x->src.y_buffer;
+      unsigned char* target_u = x->block[16].src + *x->block[16].base_src;
+      unsigned char* target_v = x->block[20].src + *x->block[20].base_src;
+      int stride = x->src.y_stride;
+      int stride_uv = x->block[16].src_stride;
 #if CONFIG_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity) {
-      int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
-      target =
-          cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset;
-      stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride;
-      if (uv_denoise) {
-        target_u =
-            cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer + recon_uvoffset;
-        target_v =
-            cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer + recon_uvoffset;
-        stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride;
+      if (cpi->oxcf.noise_sensitivity) {
+        const int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
+        target_y =
+            cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset;
+        stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride;
+        if (uv_denoise) {
+          target_u =
+              cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer +
+                  recon_uvoffset;
+          target_v =
+              cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer +
+                  recon_uvoffset;
+          stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride;
+        }
       }
-    }
 #endif
-
-    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
-
-    dot_artifact_candidate =
-        check_dot_artifact_candidate(cpi, x,
-            target, stride,
-            plane[LAST_FRAME][0], mb_row, mb_col, 0);
-    // If not found in Y channel, check UV channel.
-    if (!dot_artifact_candidate) {
       dot_artifact_candidate =
-          check_dot_artifact_candidate(cpi, x,
-              target_u, stride_uv,
-              plane[LAST_FRAME][1], mb_row, mb_col, 1);
+          check_dot_artifact_candidate(cpi, x, target_y, stride,
+              plane[LAST_FRAME][0], mb_row, mb_col, 0);
+      // If not found in Y channel, check UV channel.
       if (!dot_artifact_candidate) {
         dot_artifact_candidate =
-            check_dot_artifact_candidate(cpi, x,
-                target_v, stride_uv,
-                plane[LAST_FRAME][2], mb_row, mb_col, 2);
+            check_dot_artifact_candidate(cpi, x, target_u, stride_uv,
+                plane[LAST_FRAME][1], mb_row, mb_col, 1);
+        if (!dot_artifact_candidate) {
+          dot_artifact_candidate =
+              check_dot_artifact_candidate(cpi, x, target_v, stride_uv,
+                  plane[LAST_FRAME][2], mb_row, mb_col, 2);
+        }
       }
     }
 
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 6810644ba..67a0fef64 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -188,7 +188,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
 
             /* vet via sync code */
             if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a)
-                res = VPX_CODEC_UNSUP_BITSTREAM;
+                return VPX_CODEC_UNSUP_BITSTREAM;
 
             si->w = (clear[6] | (clear[7] << 8)) & 0x3fff;
             si->h = (clear[8] | (clear[9] << 8)) & 0x3fff;
@@ -402,7 +402,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
     if (!res)
     {
         VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
-        if(resolution_change)
+        if (resolution_change)
         {
             VP8_COMMON *const pc = & pbi->common;
             MACROBLOCKD *const xd  = & pbi->mb;
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 09f470e97..c69ee1009 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -124,7 +124,6 @@ static INLINE void vp9_loop_filter_neon_16(
     return;
 }
 
-#if !HAVE_NEON_ASM
 void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
@@ -178,47 +177,3 @@ void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
     vst1q_u8(s, q8u8);
     return;
 }
-#endif  // !HAVE_NEON_ASM
-
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
-                                    const uint8_t *blimit0,
-                                    const uint8_t *limit0,
-                                    const uint8_t *thresh0,
-                                    const uint8_t *blimit1,
-                                    const uint8_t *limit1,
-                                    const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
-}
-
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
-                                  const uint8_t *blimit0,
-                                  const uint8_t *limit0,
-                                  const uint8_t *thresh0,
-                                  const uint8_t *blimit1,
-                                  const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
-
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
-                                  const uint8_t *blimit0,
-                                  const uint8_t *limit0,
-                                  const uint8_t *thresh0,
-                                  const uint8_t *blimit1,
-                                  const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
-
-#if HAVE_NEON_ASM
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
-                                   const uint8_t *blimit,
-                                   const uint8_t *limit,
-                                   const uint8_t *thresh) {
-  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
-  vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
-}
-#endif  // HAVE_NEON_ASM
diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
new file mode 100644
index 000000000..fd9db6187
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
@@ -0,0 +1,274 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_loop_filter_neon(
+        uint8x8_t dblimit,    // flimit
+        uint8x8_t dlimit,     // limit
+        uint8x8_t dthresh,    // thresh
+        uint8x8_t d3u8,       // p3
+        uint8x8_t d4u8,       // p2
+        uint8x8_t d5u8,       // p1
+        uint8x8_t d6u8,       // p0
+        uint8x8_t d7u8,       // q0
+        uint8x8_t d16u8,      // q1
+        uint8x8_t d17u8,      // q2
+        uint8x8_t d18u8,      // q3
+        uint8x8_t *d4ru8,     // p1
+        uint8x8_t *d5ru8,     // p0
+        uint8x8_t *d6ru8,     // q0
+        uint8x8_t *d7ru8) {   // q1
+    uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+    int16x8_t q12s16;
+    int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+    d19u8 = vabd_u8(d3u8, d4u8);
+    d20u8 = vabd_u8(d4u8, d5u8);
+    d21u8 = vabd_u8(d5u8, d6u8);
+    d22u8 = vabd_u8(d16u8, d7u8);
+    d3u8  = vabd_u8(d17u8, d16u8);
+    d4u8  = vabd_u8(d18u8, d17u8);
+
+    d19u8 = vmax_u8(d19u8, d20u8);
+    d20u8 = vmax_u8(d21u8, d22u8);
+    d3u8  = vmax_u8(d3u8,  d4u8);
+    d23u8 = vmax_u8(d19u8, d20u8);
+
+    d17u8 = vabd_u8(d6u8, d7u8);
+
+    d21u8 = vcgt_u8(d21u8, dthresh);
+    d22u8 = vcgt_u8(d22u8, dthresh);
+    d23u8 = vmax_u8(d23u8, d3u8);
+
+    d28u8 = vabd_u8(d5u8, d16u8);
+    d17u8 = vqadd_u8(d17u8, d17u8);
+
+    d23u8 = vcge_u8(dlimit, d23u8);
+
+    d18u8 = vdup_n_u8(0x80);
+    d5u8  = veor_u8(d5u8,  d18u8);
+    d6u8  = veor_u8(d6u8,  d18u8);
+    d7u8  = veor_u8(d7u8,  d18u8);
+    d16u8 = veor_u8(d16u8, d18u8);
+
+    d28u8 = vshr_n_u8(d28u8, 1);
+    d17u8 = vqadd_u8(d17u8, d28u8);
+
+    d19u8 = vdup_n_u8(3);
+
+    d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+                    vreinterpret_s8_u8(d6u8));
+
+    d17u8 = vcge_u8(dblimit, d17u8);
+
+    d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+                     vreinterpret_s8_u8(d16u8));
+
+    d22u8 = vorr_u8(d21u8, d22u8);
+
+    q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+    d23u8 = vand_u8(d23u8, d17u8);
+
+    q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+    d17u8 = vdup_n_u8(4);
+
+    d27s8 = vqmovn_s16(q12s16);
+    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+    d27s8 = vreinterpret_s8_u8(d27u8);
+
+    d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+    d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+    d28s8 = vshr_n_s8(d28s8, 3);
+    d27s8 = vshr_n_s8(d27s8, 3);
+
+    d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+    d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+    d27s8 = vrshr_n_s8(d27s8, 1);
+    d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+    d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+    d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+    *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+    *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+    *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+    *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+    return;
+}
+
+void vp9_lpf_horizontal_4_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char *blimit,
+        unsigned char *limit,
+        unsigned char *thresh,
+        int count) {
+    int i;
+    uint8_t *s, *psrc;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+    if (count == 0)  // end_vp9_lf_h_edge
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
+    psrc = src - (pitch << 2);
+    for (i = 0; i < count; i++) {
+        s = psrc + i * 8;
+
+        d3u8 = vld1_u8(s);
+        s += pitch;
+        d4u8 = vld1_u8(s);
+        s += pitch;
+        d5u8 = vld1_u8(s);
+        s += pitch;
+        d6u8 = vld1_u8(s);
+        s += pitch;
+        d7u8 = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+
+        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                             &d4u8, &d5u8, &d6u8, &d7u8);
+
+        s -= (pitch * 5);
+        vst1_u8(s, d4u8);
+        s += pitch;
+        vst1_u8(s, d5u8);
+        s += pitch;
+        vst1_u8(s, d6u8);
+        s += pitch;
+        vst1_u8(s, d7u8);
+    }
+    return;
+}
+
+void vp9_lpf_vertical_4_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char *blimit,
+        unsigned char *limit,
+        unsigned char *thresh,
+        int count) {
+    int i, pitch8;
+    uint8_t *s;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+    uint8x8x4_t d4Result;
+
+    if (count == 0)  // end_vp9_lf_h_edge
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
+    pitch8 = pitch * 8;
+    for (i = 0; i < count; i++, src += pitch8) {
+        s = src - (i + 1) * 4;
+
+        d3u8 = vld1_u8(s);
+        s += pitch;
+        d4u8 = vld1_u8(s);
+        s += pitch;
+        d5u8 = vld1_u8(s);
+        s += pitch;
+        d6u8 = vld1_u8(s);
+        s += pitch;
+        d7u8 = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+
+        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+                      vreinterpret_u32_u8(d7u8));
+        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                      vreinterpret_u32_u8(d16u8));
+        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                      vreinterpret_u32_u8(d17u8));
+        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                      vreinterpret_u32_u8(d18u8));
+
+        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                          vreinterpret_u16_u32(d2tmp2.val[0]));
+        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                          vreinterpret_u16_u32(d2tmp3.val[0]));
+        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                          vreinterpret_u16_u32(d2tmp2.val[1]));
+        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                          vreinterpret_u16_u32(d2tmp3.val[1]));
+
+        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                         vreinterpret_u8_u16(d2tmp5.val[0]));
+        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                         vreinterpret_u8_u16(d2tmp5.val[1]));
+        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                          vreinterpret_u8_u16(d2tmp7.val[0]));
+        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                          vreinterpret_u8_u16(d2tmp7.val[1]));
+
+        d3u8 = d2tmp8.val[0];
+        d4u8 = d2tmp8.val[1];
+        d5u8 = d2tmp9.val[0];
+        d6u8 = d2tmp9.val[1];
+        d7u8 = d2tmp10.val[0];
+        d16u8 = d2tmp10.val[1];
+        d17u8 = d2tmp11.val[0];
+        d18u8 = d2tmp11.val[1];
+
+        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                             &d4u8, &d5u8, &d6u8, &d7u8);
+
+        d4Result.val[0] = d4u8;
+        d4Result.val[1] = d5u8;
+        d4Result.val[2] = d6u8;
+        d4Result.val[3] = d7u8;
+
+        src -= 2;
+        vst4_lane_u8(src, d4Result, 0);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 1);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 2);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 3);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 4);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 5);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 6);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 7);
+    }
+    return;
+}
diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
new file mode 100644
index 000000000..7738e0d3a
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
@@ -0,0 +1,277 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_lpf_horizontal_4_neon|
+    EXPORT  |vp9_lpf_vertical_4_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_horizontal_4_neon(uint8_t *s,
+;                                int p /* pitch */,
+;                                const uint8_t *blimit,
+;                                const uint8_t *limit,
+;                                const uint8_t *thresh,
+;                                int count)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_lpf_horizontal_4_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r12, [sp, #8]              ; load count
+    ldr         r2, [sp, #4]               ; load thresh
+    add         r1, r1, r1                 ; double pitch
+
+    cmp         r12, #0
+    beq         end_vp9_lf_h_edge
+
+    vld1.8      {d1[]}, [r3]               ; duplicate *limit
+    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+
+count_lf_h_loop
+    sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
+    add         r3, r2, r1, lsr #1         ; set to 3 lines down
+
+    vld1.u8     {d3}, [r2@64], r1          ; p3
+    vld1.u8     {d4}, [r3@64], r1          ; p2
+    vld1.u8     {d5}, [r2@64], r1          ; p1
+    vld1.u8     {d6}, [r3@64], r1          ; p0
+    vld1.u8     {d7}, [r2@64], r1          ; q0
+    vld1.u8     {d16}, [r3@64], r1         ; q1
+    vld1.u8     {d17}, [r2@64]             ; q2
+    vld1.u8     {d18}, [r3@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r3, r3, r1, lsl #1
+
+    bl          vp9_loop_filter_neon
+
+    vst1.u8     {d4}, [r2@64], r1          ; store op1
+    vst1.u8     {d5}, [r3@64], r1          ; store op0
+    vst1.u8     {d6}, [r2@64], r1          ; store oq0
+    vst1.u8     {d7}, [r3@64], r1          ; store oq1
+
+    add         r0, r0, #8
+    subs        r12, r12, #1
+    bne         count_lf_h_loop
+
+end_vp9_lf_h_edge
+    pop         {pc}
+    ENDP        ; |vp9_lpf_horizontal_4_neon|
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_vertical_4_neon(uint8_t *s,
+;                              int p /* pitch */,
+;                              const uint8_t *blimit,
+;                              const uint8_t *limit,
+;                              const uint8_t *thresh,
+;                              int count)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_lpf_vertical_4_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    ldr         r12, [sp, #8]             ; load count
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #4]              ; load thresh
+    sub         r2, r0, #4                ; move s pointer down by 4 columns
+    cmp         r12, #0
+    beq         end_vp9_lf_v_edge
+
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
+
+count_lf_v_loop
+    vld1.u8     {d3}, [r2], r1             ; load s data
+    vld1.u8     {d4}, [r2], r1
+    vld1.u8     {d5}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d7}, [r2], r1
+    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r2]
+
+    ;transpose to 8x16 matrix
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    bl          vp9_loop_filter_neon
+
+    sub         r0, r0, #2
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+    add         r0, r0, r1, lsl #3         ; s += pitch * 8
+    subs        r12, r12, #1
+    subne       r2, r0, #4                 ; move s pointer down by 4 columns
+    bne         count_lf_v_loop
+
+end_vp9_lf_v_edge
+    pop         {pc}
+    ENDP        ; |vp9_lpf_vertical_4_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d4    op1
+; d5    op0
+; d6    oq0
+; d7    oq1
+|vp9_loop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
+    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
+    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
+
+    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
+
+    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
+
+    vmov.u8     d18, #0x80
+
+    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
+
+    ; hevmask
+    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
+
+    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
+    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
+
+    veor        d7, d7, d18                 ; qs0
+
+    vcge.u8     d23, d1, d23                ; abs(m1) > limit
+
+    ; filter() function
+    ; convert to signed
+
+    vshr.u8     d28, d28, #1                ; a = a / 2
+    veor        d6, d6, d18                 ; ps0
+
+    veor        d5, d5, d18                 ; ps1
+    vqadd.u8    d17, d17, d28               ; a = b + a
+
+    veor        d16, d16, d18               ; qs1
+
+    vmov.u8     d19, #3
+
+    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
+
+    vcge.u8     d17, d0, d17                ; a > blimit
+
+    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
+    vorr        d22, d21, d22               ; hevmask
+
+    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
+
+    vand        d27, d27, d22               ; filter &= hev
+    vand        d23, d23, d17               ; filter_mask
+
+    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d17, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d27, q12
+
+    vand        d27, d27, d23               ; filter &= mask
+
+    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
+    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
+    vshr.s8     d28, d28, #3                ; filter2 >>= 3
+    vshr.s8     d27, d27, #3                ; filter1 >>= 3
+
+    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
+    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments
+    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
+
+    veor        d6, d26, d18                ; *oq0 = u^0x80
+
+    vbic        d27, d27, d22               ; filter &= ~hev
+
+    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
+    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
+
+    veor        d5, d19, d18                ; *op0 = u^0x80
+    veor        d4, d21, d18                ; *op1 = u^0x80
+    veor        d7, d20, d18                ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_neon|
+
+    END
diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
new file mode 100644
index 000000000..33068a8a2
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
@@ -0,0 +1,453 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_mbloop_filter_neon(
+        uint8x8_t dblimit,   // mblimit
+        uint8x8_t dlimit,    // limit
+        uint8x8_t dthresh,   // thresh
+        uint8x8_t d3u8,      // p2
+        uint8x8_t d4u8,      // p2
+        uint8x8_t d5u8,      // p1
+        uint8x8_t d6u8,      // p0
+        uint8x8_t d7u8,      // q0
+        uint8x8_t d16u8,     // q1
+        uint8x8_t d17u8,     // q2
+        uint8x8_t d18u8,     // q3
+        uint8x8_t *d0ru8,    // p1
+        uint8x8_t *d1ru8,    // p1
+        uint8x8_t *d2ru8,    // p0
+        uint8x8_t *d3ru8,    // q0
+        uint8x8_t *d4ru8,    // q1
+        uint8x8_t *d5ru8) {  // q1
+    uint32_t flat;
+    uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+    uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+    int16x8_t q15s16;
+    uint16x8_t q10u16, q14u16;
+    int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+    d19u8 = vabd_u8(d3u8, d4u8);
+    d20u8 = vabd_u8(d4u8, d5u8);
+    d21u8 = vabd_u8(d5u8, d6u8);
+    d22u8 = vabd_u8(d16u8, d7u8);
+    d23u8 = vabd_u8(d17u8, d16u8);
+    d24u8 = vabd_u8(d18u8, d17u8);
+
+    d19u8 = vmax_u8(d19u8, d20u8);
+    d20u8 = vmax_u8(d21u8, d22u8);
+
+    d25u8 = vabd_u8(d6u8, d4u8);
+
+    d23u8 = vmax_u8(d23u8, d24u8);
+
+    d26u8 = vabd_u8(d7u8, d17u8);
+
+    d19u8 = vmax_u8(d19u8, d20u8);
+
+    d24u8 = vabd_u8(d6u8, d7u8);
+    d27u8 = vabd_u8(d3u8, d6u8);
+    d28u8 = vabd_u8(d18u8, d7u8);
+
+    d19u8 = vmax_u8(d19u8, d23u8);
+
+    d23u8 = vabd_u8(d5u8, d16u8);
+    d24u8 = vqadd_u8(d24u8, d24u8);
+
+
+    d19u8 = vcge_u8(dlimit, d19u8);
+
+
+    d25u8 = vmax_u8(d25u8, d26u8);
+    d26u8 = vmax_u8(d27u8, d28u8);
+
+    d23u8 = vshr_n_u8(d23u8, 1);
+
+    d25u8 = vmax_u8(d25u8, d26u8);
+
+    d24u8 = vqadd_u8(d24u8, d23u8);
+
+    d20u8 = vmax_u8(d20u8, d25u8);
+
+    d23u8 = vdup_n_u8(1);
+    d24u8 = vcge_u8(dblimit, d24u8);
+
+    d21u8 = vcgt_u8(d21u8, dthresh);
+
+    d20u8 = vcge_u8(d23u8, d20u8);
+
+    d19u8 = vand_u8(d19u8, d24u8);
+
+    d23u8 = vcgt_u8(d22u8, dthresh);
+
+    d20u8 = vand_u8(d20u8, d19u8);
+
+    d22u8 = vdup_n_u8(0x80);
+
+    d23u8 = vorr_u8(d21u8, d23u8);
+
+    q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+                          vreinterpret_u16_u8(d21u8));
+
+    d30u8 = vshrn_n_u16(q10u16, 4);
+    flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+    if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
+        d27u8 = vdup_n_u8(3);
+        d21u8 = vdup_n_u8(2);
+        q14u16 = vaddl_u8(d6u8, d7u8);
+        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+        q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+        q14u16 = vaddw_u8(q14u16, d5u8);
+        *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d4u8);
+        q14u16 = vaddw_u8(q14u16, d5u8);
+        q14u16 = vaddw_u8(q14u16, d16u8);
+        *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d5u8);
+        q14u16 = vaddw_u8(q14u16, d6u8);
+        q14u16 = vaddw_u8(q14u16, d17u8);
+        *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d6u8);
+        q14u16 = vaddw_u8(q14u16, d7u8);
+        q14u16 = vaddw_u8(q14u16, d18u8);
+        *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d4u8);
+        q14u16 = vsubw_u8(q14u16, d7u8);
+        q14u16 = vaddw_u8(q14u16, d16u8);
+        q14u16 = vaddw_u8(q14u16, d18u8);
+        *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d5u8);
+        q14u16 = vsubw_u8(q14u16, d16u8);
+        q14u16 = vaddw_u8(q14u16, d17u8);
+        q14u16 = vaddw_u8(q14u16, d18u8);
+        *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+    } else {
+        d21u8 = veor_u8(d7u8,  d22u8);
+        d24u8 = veor_u8(d6u8,  d22u8);
+        d25u8 = veor_u8(d5u8,  d22u8);
+        d26u8 = veor_u8(d16u8, d22u8);
+
+        d27u8 = vdup_n_u8(3);
+
+        d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+        d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+        q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+        d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+        q15s16 = vaddw_s8(q15s16, d29s8);
+
+        d29u8 = vdup_n_u8(4);
+
+        d28s8 = vqmovn_s16(q15s16);
+
+        d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+        d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+        d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+        d30s8 = vshr_n_s8(d30s8, 3);
+        d29s8 = vshr_n_s8(d29s8, 3);
+
+        d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+        d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+        d29s8 = vrshr_n_s8(d29s8, 1);
+        d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+        d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+        d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+        if (flat == 0) {  // filter_branch_only
+            *d0ru8 = d4u8;
+            *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+            *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+            *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+            *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+            *d5ru8 = d17u8;
+            return;
+        }
+
+        d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+        d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+        d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+        d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+        d23u8 = vdup_n_u8(2);
+        q14u16 = vaddl_u8(d6u8, d7u8);
+        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+        q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+        d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+        q14u16 = vaddw_u8(q14u16, d5u8);
+
+        d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+        d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d4u8);
+        q14u16 = vaddw_u8(q14u16, d5u8);
+        q14u16 = vaddw_u8(q14u16, d16u8);
+
+        d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+        d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d5u8);
+        q14u16 = vaddw_u8(q14u16, d6u8);
+        q14u16 = vaddw_u8(q14u16, d17u8);
+
+        *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+        d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d6u8);
+        q14u16 = vaddw_u8(q14u16, d7u8);
+
+        *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+        q14u16 = vaddw_u8(q14u16, d18u8);
+
+        *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+        d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d4u8);
+        q14u16 = vsubw_u8(q14u16, d7u8);
+        q14u16 = vaddw_u8(q14u16, d16u8);
+
+        d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+        q14u16 = vaddw_u8(q14u16, d18u8);
+
+        d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+        d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d5u8);
+        q14u16 = vsubw_u8(q14u16, d16u8);
+        q14u16 = vaddw_u8(q14u16, d17u8);
+        q14u16 = vaddw_u8(q14u16, d18u8);
+
+        d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+        d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+        *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+        *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+        *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+    }
+    return;
+}
+
+void vp9_lpf_horizontal_8_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char *blimit,
+        unsigned char *limit,
+        unsigned char *thresh,
+        int count) {
+    int i;
+    uint8_t *s, *psrc;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint8x8_t d16u8, d17u8, d18u8;
+
+    if (count == 0)  // end_vp9_mblf_h_edge
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
+    psrc = src - (pitch << 2);
+    for (i = 0; i < count; i++) {
+        s = psrc + i * 8;
+
+        d3u8  = vld1_u8(s);
+        s += pitch;
+        d4u8  = vld1_u8(s);
+        s += pitch;
+        d5u8  = vld1_u8(s);
+        s += pitch;
+        d6u8  = vld1_u8(s);
+        s += pitch;
+        d7u8  = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+
+        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+        s -= (pitch * 6);
+        vst1_u8(s, d0u8);
+        s += pitch;
+        vst1_u8(s, d1u8);
+        s += pitch;
+        vst1_u8(s, d2u8);
+        s += pitch;
+        vst1_u8(s, d3u8);
+        s += pitch;
+        vst1_u8(s, d4u8);
+        s += pitch;
+        vst1_u8(s, d5u8);
+    }
+    return;
+}
+
+void vp9_lpf_vertical_8_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char *blimit,
+        unsigned char *limit,
+        unsigned char *thresh,
+        int count) {
+    int i;
+    uint8_t *s;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint8x8_t d16u8, d17u8, d18u8;
+    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+    uint8x8x4_t d4Result;
+    uint8x8x2_t d2Result;
+
+    if (count == 0)
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
+    for (i = 0; i < count; i++) {
+        s = src + (i * (pitch << 3)) - 4;
+
+        d3u8 = vld1_u8(s);
+        s += pitch;
+        d4u8 = vld1_u8(s);
+        s += pitch;
+        d5u8 = vld1_u8(s);
+        s += pitch;
+        d6u8 = vld1_u8(s);
+        s += pitch;
+        d7u8 = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+
+        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+                          vreinterpret_u32_u8(d7u8));
+        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                          vreinterpret_u32_u8(d16u8));
+        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                          vreinterpret_u32_u8(d17u8));
+        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                          vreinterpret_u32_u8(d18u8));
+
+        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                          vreinterpret_u16_u32(d2tmp2.val[0]));
+        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                          vreinterpret_u16_u32(d2tmp3.val[0]));
+        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                          vreinterpret_u16_u32(d2tmp2.val[1]));
+        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                          vreinterpret_u16_u32(d2tmp3.val[1]));
+
+        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                         vreinterpret_u8_u16(d2tmp5.val[0]));
+        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                         vreinterpret_u8_u16(d2tmp5.val[1]));
+        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                          vreinterpret_u8_u16(d2tmp7.val[0]));
+        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                          vreinterpret_u8_u16(d2tmp7.val[1]));
+
+        d3u8 = d2tmp8.val[0];
+        d4u8 = d2tmp8.val[1];
+        d5u8 = d2tmp9.val[0];
+        d6u8 = d2tmp9.val[1];
+        d7u8 = d2tmp10.val[0];
+        d16u8 = d2tmp10.val[1];
+        d17u8 = d2tmp11.val[0];
+        d18u8 = d2tmp11.val[1];
+
+        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+        d4Result.val[0] = d0u8;
+        d4Result.val[1] = d1u8;
+        d4Result.val[2] = d2u8;
+        d4Result.val[3] = d3u8;
+
+        d2Result.val[0] = d4u8;
+        d2Result.val[1] = d5u8;
+
+        s = src - 3;
+        vst4_lane_u8(s, d4Result, 0);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 1);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 2);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 3);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 4);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 5);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 6);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 7);
+
+        s = src + 1;
+        vst2_lane_u8(s, d2Result, 0);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 1);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 2);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 3);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 4);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 5);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 6);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 7);
+    }
+    return;
+}
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
index 443032217..91aaec04e 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
@@ -8,8 +8,6 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_4_neon|
-    EXPORT  |vp9_lpf_vertical_4_neon|
     EXPORT  |vp9_lpf_horizontal_8_neon|
     EXPORT  |vp9_lpf_vertical_8_neon|
     ARM
@@ -21,261 +19,6 @@
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_horizontal_4_neon(uint8_t *s,
-;                                int p /* pitch */,
-;                                const uint8_t *blimit,
-;                                const uint8_t *limit,
-;                                const uint8_t *thresh,
-;                                int count)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; sp+4  int count
-|vp9_lpf_horizontal_4_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r12, [sp, #8]              ; load count
-    ldr         r2, [sp, #4]               ; load thresh
-    add         r1, r1, r1                 ; double pitch
-
-    cmp         r12, #0
-    beq         end_vp9_lf_h_edge
-
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
-
-count_lf_h_loop
-    sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
-    add         r3, r2, r1, lsr #1         ; set to 3 lines down
-
-    vld1.u8     {d3}, [r2@64], r1          ; p3
-    vld1.u8     {d4}, [r3@64], r1          ; p2
-    vld1.u8     {d5}, [r2@64], r1          ; p1
-    vld1.u8     {d6}, [r3@64], r1          ; p0
-    vld1.u8     {d7}, [r2@64], r1          ; q0
-    vld1.u8     {d16}, [r3@64], r1         ; q1
-    vld1.u8     {d17}, [r2@64]             ; q2
-    vld1.u8     {d18}, [r3@64]             ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r3, r3, r1, lsl #1
-
-    bl          vp9_loop_filter_neon
-
-    vst1.u8     {d4}, [r2@64], r1          ; store op1
-    vst1.u8     {d5}, [r3@64], r1          ; store op0
-    vst1.u8     {d6}, [r2@64], r1          ; store oq0
-    vst1.u8     {d7}, [r3@64], r1          ; store oq1
-
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         count_lf_h_loop
-
-end_vp9_lf_h_edge
-    pop         {pc}
-    ENDP        ; |vp9_lpf_horizontal_4_neon|
-
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
-; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
-;
-; void vp9_lpf_vertical_4_neon(uint8_t *s,
-;                              int p /* pitch */,
-;                              const uint8_t *blimit,
-;                              const uint8_t *limit,
-;                              const uint8_t *thresh,
-;                              int count)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; sp+4  int count
-|vp9_lpf_vertical_4_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    ldr         r12, [sp, #8]             ; load count
-    vld1.8      {d1[]}, [r3]              ; duplicate *limit
-
-    ldr         r3, [sp, #4]              ; load thresh
-    sub         r2, r0, #4                ; move s pointer down by 4 columns
-    cmp         r12, #0
-    beq         end_vp9_lf_v_edge
-
-    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
-
-count_lf_v_loop
-    vld1.u8     {d3}, [r2], r1             ; load s data
-    vld1.u8     {d4}, [r2], r1
-    vld1.u8     {d5}, [r2], r1
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d7}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d18}, [r2]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     d3, d7
-    vtrn.32     d4, d16
-    vtrn.32     d5, d17
-    vtrn.32     d6, d18
-
-    vtrn.16     d3, d5
-    vtrn.16     d4, d6
-    vtrn.16     d7, d17
-    vtrn.16     d16, d18
-
-    vtrn.8      d3, d4
-    vtrn.8      d5, d6
-    vtrn.8      d7, d16
-    vtrn.8      d17, d18
-
-    bl          vp9_loop_filter_neon
-
-    sub         r0, r0, #2
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
-    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
-    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
-    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
-    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
-    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
-    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
-    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
-
-    add         r0, r0, r1, lsl #3         ; s += pitch * 8
-    subs        r12, r12, #1
-    subne       r2, r0, #4                 ; move s pointer down by 4 columns
-    bne         count_lf_v_loop
-
-end_vp9_lf_v_edge
-    pop         {pc}
-    ENDP        ; |vp9_lpf_vertical_4_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0    blimit
-; d1    limit
-; d2    thresh
-; d3    p3
-; d4    p2
-; d5    p1
-; d6    p0
-; d7    q0
-; d16   q1
-; d17   q2
-; d18   q3
-;
-; Outputs:
-; d4    op1
-; d5    op0
-; d6    oq0
-; d7    oq1
-|vp9_loop_filter_neon| PROC
-    ; filter_mask
-    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
-    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
-    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
-    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
-    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
-    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
-    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
-
-    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
-
-    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
-
-    vmov.u8     d18, #0x80
-
-    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
-
-    ; hevmask
-    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
-
-    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
-    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
-
-    veor        d7, d7, d18                 ; qs0
-
-    vcge.u8     d23, d1, d23                ; abs(m1) > limit
-
-    ; filter() function
-    ; convert to signed
-
-    vshr.u8     d28, d28, #1                ; a = a / 2
-    veor        d6, d6, d18                 ; ps0
-
-    veor        d5, d5, d18                 ; ps1
-    vqadd.u8    d17, d17, d28               ; a = b + a
-
-    veor        d16, d16, d18               ; qs1
-
-    vmov.u8     d19, #3
-
-    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
-
-    vcge.u8     d17, d0, d17                ; a > blimit
-
-    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
-    vorr        d22, d21, d22               ; hevmask
-
-    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
-
-    vand        d27, d27, d22               ; filter &= hev
-    vand        d23, d23, d17               ; filter_mask
-
-    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
-
-    vmov.u8     d17, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d27, q12
-
-    vand        d27, d27, d23               ; filter &= mask
-
-    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
-    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
-    vshr.s8     d28, d28, #3                ; filter2 >>= 3
-    vshr.s8     d27, d27, #3                ; filter1 >>= 3
-
-    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
-    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
-
-    ; outer tap adjustments
-    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
-
-    veor        d6, d26, d18                ; *oq0 = u^0x80
-
-    vbic        d27, d27, d22               ; filter &= ~hev
-
-    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
-    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
-
-    veor        d5, d19, d18                ; *op0 = u^0x80
-    veor        d4, d21, d18                ; *op1 = u^0x80
-    veor        d7, d20, d18                ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |vp9_loop_filter_neon|
-
 ; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c
index 079d26677..31fcc63ba 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c
@@ -10,705 +10,49 @@
 
 #include <arm_neon.h>
 
+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
-
-static INLINE void vp9_loop_filter_neon(
-        uint8x8_t dblimit,    // flimit
-        uint8x8_t dlimit,     // limit
-        uint8x8_t dthresh,    // thresh
-        uint8x8_t d3u8,       // p3
-        uint8x8_t d4u8,       // p2
-        uint8x8_t d5u8,       // p1
-        uint8x8_t d6u8,       // p0
-        uint8x8_t d7u8,       // q0
-        uint8x8_t d16u8,      // q1
-        uint8x8_t d17u8,      // q2
-        uint8x8_t d18u8,      // q3
-        uint8x8_t *d4ru8,     // p1
-        uint8x8_t *d5ru8,     // p0
-        uint8x8_t *d6ru8,     // q0
-        uint8x8_t *d7ru8) {   // q1
-    uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
-    int16x8_t q12s16;
-    int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
-
-    d19u8 = vabd_u8(d3u8, d4u8);
-    d20u8 = vabd_u8(d4u8, d5u8);
-    d21u8 = vabd_u8(d5u8, d6u8);
-    d22u8 = vabd_u8(d16u8, d7u8);
-    d3u8  = vabd_u8(d17u8, d16u8);
-    d4u8  = vabd_u8(d18u8, d17u8);
-
-    d19u8 = vmax_u8(d19u8, d20u8);
-    d20u8 = vmax_u8(d21u8, d22u8);
-    d3u8  = vmax_u8(d3u8,  d4u8);
-    d23u8 = vmax_u8(d19u8, d20u8);
-
-    d17u8 = vabd_u8(d6u8, d7u8);
-
-    d21u8 = vcgt_u8(d21u8, dthresh);
-    d22u8 = vcgt_u8(d22u8, dthresh);
-    d23u8 = vmax_u8(d23u8, d3u8);
-
-    d28u8 = vabd_u8(d5u8, d16u8);
-    d17u8 = vqadd_u8(d17u8, d17u8);
-
-    d23u8 = vcge_u8(dlimit, d23u8);
-
-    d18u8 = vdup_n_u8(0x80);
-    d5u8  = veor_u8(d5u8,  d18u8);
-    d6u8  = veor_u8(d6u8,  d18u8);
-    d7u8  = veor_u8(d7u8,  d18u8);
-    d16u8 = veor_u8(d16u8, d18u8);
-
-    d28u8 = vshr_n_u8(d28u8, 1);
-    d17u8 = vqadd_u8(d17u8, d28u8);
-
-    d19u8 = vdup_n_u8(3);
-
-    d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
-                    vreinterpret_s8_u8(d6u8));
-
-    d17u8 = vcge_u8(dblimit, d17u8);
-
-    d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
-                     vreinterpret_s8_u8(d16u8));
-
-    d22u8 = vorr_u8(d21u8, d22u8);
-
-    q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
-    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
-    d23u8 = vand_u8(d23u8, d17u8);
-
-    q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
-
-    d17u8 = vdup_n_u8(4);
-
-    d27s8 = vqmovn_s16(q12s16);
-    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
-    d27s8 = vreinterpret_s8_u8(d27u8);
-
-    d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
-    d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
-    d28s8 = vshr_n_s8(d28s8, 3);
-    d27s8 = vshr_n_s8(d27s8, 3);
-
-    d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
-    d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
-
-    d27s8 = vrshr_n_s8(d27s8, 1);
-    d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
-
-    d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
-    d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
-
-    *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
-    *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
-    *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
-    *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
-    return;
-}
-
-void vp9_lpf_horizontal_4_neon(
-        unsigned char *src,
-        int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
-        int count) {
-    int i;
-    uint8_t *s, *psrc;
-    uint8x8_t dblimit, dlimit, dthresh;
-    uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-
-    if (count == 0)  // end_vp9_lf_h_edge
-        return;
-
-    dblimit = vld1_u8(blimit);
-    dlimit = vld1_u8(limit);
-    dthresh = vld1_u8(thresh);
-
-    psrc = src - (pitch << 2);
-    for (i = 0; i < count; i++) {
-        s = psrc + i * 8;
-
-        d3u8 = vld1_u8(s);
-        s += pitch;
-        d4u8 = vld1_u8(s);
-        s += pitch;
-        d5u8 = vld1_u8(s);
-        s += pitch;
-        d6u8 = vld1_u8(s);
-        s += pitch;
-        d7u8 = vld1_u8(s);
-        s += pitch;
-        d16u8 = vld1_u8(s);
-        s += pitch;
-        d17u8 = vld1_u8(s);
-        s += pitch;
-        d18u8 = vld1_u8(s);
-
-        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d4u8, &d5u8, &d6u8, &d7u8);
-
-        s -= (pitch * 5);
-        vst1_u8(s, d4u8);
-        s += pitch;
-        vst1_u8(s, d5u8);
-        s += pitch;
-        vst1_u8(s, d6u8);
-        s += pitch;
-        vst1_u8(s, d7u8);
-    }
-    return;
-}
-
-void vp9_lpf_vertical_4_neon(
-        unsigned char *src,
-        int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
-        int count) {
-    int i, pitch8;
-    uint8_t *s;
-    uint8x8_t dblimit, dlimit, dthresh;
-    uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
-    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
-    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
-    uint8x8x4_t d4Result;
-
-    if (count == 0)  // end_vp9_lf_h_edge
-        return;
-
-    dblimit = vld1_u8(blimit);
-    dlimit = vld1_u8(limit);
-    dthresh = vld1_u8(thresh);
-
-    pitch8 = pitch * 8;
-    for (i = 0; i < count; i++, src += pitch8) {
-        s = src - (i + 1) * 4;
-
-        d3u8 = vld1_u8(s);
-        s += pitch;
-        d4u8 = vld1_u8(s);
-        s += pitch;
-        d5u8 = vld1_u8(s);
-        s += pitch;
-        d6u8 = vld1_u8(s);
-        s += pitch;
-        d7u8 = vld1_u8(s);
-        s += pitch;
-        d16u8 = vld1_u8(s);
-        s += pitch;
-        d17u8 = vld1_u8(s);
-        s += pitch;
-        d18u8 = vld1_u8(s);
-
-        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
-                      vreinterpret_u32_u8(d7u8));
-        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
-                      vreinterpret_u32_u8(d16u8));
-        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
-                      vreinterpret_u32_u8(d17u8));
-        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
-                      vreinterpret_u32_u8(d18u8));
-
-        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
-                          vreinterpret_u16_u32(d2tmp2.val[0]));
-        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
-                          vreinterpret_u16_u32(d2tmp3.val[0]));
-        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
-                          vreinterpret_u16_u32(d2tmp2.val[1]));
-        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
-                          vreinterpret_u16_u32(d2tmp3.val[1]));
-
-        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
-                         vreinterpret_u8_u16(d2tmp5.val[0]));
-        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
-                         vreinterpret_u8_u16(d2tmp5.val[1]));
-        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
-                          vreinterpret_u8_u16(d2tmp7.val[0]));
-        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
-                          vreinterpret_u8_u16(d2tmp7.val[1]));
-
-        d3u8 = d2tmp8.val[0];
-        d4u8 = d2tmp8.val[1];
-        d5u8 = d2tmp9.val[0];
-        d6u8 = d2tmp9.val[1];
-        d7u8 = d2tmp10.val[0];
-        d16u8 = d2tmp10.val[1];
-        d17u8 = d2tmp11.val[0];
-        d18u8 = d2tmp11.val[1];
-
-        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d4u8, &d5u8, &d6u8, &d7u8);
-
-        d4Result.val[0] = d4u8;
-        d4Result.val[1] = d5u8;
-        d4Result.val[2] = d6u8;
-        d4Result.val[3] = d7u8;
-
-        src -= 2;
-        vst4_lane_u8(src, d4Result, 0);
-        src += pitch;
-        vst4_lane_u8(src, d4Result, 1);
-        src += pitch;
-        vst4_lane_u8(src, d4Result, 2);
-        src += pitch;
-        vst4_lane_u8(src, d4Result, 3);
-        src += pitch;
-        vst4_lane_u8(src, d4Result, 4);
-        src += pitch;
-        vst4_lane_u8(src, d4Result, 5);
-        src += pitch;
-        vst4_lane_u8(src, d4Result, 6);
-        src += pitch;
-        vst4_lane_u8(src, d4Result, 7);
-    }
-    return;
+#include "vpx/vpx_integer.h"
+
+void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+                                  const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-static INLINE void vp9_mbloop_filter_neon(
-        uint8x8_t dblimit,   // mblimit
-        uint8x8_t dlimit,    // limit
-        uint8x8_t dthresh,   // thresh
-        uint8x8_t d3u8,      // p2
-        uint8x8_t d4u8,      // p2
-        uint8x8_t d5u8,      // p1
-        uint8x8_t d6u8,      // p0
-        uint8x8_t d7u8,      // q0
-        uint8x8_t d16u8,     // q1
-        uint8x8_t d17u8,     // q2
-        uint8x8_t d18u8,     // q3
-        uint8x8_t *d0ru8,    // p1
-        uint8x8_t *d1ru8,    // p1
-        uint8x8_t *d2ru8,    // p0
-        uint8x8_t *d3ru8,    // q0
-        uint8x8_t *d4ru8,    // q1
-        uint8x8_t *d5ru8) {  // q1
-    uint32_t flat;
-    uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
-    uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
-    int16x8_t q15s16;
-    uint16x8_t q10u16, q14u16;
-    int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
-
-    d19u8 = vabd_u8(d3u8, d4u8);
-    d20u8 = vabd_u8(d4u8, d5u8);
-    d21u8 = vabd_u8(d5u8, d6u8);
-    d22u8 = vabd_u8(d16u8, d7u8);
-    d23u8 = vabd_u8(d17u8, d16u8);
-    d24u8 = vabd_u8(d18u8, d17u8);
-
-    d19u8 = vmax_u8(d19u8, d20u8);
-    d20u8 = vmax_u8(d21u8, d22u8);
-
-    d25u8 = vabd_u8(d6u8, d4u8);
-
-    d23u8 = vmax_u8(d23u8, d24u8);
-
-    d26u8 = vabd_u8(d7u8, d17u8);
-
-    d19u8 = vmax_u8(d19u8, d20u8);
-
-    d24u8 = vabd_u8(d6u8, d7u8);
-    d27u8 = vabd_u8(d3u8, d6u8);
-    d28u8 = vabd_u8(d18u8, d7u8);
-
-    d19u8 = vmax_u8(d19u8, d23u8);
-
-    d23u8 = vabd_u8(d5u8, d16u8);
-    d24u8 = vqadd_u8(d24u8, d24u8);
-
-
-    d19u8 = vcge_u8(dlimit, d19u8);
-
-
-    d25u8 = vmax_u8(d25u8, d26u8);
-    d26u8 = vmax_u8(d27u8, d28u8);
-
-    d23u8 = vshr_n_u8(d23u8, 1);
-
-    d25u8 = vmax_u8(d25u8, d26u8);
-
-    d24u8 = vqadd_u8(d24u8, d23u8);
-
-    d20u8 = vmax_u8(d20u8, d25u8);
-
-    d23u8 = vdup_n_u8(1);
-    d24u8 = vcge_u8(dblimit, d24u8);
-
-    d21u8 = vcgt_u8(d21u8, dthresh);
-
-    d20u8 = vcge_u8(d23u8, d20u8);
-
-    d19u8 = vand_u8(d19u8, d24u8);
-
-    d23u8 = vcgt_u8(d22u8, dthresh);
-
-    d20u8 = vand_u8(d20u8, d19u8);
-
-    d22u8 = vdup_n_u8(0x80);
-
-    d23u8 = vorr_u8(d21u8, d23u8);
-
-    q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
-                          vreinterpret_u16_u8(d21u8));
-
-    d30u8 = vshrn_n_u16(q10u16, 4);
-    flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
-
-    if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
-        d27u8 = vdup_n_u8(3);
-        d21u8 = vdup_n_u8(2);
-        q14u16 = vaddl_u8(d6u8, d7u8);
-        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
-        q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
-        q14u16 = vaddw_u8(q14u16, d5u8);
-        *d0ru8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d3u8);
-        q14u16 = vsubw_u8(q14u16, d4u8);
-        q14u16 = vaddw_u8(q14u16, d5u8);
-        q14u16 = vaddw_u8(q14u16, d16u8);
-        *d1ru8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d3u8);
-        q14u16 = vsubw_u8(q14u16, d5u8);
-        q14u16 = vaddw_u8(q14u16, d6u8);
-        q14u16 = vaddw_u8(q14u16, d17u8);
-        *d2ru8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d3u8);
-        q14u16 = vsubw_u8(q14u16, d6u8);
-        q14u16 = vaddw_u8(q14u16, d7u8);
-        q14u16 = vaddw_u8(q14u16, d18u8);
-        *d3ru8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d4u8);
-        q14u16 = vsubw_u8(q14u16, d7u8);
-        q14u16 = vaddw_u8(q14u16, d16u8);
-        q14u16 = vaddw_u8(q14u16, d18u8);
-        *d4ru8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d5u8);
-        q14u16 = vsubw_u8(q14u16, d16u8);
-        q14u16 = vaddw_u8(q14u16, d17u8);
-        q14u16 = vaddw_u8(q14u16, d18u8);
-        *d5ru8 = vqrshrn_n_u16(q14u16, 3);
-    } else {
-        d21u8 = veor_u8(d7u8,  d22u8);
-        d24u8 = veor_u8(d6u8,  d22u8);
-        d25u8 = veor_u8(d5u8,  d22u8);
-        d26u8 = veor_u8(d16u8, d22u8);
-
-        d27u8 = vdup_n_u8(3);
-
-        d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
-        d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
-
-        q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
-
-        d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
-        q15s16 = vaddw_s8(q15s16, d29s8);
-
-        d29u8 = vdup_n_u8(4);
-
-        d28s8 = vqmovn_s16(q15s16);
-
-        d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
-        d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
-        d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
-        d30s8 = vshr_n_s8(d30s8, 3);
-        d29s8 = vshr_n_s8(d29s8, 3);
-
-        d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
-        d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
-
-        d29s8 = vrshr_n_s8(d29s8, 1);
-        d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
-        d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
-        d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
-
-        if (flat == 0) {  // filter_branch_only
-            *d0ru8 = d4u8;
-            *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
-            *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
-            *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
-            *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-            *d5ru8 = d17u8;
-            return;
-        }
-
-        d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
-        d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
-        d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
-        d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-
-        d23u8 = vdup_n_u8(2);
-        q14u16 = vaddl_u8(d6u8, d7u8);
-        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
-        q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
-
-        d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
-
-        q14u16 = vaddw_u8(q14u16, d5u8);
-
-        d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
-
-        d30u8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d3u8);
-        q14u16 = vsubw_u8(q14u16, d4u8);
-        q14u16 = vaddw_u8(q14u16, d5u8);
-        q14u16 = vaddw_u8(q14u16, d16u8);
-
-        d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
-
-        d31u8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d3u8);
-        q14u16 = vsubw_u8(q14u16, d5u8);
-        q14u16 = vaddw_u8(q14u16, d6u8);
-        q14u16 = vaddw_u8(q14u16, d17u8);
-
-        *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
-
-        d23u8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d3u8);
-        q14u16 = vsubw_u8(q14u16, d6u8);
-        q14u16 = vaddw_u8(q14u16, d7u8);
-
-        *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
-
-        q14u16 = vaddw_u8(q14u16, d18u8);
-
-        *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
-
-        d22u8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d4u8);
-        q14u16 = vsubw_u8(q14u16, d7u8);
-        q14u16 = vaddw_u8(q14u16, d16u8);
-
-        d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
-
-        q14u16 = vaddw_u8(q14u16, d18u8);
-
-        d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
-
-        d6u8 = vqrshrn_n_u16(q14u16, 3);
-
-        q14u16 = vsubw_u8(q14u16, d5u8);
-        q14u16 = vsubw_u8(q14u16, d16u8);
-        q14u16 = vaddw_u8(q14u16, d17u8);
-        q14u16 = vaddw_u8(q14u16, d18u8);
-
-        d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
-
-        d7u8 = vqrshrn_n_u16(q14u16, 3);
-
-        *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
-        *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
-        *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
-    }
-    return;
+#if HAVE_NEON_ASM
+void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+                                    const uint8_t *blimit0,
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+  vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_horizontal_8_neon(
-        unsigned char *src,
-        int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
-        int count) {
-    int i;
-    uint8_t *s, *psrc;
-    uint8x8_t dblimit, dlimit, dthresh;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    uint8x8_t d16u8, d17u8, d18u8;
-
-    if (count == 0)  // end_vp9_mblf_h_edge
-        return;
-
-    dblimit = vld1_u8(blimit);
-    dlimit = vld1_u8(limit);
-    dthresh = vld1_u8(thresh);
-
-    psrc = src - (pitch << 2);
-    for (i = 0; i < count; i++) {
-        s = psrc + i * 8;
-
-        d3u8  = vld1_u8(s);
-        s += pitch;
-        d4u8  = vld1_u8(s);
-        s += pitch;
-        d5u8  = vld1_u8(s);
-        s += pitch;
-        d6u8  = vld1_u8(s);
-        s += pitch;
-        d7u8  = vld1_u8(s);
-        s += pitch;
-        d16u8 = vld1_u8(s);
-        s += pitch;
-        d17u8 = vld1_u8(s);
-        s += pitch;
-        d18u8 = vld1_u8(s);
-
-        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
-        s -= (pitch * 6);
-        vst1_u8(s, d0u8);
-        s += pitch;
-        vst1_u8(s, d1u8);
-        s += pitch;
-        vst1_u8(s, d2u8);
-        s += pitch;
-        vst1_u8(s, d3u8);
-        s += pitch;
-        vst1_u8(s, d4u8);
-        s += pitch;
-        vst1_u8(s, d5u8);
-    }
-    return;
+void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+                                  const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_neon(
-        unsigned char *src,
-        int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
-        int count) {
-    int i;
-    uint8_t *s;
-    uint8x8_t dblimit, dlimit, dthresh;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    uint8x8_t d16u8, d17u8, d18u8;
-    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
-    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
-    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
-    uint8x8x4_t d4Result;
-    uint8x8x2_t d2Result;
-
-    if (count == 0)
-        return;
-
-    dblimit = vld1_u8(blimit);
-    dlimit = vld1_u8(limit);
-    dthresh = vld1_u8(thresh);
-
-    for (i = 0; i < count; i++) {
-        s = src + (i * (pitch << 3)) - 4;
-
-        d3u8 = vld1_u8(s);
-        s += pitch;
-        d4u8 = vld1_u8(s);
-        s += pitch;
-        d5u8 = vld1_u8(s);
-        s += pitch;
-        d6u8 = vld1_u8(s);
-        s += pitch;
-        d7u8 = vld1_u8(s);
-        s += pitch;
-        d16u8 = vld1_u8(s);
-        s += pitch;
-        d17u8 = vld1_u8(s);
-        s += pitch;
-        d18u8 = vld1_u8(s);
-
-        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
-                          vreinterpret_u32_u8(d7u8));
-        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
-                          vreinterpret_u32_u8(d16u8));
-        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
-                          vreinterpret_u32_u8(d17u8));
-        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
-                          vreinterpret_u32_u8(d18u8));
-
-        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
-                          vreinterpret_u16_u32(d2tmp2.val[0]));
-        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
-                          vreinterpret_u16_u32(d2tmp3.val[0]));
-        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
-                          vreinterpret_u16_u32(d2tmp2.val[1]));
-        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
-                          vreinterpret_u16_u32(d2tmp3.val[1]));
-
-        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
-                         vreinterpret_u8_u16(d2tmp5.val[0]));
-        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
-                         vreinterpret_u8_u16(d2tmp5.val[1]));
-        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
-                          vreinterpret_u8_u16(d2tmp7.val[0]));
-        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
-                          vreinterpret_u8_u16(d2tmp7.val[1]));
-
-        d3u8 = d2tmp8.val[0];
-        d4u8 = d2tmp8.val[1];
-        d5u8 = d2tmp9.val[0];
-        d6u8 = d2tmp9.val[1];
-        d7u8 = d2tmp10.val[0];
-        d16u8 = d2tmp10.val[1];
-        d17u8 = d2tmp11.val[0];
-        d18u8 = d2tmp11.val[1];
-
-        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
-        d4Result.val[0] = d0u8;
-        d4Result.val[1] = d1u8;
-        d4Result.val[2] = d2u8;
-        d4Result.val[3] = d3u8;
-
-        d2Result.val[0] = d4u8;
-        d2Result.val[1] = d5u8;
-
-        s = src - 3;
-        vst4_lane_u8(s, d4Result, 0);
-        s += pitch;
-        vst4_lane_u8(s, d4Result, 1);
-        s += pitch;
-        vst4_lane_u8(s, d4Result, 2);
-        s += pitch;
-        vst4_lane_u8(s, d4Result, 3);
-        s += pitch;
-        vst4_lane_u8(s, d4Result, 4);
-        s += pitch;
-        vst4_lane_u8(s, d4Result, 5);
-        s += pitch;
-        vst4_lane_u8(s, d4Result, 6);
-        s += pitch;
-        vst4_lane_u8(s, d4Result, 7);
-
-        s = src + 1;
-        vst2_lane_u8(s, d2Result, 0);
-        s += pitch;
-        vst2_lane_u8(s, d2Result, 1);
-        s += pitch;
-        vst2_lane_u8(s, d2Result, 2);
-        s += pitch;
-        vst2_lane_u8(s, d2Result, 3);
-        s += pitch;
-        vst2_lane_u8(s, d2Result, 4);
-        s += pitch;
-        vst2_lane_u8(s, d2Result, 5);
-        s += pitch;
-        vst2_lane_u8(s, d2Result, 6);
-        s += pitch;
-        vst2_lane_u8(s, d2Result, 7);
-    }
-    return;
+void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+                                   const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh) {
+  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
 }
+#endif  // HAVE_NEON_ASM
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index e7fb19fd1..124057634 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -232,6 +232,8 @@ typedef struct macroblockd {
 
   int lossless;
   int corrupted;
+
+  struct vpx_internal_error_info *error_info;
 } MACROBLOCKD;
 
 static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 239c0494c..4eb2e6413 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -81,6 +81,7 @@ typedef struct {
   const vp9_prob *prob;
   int len;
   int base_val;
+  const int16_t *cost;
 } vp9_extra_bit;
 
 // indexed by token value
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 7454dd439..7938fc10a 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -99,17 +99,6 @@ typedef enum {
 } TX_TYPE;
 
 typedef enum {
-  UNKNOWN    = 0,
-  BT_601     = 1,  // YUV
-  BT_709     = 2,  // YUV
-  SMPTE_170  = 3,  // YUV
-  SMPTE_240  = 4,  // YUV
-  BT_2020    = 5,  // YUV
-  RESERVED_2 = 6,
-  SRGB       = 7   // RGB
-} COLOR_SPACE;
-
-typedef enum {
   VP9_LAST_FLAG = 1 << 0,
   VP9_GOLD_FLAG = 1 << 1,
   VP9_ALT_FLAG = 1 << 2,
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 1a3fefc5f..b48d52230 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -1517,12 +1517,12 @@ void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   // stage 2
   output[0] = WRAPLOW(step[0] + step[3], bd);
@@ -1562,10 +1562,11 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int dest_stride, int bd) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -1587,12 +1588,12 @@ void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   // stage 2 & stage 3 - even half
   vp9_highbd_idct4(step1, step1, bd);
@@ -1607,8 +1608,8 @@ void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step1[7] = step2[7];
 
   // stage 4
@@ -1653,9 +1654,10 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -1696,10 +1698,10 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), bd);
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), bd);
-  output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
+  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
 }
 
 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1764,14 +1766,14 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd);
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd);
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd);
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd);
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd);
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
 
   // stage 2
   s0 = x0;
@@ -1787,10 +1789,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
   x1 = WRAPLOW(s1 + s3, bd);
   x2 = WRAPLOW(s0 - s2, bd);
   x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -1798,10 +1800,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), bd);
-  x3 = WRAPLOW(dct_const_round_shift(s3), bd);
-  x6 = WRAPLOW(dct_const_round_shift(s6), bd);
-  x7 = WRAPLOW(dct_const_round_shift(s7), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
 
   output[0] = WRAPLOW(x0, bd);
   output[1] = WRAPLOW(-x4, bd);
@@ -1910,23 +1912,23 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -1936,12 +1938,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   step1[8] = WRAPLOW(step2[8] + step2[9], bd);
   step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -1955,12 +1957,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step2[4] = WRAPLOW(step1[4] + step1[5], bd);
   step2[5] = WRAPLOW(step1[4] - step1[5], bd);
   step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -1970,12 +1972,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
@@ -1987,8 +1989,8 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step1[7] = step2[7];
 
   step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2013,12 +2015,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
@@ -2115,22 +2117,22 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd);
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd);
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd);
-  x8  = WRAPLOW(dct_const_round_shift(s0 - s8), bd);
-  x9  = WRAPLOW(dct_const_round_shift(s1 - s9), bd);
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd);
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd);
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd);
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd);
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd);
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd);
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
 
   // stage 2
   s0 = x0;
@@ -2158,14 +2160,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
   x5 = WRAPLOW(s1 - s5, bd);
   x6 = WRAPLOW(s2 - s6, bd);
   x7 = WRAPLOW(s3 - s7, bd);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd);
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd);
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd);
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd);
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd);
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd);
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd);
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd);
+  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
 
   // stage 3
   s0 = x0;
@@ -2189,18 +2191,18 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
   x1 = WRAPLOW(s1 + s3, bd);
   x2 = WRAPLOW(s0 - s2, bd);
   x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
   x8 = WRAPLOW(s8 + s10, bd);
   x9 = WRAPLOW(s9 + s11, bd);
   x10 = WRAPLOW(s8 - s10, bd);
   x11 = WRAPLOW(s9 - s11, bd);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd);
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd);
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd);
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -2212,14 +2214,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), bd);
-  x3 = WRAPLOW(dct_const_round_shift(s3), bd);
-  x6 = WRAPLOW(dct_const_round_shift(s6), bd);
-  x7 = WRAPLOW(dct_const_round_shift(s7), bd);
-  x10 = WRAPLOW(dct_const_round_shift(s10), bd);
-  x11 = WRAPLOW(dct_const_round_shift(s11), bd);
-  x14 = WRAPLOW(dct_const_round_shift(s14), bd);
-  x15 = WRAPLOW(dct_const_round_shift(s15), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
 
   output[0] = WRAPLOW(x0, bd);
   output[1] = WRAPLOW(-x8, bd);
@@ -2306,10 +2308,11 @@ void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -2343,43 +2346,43 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   // stage 2
   step2[0] = step1[0];
@@ -2393,23 +2396,23 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   step2[16] = WRAPLOW(step1[16] + step1[17], bd);
   step2[17] = WRAPLOW(step1[16] - step1[17], bd);
@@ -2436,12 +2439,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
 
   step1[8] = WRAPLOW(step2[8] + step2[9], bd);
   step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -2456,22 +2459,22 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -2480,12 +2483,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step2[4] = WRAPLOW(step1[4] + step1[5], bd);
   step2[5] = WRAPLOW(step1[4] - step1[5], bd);
   step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -2495,12 +2498,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
@@ -2530,8 +2533,8 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step1[7] = step2[7];
 
   step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2547,20 +2550,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -2581,12 +2584,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
@@ -2632,20 +2635,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
@@ -2759,8 +2762,9 @@ void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
   int a1;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 1d8836cf3..6e2551dd4 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -80,13 +80,7 @@ static const tran_high_t sinpi_3_9 = 13377;
 static const tran_high_t sinpi_4_9 = 15212;
 
 static INLINE tran_low_t check_range(tran_high_t input) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
-  // stay within the ranges:
-  // - 8 bit: signed 16 bit integer
-  // - 10 bit: signed 18 bit integer
-  // - 12 bit: signed 20 bit integer
-#elif CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid VP9 input streams, intermediate stage coefficients should always
   // stay within the range of a signed 16 bit integer. Coefficients can go out
   // of this range for invalid/corrupt VP9 streams. However, strictly checking
@@ -95,7 +89,7 @@ static INLINE tran_low_t check_range(tran_high_t input) {
   // --enable-coefficient-range-checking.
   assert(INT16_MIN <= input);
   assert(input <= INT16_MAX);
-#endif
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   return (tran_low_t)input;
 }
 
@@ -104,6 +98,32 @@ static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
   return check_range(rv);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+                                            int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+  // stay within the ranges:
+  // - 8 bit: signed 16 bit integer
+  // - 10 bit: signed 18 bit integer
+  // - 12 bit: signed 20 bit integer
+  const int32_t int_max = (1 << (7 + bd)) - 1;
+  const int32_t int_min = -int_max - 1;
+  assert(int_min <= input);
+  assert(input <= int_max);
+  (void) int_min;
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  (void) bd;
+  return (tran_low_t)input;
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+                                                      int bd) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  return highbd_check_range(rv, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
 
 typedef struct {
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 58b2da75f..2101ec58c 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1149,10 +1149,10 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-static void filter_block_plane_non420(VP9_COMMON *cm,
-                                      struct macroblockd_plane *plane,
-                                      MODE_INFO *mi_8x8,
-                                      int mi_row, int mi_col) {
+void vp9_filter_block_plane_non420(VP9_COMMON *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO *mi_8x8,
+                                   int mi_row, int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
   const int row_step = 1 << ss_y;
@@ -1598,8 +1598,8 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
         if (use_420)
           vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
         else
-          filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                    mi_row, mi_col);
+          vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                        mi_row, mi_col);
       }
     }
   }
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 4c15e6bd4..6d7cabf7c 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -97,6 +97,11 @@ void vp9_filter_block_plane(struct VP9Common *const cm,
                             int mi_row,
                             LOOP_FILTER_MASK *lfm);
 
+void vp9_filter_block_plane_non420(struct VP9Common *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO *mi_8x8,
+                                   int mi_row, int mi_col);
+
 void vp9_loop_filter_init(struct VP9Common *cm);
 
 // Update the loop filter for the current frame.
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/common/vp9_loopfilter_thread.c
index 3d2d0dd2e..2d47daeaf 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/common/vp9_loopfilter_thread.c
@@ -9,14 +9,10 @@
  */
 
 #include "./vpx_config.h"
-
 #include "vpx_mem/vpx_mem.h"
-
+#include "vp9/common/vp9_loopfilter_thread.h"
 #include "vp9/common/vp9_reconinter.h"
 
-#include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_decoder.h"
-
 #if CONFIG_MULTITHREAD
 static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
   const int kMaxTryLocks = 4000;
@@ -88,31 +84,43 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
 }
 
 // Implement row loopfiltering for each thread.
-static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
-                                VP9_COMMON *const cm,
-                                struct macroblockd_plane planes[MAX_MB_PLANE],
-                                int start, int stop, int y_only,
-                                VP9LfSync *const lf_sync) {
+static INLINE
+void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
+                             VP9_COMMON *const cm,
+                             struct macroblockd_plane planes[MAX_MB_PLANE],
+                             int start, int stop, int y_only,
+                             VP9LfSync *const lf_sync) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  int r, c;  // SB row and col
+  const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
+                                 planes[1].subsampling_x == 1);
   const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  int mi_row, mi_col;
 
-  for (r = start; r < stop; r += lf_sync->num_workers) {
-    const int mi_row = r << MI_BLOCK_SIZE_LOG2;
+  for (mi_row = start; mi_row < stop;
+       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
     MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
 
-    for (c = 0; c < sb_cols; ++c) {
-      const int mi_col = c << MI_BLOCK_SIZE_LOG2;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
+      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
       LOOP_FILTER_MASK lfm;
       int plane;
 
       sync_read(lf_sync, r, c);
 
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+      // TODO(JBB): Make setup_mask work for non 420.
+      if (use_420)
+        vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+                       &lfm);
 
       for (plane = 0; plane < num_planes; ++plane) {
-        vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
+        if (use_420)
+          vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
+        else
+          vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                        mi_row, mi_col);
       }
 
       sync_write(lf_sync, r, c, sb_cols);
@@ -123,37 +131,33 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
 // Row-based multi-threaded loopfilter hook
 static int loop_filter_row_worker(VP9LfSync *const lf_sync,
                                   LFWorkerData *const lf_data) {
-  loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                      lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);
+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                          lf_data->start, lf_data->stop, lf_data->y_only,
+                          lf_sync);
   return 1;
 }
 
-// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
-// threads.
-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
-                              YV12_BUFFER_CONFIG *frame,
-                              struct macroblockd_plane planes[MAX_MB_PLANE],
-                              VP9_COMMON *cm,
-                              VP9Worker *workers, int nworkers,
-                              int frame_filter_level,
-                              int y_only) {
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
+                                VP9_COMMON *cm,
+                                struct macroblockd_plane planes[MAX_MB_PLANE],
+                                int start, int stop, int y_only,
+                                VP9Worker *workers, int nworkers,
+                                VP9LfSync *lf_sync) {
   const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
   // Number of superblock rows and cols
   const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  // Decoder may allocate more threads than number of tiles based on user's
+  // input.
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int num_workers = MIN(nworkers, tile_cols);
   int i;
 
-  if (!frame_filter_level) return;
-
   if (!lf_sync->sync_range || cm->last_height != cm->height ||
       num_workers > lf_sync->num_workers) {
     vp9_loop_filter_dealloc(lf_sync);
     vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
 
-  vp9_loop_filter_frame_init(cm, frame_filter_level);
-
   // Initialize cur_sb_col to -1 for all SB rows.
   vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
 
@@ -175,8 +179,8 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
 
     // Loopfilter data
     vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
-    lf_data->start = i;
-    lf_data->stop = sb_rows;
+    lf_data->start = start + i * MI_BLOCK_SIZE;
+    lf_data->stop = stop;
     lf_data->y_only = y_only;
 
     // Start loopfiltering
@@ -193,8 +197,33 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
   }
 }
 
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                              VP9_COMMON *cm,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              int frame_filter_level,
+                              int y_only, int partial_frame,
+                              VP9Worker *workers, int num_workers,
+                              VP9LfSync *lf_sync) {
+  int start_mi_row, end_mi_row, mi_rows_to_filter;
+
+  if (!frame_filter_level) return;
+
+  start_mi_row = 0;
+  mi_rows_to_filter = cm->mi_rows;
+  if (partial_frame && cm->mi_rows > 8) {
+    start_mi_row = cm->mi_rows >> 1;
+    start_mi_row &= 0xfffffff8;
+    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
+  }
+  end_mi_row = start_mi_row + mi_rows_to_filter;
+  vp9_loop_filter_frame_init(cm, frame_filter_level);
+
+  loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,
+                      y_only, workers, num_workers, lf_sync);
+}
+
 // Set up nsync by width.
-static int get_sync_range(int width) {
+static INLINE int get_sync_range(int width) {
   // nsync numbers are picked by testing. For example, for 4k
   // video, using 4 gives best performance.
   if (width < 640)
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/common/vp9_loopfilter_thread.h
index d5810b45b..bca357e52 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/common/vp9_loopfilter_thread.h
@@ -8,21 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_DTHREAD_H_
-#define VP9_DECODER_VP9_DTHREAD_H_
-
+#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
 #include "./vpx_config.h"
+#include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_thread.h"
-#include "vp9/decoder/vp9_reader.h"
 
 struct VP9Common;
-struct VP9Decoder;
-
-typedef struct TileWorkerData {
-  struct VP9Common *cm;
-  vp9_reader bit_reader;
-  DECLARE_ALIGNED(16, struct macroblockd, xd);
-} TileWorkerData;
 
 // Loopfilter row synchronization
 typedef struct VP9LfSyncData {
@@ -43,19 +35,19 @@ typedef struct VP9LfSyncData {
 } VP9LfSync;
 
 // Allocate memory for loopfilter row synchronization.
-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows,
                            int width, int num_workers);
 
 // Deallocate loopfilter synchronization related mutex and data.
 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
 
 // Multi-threaded loopfilter that uses the tile threads.
-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
-                              YV12_BUFFER_CONFIG *frame,
-                              struct macroblockd_plane planes[MAX_MB_PLANE],
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
                               struct VP9Common *cm,
-                              VP9Worker *workers, int num_workers,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
                               int frame_filter_level,
-                              int y_only);
+                              int y_only, int partial_frame,
+                              VP9Worker *workers, int num_workers,
+                              VP9LfSync *lf_sync);
 
-#endif  // VP9_DECODER_VP9_DTHREAD_H_
+#endif  // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index 92650e954..e7ee903c6 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -136,13 +136,27 @@ static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
   }
 }
 
+static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
+  const int adj = qdiff >> MFQE_PRECISION;
+  if (bs == BLOCK_16X16) {
+    *sad_thr = 7 + adj;
+  } else if (bs == BLOCK_32X32) {
+    *sad_thr = 6 + adj;
+  } else {  // BLOCK_64X64
+    *sad_thr = 5 + adj;
+  }
+  *vdiff_thr = 125 + qdiff;
+}
+
 static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
                        const uint8_t *v, int y_stride, int uv_stride,
-                       uint8_t *yd, uint8_t *ud, uint8_t *vd,
-                       int yd_stride, int uvd_stride) {
-  int sad, sad_thr, vdiff;
+                       uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
+                       int uvd_stride, int qdiff) {
+  int sad, sad_thr, vdiff, vdiff_thr;
   uint32_t sse;
 
+  get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
+
   if (bs == BLOCK_16X16) {
     vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
     sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -154,23 +168,18 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
     sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
   }
 
-  if (bs == BLOCK_16X16) {
-    sad_thr = 8;
-  } else if (bs == BLOCK_32X32) {
-    sad_thr = 7;
-  } else {  // BLOCK_64X64
-    sad_thr = 6;
-  }
-
-  // TODO(jackychen): More experiments and remove magic numbers.
   // vdiff > sad * 3 means vdiff should not be too small, otherwise,
   // it might be a lighting change in smooth area. When there is a
   // lighting change in smooth area, it is dangerous to do MFQE.
-  if (sad > 1 && sad < sad_thr && vdiff > sad * 3 && vdiff < 150) {
-    // TODO(jackychen): Add weighted average in the calculation.
-    // Currently, the data is copied from last frame without averaging.
-    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride,
-                  ud, vd, uvd_stride, bs, 0);
+  if (sad > 1 && vdiff > sad * 3) {
+    const int weight = 1 << MFQE_PRECISION;
+    int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
+    // When ifactor equals weight, no MFQE is done.
+    if (ifactor > weight) {
+      ifactor = weight;
+    }
+    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
+                  uvd_stride, bs, ifactor);
   } else {
     // Copy the block from current frame (i.e., no mfqe is done).
     copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
@@ -199,8 +208,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
                            int yd_stride, int uvd_stride) {
   int mi_offset, y_offset, uv_offset;
   const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
-  // TODO(jackychen): Consider how and whether to use qdiff in MFQE.
-  // int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+  const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
   const int bsl = b_width_log2_lookup[bs];
   PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
   const BLOCK_SIZE subsize = get_subsize(bs, partition);
@@ -235,18 +243,18 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
       if (mfqe_decision(mi, mfqe_bs)) {
         // Do mfqe on the first square partition.
         mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
-                   yd, ud, vd, yd_stride, uvd_stride);
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
         // Do mfqe on the second square partition.
         mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                    y_stride, uv_stride, yd + y_offset, ud + uv_offset,
-                   vd + uv_offset, yd_stride, uvd_stride);
+                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
       }
       if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
         // Do mfqe on the first square partition.
         mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                    v + uv_offset * uv_stride, y_stride, uv_stride,
                    yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
-                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
         // Do mfqe on the second square partition.
         mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                    u + uv_offset * uv_stride + uv_offset,
@@ -254,7 +262,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
                    uv_stride, yd + y_offset * yd_stride + y_offset,
                    ud + uv_offset * uvd_stride + uv_offset,
                    vd + uv_offset * uvd_stride + uv_offset,
-                   yd_stride, uvd_stride);
+                   yd_stride, uvd_stride, qdiff);
       }
       break;
     case PARTITION_VERT:
@@ -268,18 +276,18 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
       if (mfqe_decision(mi, mfqe_bs)) {
         // Do mfqe on the first square partition.
         mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
-                   yd, ud, vd, yd_stride, uvd_stride);
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
         // Do mfqe on the second square partition.
         mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                    v + uv_offset * uv_stride, y_stride, uv_stride,
                    yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
-                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
       }
       if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
         // Do mfqe on the first square partition.
         mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                    y_stride, uv_stride, yd + y_offset, ud + uv_offset,
-                   vd + uv_offset, yd_stride, uvd_stride);
+                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
         // Do mfqe on the second square partition.
         mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                    u + uv_offset * uv_stride + uv_offset,
@@ -287,14 +295,14 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
                    uv_stride, yd + y_offset * yd_stride + y_offset,
                    ud + uv_offset * uvd_stride + uv_offset,
                    vd + uv_offset * uvd_stride + uv_offset,
-                   yd_stride, uvd_stride);
+                   yd_stride, uvd_stride, qdiff);
       }
       break;
     case PARTITION_NONE:
       if (mfqe_decision(mi, cur_bs)) {
         // Do mfqe on this partition.
         mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
-                   yd, ud, vd, yd_stride, uvd_stride);
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
       } else {
         // Copy the block from current frame(i.e., no mfqe is done).
         copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index ad91c10dd..1a957bc99 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -76,7 +76,7 @@ typedef struct VP9Common {
   DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
 
-  COLOR_SPACE color_space;
+  vpx_color_space_t color_space;
 
   int width;
   int height;
@@ -272,6 +272,7 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
 
   xd->above_seg_context = cm->above_seg_context;
   xd->mi_stride = cm->mi_stride;
+  xd->error_info = &cm->error;
 }
 
 static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index d2ab875e9..4e9ec0f56 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -224,10 +224,12 @@ specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/;
 $vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon;
 
 add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_8 sse2 neon dspr2/;
+specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/;
+$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon;
 
 add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_8_dual sse2 neon dspr2/;
+specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/;
+$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
 
 add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
 specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/;
@@ -240,10 +242,12 @@ specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/;
 $vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon;
 
 add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_8 sse2 neon dspr2/;
+specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/;
+$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon;
 
 add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_8_dual sse2 neon dspr2/;
+specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/;
+$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
 
 add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
 specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/;
@@ -794,16 +798,16 @@ add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int sourc
 specialize qw/vp9_variance16x32/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64/, "$sse2_x86inc";
+specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
@@ -833,7 +837,7 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_
 specialize qw/vp9_variance4x4/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1094,7 +1098,7 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
 specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_avg_8x8 sse2/;
+specialize qw/vp9_avg_8x8 sse2 neon/;
 
 add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
 specialize qw/vp9_avg_4x4 sse2/;
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
index 8c4a30353..7a20e0a9e 100644
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -36,24 +36,24 @@ void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
   vp9_tile_set_col(tile, cm, col);
 }
 
-void vp9_get_tile_n_bits(int mi_cols,
-                         int *min_log2_tile_cols, int *max_log2_tile_cols) {
-  const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
-  int min_log2 = 0, max_log2 = 0;
-
-  // max
-  while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
-    ++max_log2;
-  --max_log2;
-  if (max_log2 < 0)
-    max_log2 = 0;
-
-  // min
-  while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols)
+static int get_min_log2_tile_cols(const int sb64_cols) {
+  int min_log2 = 0;
+  while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols)
     ++min_log2;
+  return min_log2;
+}
 
-  assert(min_log2 <= max_log2);
+static int get_max_log2_tile_cols(const int sb64_cols) {
+  int max_log2 = 1;
+  while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+    ++max_log2;
+  return max_log2 - 1;
+}
 
-  *min_log2_tile_cols = min_log2;
-  *max_log2_tile_cols = max_log2;
+void vp9_get_tile_n_bits(int mi_cols,
+                         int *min_log2_tile_cols, int *max_log2_tile_cols) {
+  const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
+  *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+  assert(*min_log2_tile_cols <= *max_log2_tile_cols);
 }
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 9677173db..ea4edbffe 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -36,7 +36,6 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_dthread.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/decoder/vp9_reader.h"
 
@@ -463,8 +462,8 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   subsize = get_subsize(bsize, partition);
   uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y];
   if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID)
-    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Invalid block size.");
+    vpx_internal_error(xd->error_info,
+                       VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
   if (subsize < BLOCK_8X8) {
     decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
   } else {
@@ -727,6 +726,8 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   }
   cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
   cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+  cm->frame_bufs[cm->new_fb_idx].buf.color_space =
+      (vpx_color_space_t)cm->color_space;
   cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
 }
 
@@ -781,7 +782,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
             cm->subsampling_x,
             cm->subsampling_y))
       vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Referenced frame has incompatible color space");
+                         "Referenced frame has incompatible color format");
   }
 
   resize_context_buffers(cm, width, height);
@@ -1021,6 +1022,15 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
                             const TileInfo *const tile) {
   int mi_row, mi_col;
 
+  if (setjmp(tile_data->error_info.jmp)) {
+    tile_data->error_info.setjmp = 0;
+    tile_data->xd.corrupted = 1;
+    return 0;
+  }
+
+  tile_data->error_info.setjmp = 1;
+  tile_data->xd.error_info = &tile_data->error_info;
+
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
     vp9_zero(tile_data->xd.left_context);
@@ -1167,6 +1177,10 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
 
     for (; i > 0; --i) {
       VP9Worker *const worker = &pbi->tile_workers[i - 1];
+      // TODO(jzern): The tile may have specific error data associated with
+      // its vpx_internal_error_info which could be propagated to the main info
+      // in cm. Additionally once the threads have been synced and an error is
+      // detected, there's no point in continuing to decode tiles.
       pbi->mb.corrupted |= !winterface->sync(worker);
     }
     if (final_worker > -1) {
@@ -1212,8 +1226,8 @@ static void read_bitdepth_colorspace_sampling(
     cm->use_highbitdepth = 0;
 #endif
   }
-  cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
-  if (cm->color_space != SRGB) {
+  cm->color_space = vp9_rb_read_literal(rb, 3);
+  if (cm->color_space != VPX_CS_SRGB) {
     vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       cm->subsampling_x = vp9_rb_read_bit(rb);
@@ -1311,9 +1325,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
       } else {
         // NOTE: The intra-only frame header does not include the specification
         // of either the color format or color sub-sampling in profile 0. VP9
-        // specifies that the default color space should be YUV 4:2:0 in this
+        // specifies that the default color format should be YUV 4:2:0 in this
         // case (normative).
-        cm->color_space = BT_601;
+        cm->color_space = VPX_CS_BT_601;
         cm->subsampling_y = cm->subsampling_x = 1;
         cm->bit_depth = VPX_BITS_8;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1546,8 +1560,6 @@ void vp9_decode_frame(VP9Decoder *pbi,
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt header length");
 
-  init_macroblockd(cm, &pbi->mb);
-
   cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
                            cm->width == cm->last_width &&
                            cm->height == cm->last_height &&
@@ -1578,9 +1590,9 @@ void vp9_decode_frame(VP9Decoder *pbi,
     if (!xd->corrupted) {
       // If multiple threads are used to decode tiles, then we use those threads
       // to do parallel loopfiltering.
-      vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm,
-                               pbi->tile_workers, pbi->num_tile_workers,
-                               cm->lf.filter_level, 0);
+      vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,
+                               0, 0, pbi->tile_workers, pbi->num_tile_workers,
+                               &pbi->lf_row_sync);
     } else {
       vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                          "Decode failed. Frame data is corrupted.");
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index cff94db2d..1c2603b0a 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -438,7 +438,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
     RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
     xd->block_refs[ref] = ref_buf;
     if ((!vp9_is_valid_scale(&ref_buf->sf)))
-      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+      vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
     vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
                          &ref_buf->sf);
@@ -451,7 +451,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
   if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
     if (bsize < BLOCK_8X8) {
-        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
                            "Invalid usage of segement feature on small blocks");
         return;
     }
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 1406b4034..7bef265b8 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -32,7 +32,6 @@
 #include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_detokenize.h"
-#include "vp9/decoder/vp9_dthread.h"
 
 static void initialize_dec(void) {
   static volatile int init_done = 0;
@@ -63,8 +62,8 @@ static void vp9_dec_free_mi(VP9_COMMON *cm) {
 }
 
 VP9Decoder *vp9_decoder_create() {
-  VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
-  VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
+  VP9Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi));
+  VP9_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
 
   if (!cm)
     return NULL;
@@ -238,12 +237,12 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
 
   // Invalidate these references until the next frame starts.
   for (ref_index = 0; ref_index < 3; ref_index++)
-    cm->frame_refs[ref_index].idx = INT_MAX;
+    cm->frame_refs[ref_index].idx = -1;
 }
 
 int vp9_receive_compressed_data(VP9Decoder *pbi,
                                 size_t size, const uint8_t **psource) {
-  VP9_COMMON *const cm = &pbi->common;
+  VP9_COMMON *volatile const cm = &pbi->common;
   const uint8_t *source = *psource;
   int retcode = 0;
 
@@ -258,7 +257,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
     // TODO(jkoleszar): Error concealment is undefined and non-normative
     // at this point, but if it becomes so, [0] may not always be the correct
     // thing to do here.
-    if (cm->frame_refs[0].idx != INT_MAX)
+    if (cm->frame_refs[0].idx > 0)
       cm->frame_refs[0].buf->corrupted = 1;
   }
 
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 25b7339ed..1415019a1 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -15,12 +15,11 @@
 
 #include "vpx/vpx_codec.h"
 #include "vpx_scale/yv12config.h"
-
+#include "vp9/common/vp9_loopfilter_thread.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
 #include "vp9/common/vp9_thread.h"
-
-#include "vp9/decoder/vp9_dthread.h"
+#include "vp9/decoder/vp9_reader.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,6 +32,13 @@ typedef struct TileData {
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
 } TileData;
 
+typedef struct TileWorkerData {
+  VP9_COMMON *cm;
+  vp9_reader bit_reader;
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  struct vpx_internal_error_info error_info;
+} TileWorkerData;
+
 typedef struct VP9Decoder {
   DECLARE_ALIGNED(16, MACROBLOCKD, mb);
 
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 8704fddac..23d622d70 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -14,6 +14,9 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "vp9/common/vp9_idct.h"
+#endif
 
 #include "vp9/decoder/vp9_detokenize.h"
 
@@ -32,7 +35,7 @@
 #define INCREMENT_COUNT(token)                              \
   do {                                                      \
      if (!cm->frame_parallel_decoding_mode)                 \
-       ++coef_counts[band][ctx][token];                      \
+       ++coef_counts[band][ctx][token];                     \
   } while (0)
 
 static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
@@ -191,10 +194,15 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
     }
     v = (val * dqv) >> dq_shift;
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_VP9_HIGHBITDEPTH
+    dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v),
+                                          cm->bit_depth);
+#else
     dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #else
     dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;
-#endif
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
     token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++c;
     ctx = get_coef_context(nb, token_cache, c);
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
new file mode 100644
index 000000000..f505fcb7a
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+  const uint32x4_t a = vpaddlq_u16(v_16x8);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
+  uint8x8_t v_s0 = vld1_u8(s);
+  const uint8x8_t v_s1 = vld1_u8(s + p);
+  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
+
+  v_s0 = vld1_u8(s + 2 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 3 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 4 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 5 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 6 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 7 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+}
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
index 816fbda1f..b1ad83731 100644
--- a/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 #include "./vp9_rtcd.h"
+#include "./vpx_config.h"
 
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
@@ -28,6 +29,9 @@ enum { kHeight16PlusOne = 17 };
 enum { kWidth32 = 32 };
 enum { kHeight32 = 32 };
 enum { kHeight32PlusOne = 33 };
+enum { kWidth64 = 64 };
+enum { kHeight64 = 64 };
+enum { kHeight64PlusOne = 65 };
 enum { kPixelStepOne = 1 };
 enum { kAlign16 = 16 };
 
@@ -46,9 +50,10 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
   return vget_lane_s32(c, 0);
 }
 
+// w * h must be less than 2048 or local variable v_sum may overflow.
 static void variance_neon_w8(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
-                             int w, int h, unsigned int *sse, int *sum) {
+                             int w, int h, uint32_t *sse, int *sum) {
   int i, j;
   int16x8_t v_sum = vdupq_n_s16(0);
   int32x4_t v_sse_lo = vdupq_n_s32(0);
@@ -88,7 +93,7 @@ unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
                                   unsigned int *sse) {
   int sum;
   variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+  return *sse - (((int64_t)sum * sum) >> 6);  //  >> 6 = / 8 * 8
 }
 
 void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
@@ -103,7 +108,7 @@ unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
                                     unsigned int *sse) {
   int sum;
   variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));
+  return *sse - (((int64_t)sum * sum) >> 8);  //  >> 8 = / 16 * 16
 }
 
 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -205,7 +210,62 @@ unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
                                     unsigned int *sse) {
   int sum;
   variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32));
+  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / 32 * 32
+}
+
+unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, &sse1, &sum1);
+  variance_neon_w8(a + (kHeight32 * a_stride), a_stride,
+                   b + (kHeight32 * b_stride), b_stride, kWidth32, kHeight32,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
+}
+
+unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);
+  variance_neon_w8(a + (kHeight16 * a_stride), a_stride,
+                   b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
+}
+
+unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);
+  variance_neon_w8(a + (kHeight16 * a_stride), a_stride,
+                   b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,
+                   &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (kHeight16 * 2 * a_stride), a_stride,
+                   b + (kHeight16 * 2 * b_stride), b_stride,
+                   kWidth64, kHeight16, &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (kHeight16 * 3 * a_stride), a_stride,
+                   b + (kHeight16 * 3 * b_stride), b_stride,
+                   kWidth64, kHeight16, &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
 }
 
 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
@@ -225,3 +285,21 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
                              kWidth32, BILINEAR_FILTERS_2TAP(yoffset));
   return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);
 }
+
+unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
+                                              int src_stride,
+                                              int xoffset,
+                                              int yoffset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight64 * kWidth64);
+  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight64PlusOne * kWidth64);
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
+                             kHeight64PlusOne, kWidth64,
+                             BILINEAR_FILTERS_2TAP(xoffset));
+  var_filter_block2d_bil_w16(fdata3, temp2, kWidth64, kWidth64, kHeight64,
+                             kWidth64, BILINEAR_FILTERS_2TAP(yoffset));
+  return vp9_variance64x64_neon(temp2, kWidth64, dst, dst_stride, sse);
+}
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 752429c8f..3f4ed94d6 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -297,7 +297,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
     if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
         write_inter_mode(w, mode, inter_probs);
-        ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
       }
     }
 
@@ -320,7 +319,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
           const int j = idy * 2 + idx;
           const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
           write_inter_mode(w, b_mode, inter_probs);
-          ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
           if (b_mode == NEWMV) {
             for (ref = 0; ref < 1 + is_compound; ++ref)
               vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
@@ -1057,7 +1055,7 @@ static void write_bitdepth_colorspace_sampling(
     vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
   }
   vp9_wb_write_literal(wb, cm->color_space, 3);
-  if (cm->color_space != SRGB) {
+  if (cm->color_space != VPX_CS_SRGB) {
     vp9_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
@@ -1172,8 +1170,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
       prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
                        counts->inter_mode[i], INTER_MODES, &header_bc);
 
-    vp9_zero(counts->inter_mode);
-
     if (cm->interp_filter == SWITCHABLE)
       update_switchable_interp_probs(cm, &header_bc, counts);
 
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 506f6de84..41f72f89b 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -170,7 +170,6 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
     vp9_fdct4x4_c(input, output, stride);
   } else {
     tran_low_t out[4 * 4];
-    tran_low_t *outptr = &out[0];
     int i, j;
     tran_low_t temp_in[4], temp_out[4];
     const transform_2d ht = FHT_4[tx_type];
@@ -183,7 +182,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
         temp_in[0] += 1;
       ht.cols(temp_in, temp_out);
       for (j = 0; j < 4; ++j)
-        outptr[j * 4 + i] = temp_out[j];
+        out[j * 4 + i] = temp_out[j];
     }
 
     // Rows
@@ -711,7 +710,6 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
     vp9_fdct8x8_c(input, output, stride);
   } else {
     tran_low_t out[64];
-    tran_low_t *outptr = &out[0];
     int i, j;
     tran_low_t temp_in[8], temp_out[8];
     const transform_2d ht = FHT_8[tx_type];
@@ -722,7 +720,7 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
         temp_in[j] = input[j * stride + i] * 4;
       ht.cols(temp_in, temp_out);
       for (j = 0; j < 8; ++j)
-        outptr[j * 8 + i] = temp_out[j];
+        out[j * 8 + i] = temp_out[j];
     }
 
     // Rows
@@ -1103,7 +1101,6 @@ void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
     vp9_fdct16x16_c(input, output, stride);
   } else {
     tran_low_t out[256];
-    tran_low_t *outptr = &out[0];
     int i, j;
     tran_low_t temp_in[16], temp_out[16];
     const transform_2d ht = FHT_16[tx_type];
@@ -1114,7 +1111,7 @@ void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
         temp_in[j] = input[j * stride + i] * 4;
       ht.cols(temp_in, temp_out);
       for (j = 0; j < 16; ++j)
-        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+        out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
     }
 
     // Rows
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 56ec6b335..a7aaff0cf 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -49,9 +49,7 @@ static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
 }
 
 static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return (4 << b_width_log2_lookup[bs]) *
-         (4 << b_height_log2_lookup[bs]) *
-         (increase_denoising ? 60 : 40);
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 60 : 40);
 }
 
 static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
@@ -60,19 +58,16 @@ static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
       noise_motion_thresh(bs, increase_denoising)) {
     return 0;
   } else {
-    return (4 << b_width_log2_lookup[bs]) *
-           (4 << b_height_log2_lookup[bs]) * 20;
+    return (1 << num_pels_log2_lookup[bs]) * 20;
   }
 }
 
 int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return (4 << b_width_log2_lookup[bs]) *
-         (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
 }
 
 static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return (4 << b_width_log2_lookup[bs]) *
-         (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
 }
 
 // TODO(jackychen): If increase_denoising is enabled in the future,
@@ -195,16 +190,6 @@ static uint8_t *block_start(uint8_t *framebuf, int stride,
   return framebuf + (stride * mi_row * 8) + (mi_col * 8);
 }
 
-static void copy_block(uint8_t *dest, int dest_stride,
-                       const uint8_t *src, int src_stride, BLOCK_SIZE bs) {
-  int r;
-  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-    vpx_memcpy(dest, src, (4 << b_width_log2_lookup[bs]));
-    dest += dest_stride;
-    src += src_stride;
-  }
-}
-
 static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
                                                          MACROBLOCK *mb,
                                                          BLOCK_SIZE bs,
@@ -219,28 +204,18 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
   MV_REFERENCE_FRAME frame;
   MACROBLOCKD *filter_mbd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &filter_mbd->mi[0].src_mi->mbmi;
-
   MB_MODE_INFO saved_mbmi;
   int i, j;
   struct buf_2d saved_dst[MAX_MB_PLANE];
   struct buf_2d saved_pre[MAX_MB_PLANE][2];  // 2 pre buffers
 
-  // We will restore these after motion compensation.
-  saved_mbmi = *mbmi;
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    for (j = 0; j < 2; ++j) {
-      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
-    }
-    saved_dst[i] = filter_mbd->plane[i].dst;
-  }
-
   mv_col = ctx->best_sse_mv.as_mv.col;
   mv_row = ctx->best_sse_mv.as_mv.row;
-
   *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
-
   frame = ctx->best_reference_frame;
 
+  saved_mbmi = *mbmi;
+
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
@@ -261,6 +236,26 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
     ctx->newmv_sse = ctx->zeromv_sse;
   }
 
+  if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+    // Restore everything to its original state
+    *mbmi = saved_mbmi;
+    return COPY_BLOCK;
+  }
+  if (mv_row * mv_row + mv_col * mv_col >
+      8 * noise_motion_thresh(bs, increase_denoising)) {
+    // Restore everything to its original state
+    *mbmi = saved_mbmi;
+    return COPY_BLOCK;
+  }
+
+  // We will restore these after motion compensation.
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
+    }
+    saved_dst[i] = filter_mbd->plane[i].dst;
+  }
+
   // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
   // struct.
   for (j = 0; j < 2; ++j) {
@@ -313,13 +308,6 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
   mv_row = ctx->best_sse_mv.as_mv.row;
   mv_col = ctx->best_sse_mv.as_mv.col;
 
-  if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
-    return COPY_BLOCK;
-  }
-  if (mv_row * mv_row + mv_col * mv_col >
-      8 * noise_motion_thresh(bs, increase_denoising)) {
-    return COPY_BLOCK;
-  }
   return FILTER_BLOCK;
 }
 
@@ -348,9 +336,15 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
   }
 
   if (decision == FILTER_BLOCK) {
-    copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs);
+    vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+                      NULL, 0, NULL, 0,
+                      num_4x4_blocks_wide_lookup[bs] << 2,
+                      num_4x4_blocks_high_lookup[bs] << 2);
   } else {  // COPY_BLOCK
-    copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
+    vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+                      NULL, 0, NULL, 0,
+                      num_4x4_blocks_wide_lookup[bs] << 2,
+                      num_4x4_blocks_high_lookup[bs] << 2);
   }
 }
 
@@ -358,6 +352,7 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
   int r;
   const uint8_t *srcbuf = src.y_buffer;
   uint8_t *destbuf = dest.y_buffer;
+
   assert(dest.y_width == src.y_width);
   assert(dest.y_height == src.y_height);
 
@@ -368,6 +363,15 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
   }
 }
 
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest,
+                              YV12_BUFFER_CONFIG *src) {
+  uint8_t *tmp_buf = dest->y_buffer;
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+  dest->y_buffer = src->y_buffer;
+  src->y_buffer = tmp_buf;
+}
+
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
@@ -377,28 +381,32 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
   if (frame_type == KEY_FRAME) {
     int i;
     // Start at 1 so as not to overwrite the INTRA_FRAME
-    for (i = 1; i < MAX_REF_FRAMES; ++i) {
+    for (i = 1; i < MAX_REF_FRAMES; ++i)
       copy_frame(denoiser->running_avg_y[i], src);
-    }
-  } else {  /* For non key frames */
-    if (refresh_alt_ref_frame) {
-      copy_frame(denoiser->running_avg_y[ALTREF_FRAME],
-                 denoiser->running_avg_y[INTRA_FRAME]);
-    }
-    if (refresh_golden_frame) {
-      copy_frame(denoiser->running_avg_y[GOLDEN_FRAME],
-                 denoiser->running_avg_y[INTRA_FRAME]);
-    }
-    if (refresh_last_frame) {
-      copy_frame(denoiser->running_avg_y[LAST_FRAME],
-                 denoiser->running_avg_y[INTRA_FRAME]);
-    }
+    return;
+  }
+
+  /* For non key frames */
+  if (refresh_alt_ref_frame) {
+    swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
+  }
+  if (refresh_golden_frame) {
+    swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
+  }
+  if (refresh_last_frame) {
+    swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
   }
 }
 
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
   ctx->zeromv_sse = UINT_MAX;
-  ctx->newmv_sse = UINT_MAX;
+  // This should be initialized as zero since mode search stage might skip
+  // NEWMV mode if inferred motion vector modes provide sufficiently good
+  // prediction quality.
+  ctx->newmv_sse = 0;
 }
 
 void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
@@ -458,12 +466,14 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
   make_grayscale(&denoiser->running_avg_y[i]);
 #endif
   denoiser->increase_denoising = 0;
+  denoiser->frame_buffer_initialized = 1;
 
   return 0;
 }
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser) {
   int i;
+  denoiser->frame_buffer_initialized = 0;
   if (denoiser == NULL) {
     return;
   }
@@ -483,15 +493,13 @@ static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
   uint8_t *u = yuv->u_buffer;
   uint8_t *v = yuv->v_buffer;
 
-  // The '/2's are there because we have a 440 buffer, but we want to output
-  // 420.
-  for (r = 0; r < yuv->uv_height / 2; ++r) {
-    for (c = 0; c < yuv->uv_width / 2; ++c) {
+  for (r = 0; r < yuv->uv_height; ++r) {
+    for (c = 0; c < yuv->uv_width; ++c) {
       u[c] = UINT8_MAX / 2;
       v[c] = UINT8_MAX / 2;
     }
-    u += yuv->uv_stride + yuv->uv_width / 2;
-    v += yuv->uv_stride + yuv->uv_width / 2;
+    u += yuv->uv_stride;
+    v += yuv->uv_stride;
   }
 }
 #endif
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index 421dfcd0c..8eb5da1b8 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -29,6 +29,7 @@ typedef struct vp9_denoiser {
   YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
   YV12_BUFFER_CONFIG mc_running_avg_y;
   int increase_denoising;
+  int frame_buffer_initialized;
 } VP9_DENOISER;
 
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 4c948237d..756052771 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -55,9 +55,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData * td,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
                               PICK_MODE_CONTEXT *ctx);
 
-// Motion vector component magnitude threshold for defining fast motion.
-#define FAST_MOTION_MV_THRESH 24
-
 // This is used as a reference when computing the source variance for the
 //  purposes of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
@@ -1010,22 +1007,20 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MODE_INFO *const mi = xd->mi[0].src_mi;
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
 
   if (!frame_is_intra_only(cm)) {
+    FRAME_COUNTS *const counts = td->counts;
+    const int inter_block = is_inter_block(mbmi);
     const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
                                                      SEG_LVL_REF_FRAME);
     if (!seg_ref_active) {
-      FRAME_COUNTS *const counts = td->counts;
-      const int inter_block = is_inter_block(mbmi);
-
       counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
-
       // If the segment reference feature is enabled we have only a single
       // reference frame allowed for the segment so exclude it from
       // the reference frame counts used to work out probabilities.
       if (inter_block) {
         const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-
         if (cm->reference_mode == REFERENCE_MODE_SELECT)
           counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
                             [has_second_ref(mbmi)]++;
@@ -1042,6 +1037,25 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) {
         }
       }
     }
+    if (inter_block &&
+        !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
+      if (bsize >= BLOCK_8X8) {
+        const PREDICTION_MODE mode = mbmi->mode;
+        ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+      } else {
+        const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+        const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+        int idx, idy;
+        for (idy = 0; idy < 2; idy += num_4x4_h) {
+          for (idx = 0; idx < 2; idx += num_4x4_w) {
+            const int j = idy * 2 + idx;
+            const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+            ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+          }
+        }
+      }
+    }
   }
 }
 
@@ -1410,6 +1424,11 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
       const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
       ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
     }
+
+    if (mbmi->sb_type < BLOCK_8X8) {
+      mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+      mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+    }
   }
 
   if (cm->use_prev_frame_mvs) {
@@ -2705,9 +2724,12 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi,
     hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
   else if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
     set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
-  else
+  else if (bsize >= BLOCK_8X8)
     vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
                         rd_cost, bsize, ctx);
+  else
+    vp9_pick_inter_mode_sub8x8(cpi, x, tile_data, mi_row, mi_col,
+                               rd_cost, bsize, ctx);
 
   duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
 
@@ -3219,7 +3241,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
       pc_tree->vertical[0].skip = x->skip;
       encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
                   subsize, &pc_tree->vertical[0]);
-      if (mi_col + hbs < cm->mi_cols) {
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
         pc_tree->vertical[1].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
                             dummy_cost, subsize, &pc_tree->vertical[1]);
@@ -3240,7 +3262,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
       encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
                   subsize, &pc_tree->horizontal[0]);
 
-      if (mi_row + hbs < cm->mi_rows) {
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
         pc_tree->horizontal[1].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
                             dummy_cost, subsize, &pc_tree->horizontal[1]);
@@ -3312,9 +3334,10 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi,
     // Set the partition type of the 64X64 block
     switch (sf->partition_search_type) {
       case VAR_BASED_PARTITION:
-        // TODO(jingning) Only key frame coding supports sub8x8 block at this
-        // point. To be continued to enable sub8x8 block mode decision for
-        // P frames.
+        // TODO(jingning, marpan): The mode decision and encoding process
+        // support both intra and inter sub8x8 block coding for RTC mode.
+        // Tune the thresholds accordingly to use sub8x8 block coding for
+        // coding performance improvement.
         choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
         nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                             BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 9c29eb438..70b804e31 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -93,7 +93,7 @@ typedef struct vp9_token_state {
   int           rate;
   int           error;
   int           next;
-  signed char   token;
+  int16_t       token;
   short         qc;
 } vp9_token_state;
 
@@ -147,9 +147,15 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   int next = eob, sz = 0;
   int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
   int64_t rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1, t0, t1;
+  int rate0, rate1, error0, error1;
+  int16_t t0, t1;
+  EXTRABIT e0;
   int best, band, pt, i, final_eob;
-  const int16_t *dct_value_cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+  const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
@@ -166,17 +172,6 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   tokens[eob][0].qc = 0;
   tokens[eob][1] = tokens[eob][0];
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->bd == 12) {
-    dct_value_cost = vp9_dct_value_cost_high12_ptr;
-  } else if (xd->bd == 10) {
-    dct_value_cost = vp9_dct_value_cost_high10_ptr;
-  } else {
-    dct_value_cost = vp9_dct_value_cost_ptr;
-  }
-#else
-  dct_value_cost = vp9_dct_value_cost_ptr;
-#endif
   for (i = 0; i < eob; i++)
     token_cache[scan[i]] =
         vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
@@ -193,7 +188,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       /* Evaluate the first possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
-      t0 = vp9_get_token(x);
+      vp9_get_token_extra(x, &t0, &e0);
       /* Consider both possible successor states. */
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -206,7 +201,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = dct_value_cost[x];
+      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
       dx = mul * (dqcoeff[rc] - coeff[rc]);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -244,8 +239,10 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
          */
         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+        e0 = 0;
       } else {
-        t0 = t1 = vp9_get_token(x);
+        vp9_get_token_extra(x, &t0, &e0);
+        t1 = t0;
       }
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -264,7 +261,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = dct_value_cost[x];
+      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
 
       if (shortcut) {
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 430ad17e0..35fea57f5 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -186,7 +186,6 @@ void vp9_initialize_enc(void) {
   if (!init_done) {
     vp9_rtcd();
     vp9_init_intra_predictors();
-    vp9_tokenize_initialize();
     vp9_init_me_luts();
     vp9_rc_init_minq_luts();
     vp9_entropy_mv_init();
@@ -608,7 +607,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
 #if CONFIG_VP9_HIGHBITDEPTH
   cm->use_highbitdepth = oxcf->use_highbitdepth;
 #endif
-  cm->color_space = UNKNOWN;
+  cm->color_space = oxcf->color_space;
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
@@ -1265,6 +1264,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   if (cm->profile != oxcf->profile)
     cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
+  cm->color_space = oxcf->color_space;
 
   if (cm->profile <= PROFILE_1)
     assert(cm->bit_depth == VPX_BITS_8);
@@ -1348,17 +1348,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
 #endif
-
-#if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
-                       cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                       cm->use_highbitdepth,
-#endif
-                       VP9_ENC_BORDER_IN_PIXELS);
-  }
-#endif
 }
 
 #ifndef M_LOG2_E
@@ -1406,8 +1395,8 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
 
 VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
   unsigned int i;
-  VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP));
-  VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL;
+  VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+  VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
 
   if (!cm)
     return NULL;
@@ -1792,14 +1781,12 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
   }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    vp9_denoiser_free(&(cpi->denoiser));
-  }
+  vp9_denoiser_free(&(cpi->denoiser));
 #endif
 
   for (t = 0; t < cpi->num_workers; ++t) {
     VP9Worker *const worker = &cpi->workers[t];
-    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
 
     // Deallocate allocated threads.
     vp9_get_worker_interface()->end(worker);
@@ -1810,11 +1797,13 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
       vp9_free_pc_tree(thread_data->td);
       vpx_free(thread_data->td);
     }
-
-    vpx_free(worker->data1);
   }
+  vpx_free(cpi->tile_thr_data);
   vpx_free(cpi->workers);
 
+  if (cpi->num_workers > 1)
+    vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+
   dealloc_compressor_data(cpi);
 
   for (i = 0; i < sizeof(cpi->mbgraph_stats) /
@@ -2140,19 +2129,19 @@ void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
   } while (--h);
 
   src = s->u_buffer;
-  h = s->uv_height / 2;
+  h = s->uv_height;
 
   do {
-    fwrite(src, s->uv_width / 2, 1, f);
-    src += s->uv_stride + s->uv_width / 2;
+    fwrite(src, s->uv_width, 1, f);
+    src += s->uv_stride;
   } while (--h);
 
   src = s->v_buffer;
-  h = s->uv_height / 2;
+  h = s->uv_height;
 
   do {
-    fwrite(src, s->uv_width / 2, 1, f);
-    src += s->uv_stride + s->uv_width / 2;
+    fwrite(src, s->uv_width, 1, f);
+    src += s->uv_stride;
   } while (--h);
 }
 #endif
@@ -2450,7 +2439,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   }
 
   if (lf->filter_level > 0) {
-    vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+    if (cpi->num_workers > 1)
+      vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
+                               lf->filter_level, 0, 0,
+                               cpi->workers, cpi->num_workers,
+                               &cpi->lf_row_sync);
+    else
+      vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
   }
 
   vp9_extend_frame_inner_borders(cm->frame_to_show);
@@ -2550,7 +2545,7 @@ static void full_to_model_counts(vp9_coeff_count_model *model_count,
 static void output_frame_level_debug_stats(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
-  int recon_err;
+  int64_t recon_err;
 
   vp9_clear_system_state();
 
@@ -2562,7 +2557,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
         "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
         "%6d %6d %5d %5d %5d "
         "%10"PRId64" %10.3lf"
-        "%10lf %8u %10d %10d %10d\n",
+        "%10lf %8u %10"PRId64" %10d %10d\n",
         cpi->common.current_video_frame, cpi->rc.this_frame_target,
         cpi->rc.projected_frame_size,
         cpi->rc.projected_frame_size / cpi->common.MBs,
@@ -2891,15 +2886,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
            rc->this_key_frame_forced &&
            (rc->projected_frame_size < rc->max_frame_bandwidth)) {
         int last_q = q;
-        int kf_err;
+        int64_t kf_err;
 
-        int high_err_target = cpi->ambient_err;
-        int low_err_target = cpi->ambient_err >> 1;
+        int64_t high_err_target = cpi->ambient_err;
+        int64_t low_err_target = cpi->ambient_err >> 1;
 
 #if CONFIG_VP9_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
-          kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm),
-                                        cm->bit_depth);
+          kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         } else {
           kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         }
@@ -2920,7 +2914,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
           q_high = q > q_low ? q - 1 : q_low;
 
           // Adjust Q
-          q = (q * high_err_target) / kf_err;
+          q = (int)((q * high_err_target) / kf_err);
           q = MIN(q, (q_high + q_low) >> 1);
         } else if (kf_err < low_err_target &&
                    rc->projected_frame_size >= frame_under_shoot_limit) {
@@ -2929,7 +2923,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
           q_low = q < q_high ? q + 1 : q_high;
 
           // Adjust Q
-          q = (q * low_err_target) / kf_err;
+          q = (int)((q * low_err_target) / kf_err);
           q = MIN(q, (q_high + q_low + 1) >> 1);
         }
 
@@ -3257,8 +3251,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
       cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
-                                              get_frame_new_buffer(cm),
-                                              cm->bit_depth);
+                                              get_frame_new_buffer(cm));
     } else {
       cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
     }
@@ -3409,6 +3402,20 @@ static void check_initial_width(VP9_COMP *cpi,
   }
 }
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      !cpi->denoiser.frame_buffer_initialized) {
+    vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+                       cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       cm->use_highbitdepth,
+#endif
+                       VP9_ENC_BORDER_IN_PIXELS);
+  }
+}
+#endif
 
 int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
@@ -3425,6 +3432,9 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
   check_initial_width(cpi, subsampling_x, subsampling_y);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
   vpx_usec_timer_start(&timer);
 
   if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags))
@@ -3435,13 +3445,13 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
   if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
       (subsampling_x != 1 || subsampling_y != 1)) {
     vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
-                       "Non-4:2:0 color space requires profile 1 or 3");
+                       "Non-4:2:0 color format requires profile 1 or 3");
     res = -1;
   }
   if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
       (subsampling_x == 1 && subsampling_y == 1)) {
     vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
-                       "4:2:0 color space requires profile 0 or 2");
+                       "4:2:0 color format requires profile 0 or 2");
     res = -1;
   }
 
@@ -3999,6 +4009,10 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
   check_initial_width(cpi, 1, 1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+
   if (width) {
     cm->width = width;
     if (cm->width > cpi->initial_width) {
@@ -4027,41 +4041,25 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc) {
   return;
 }
 
-int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) {
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b) {
   assert(a->y_crop_width == b->y_crop_width);
   assert(a->y_crop_height == b->y_crop_height);
 
-  return (int)get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                      a->y_crop_width, a->y_crop_height);
+  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                 a->y_crop_width, a->y_crop_height);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                         const YV12_BUFFER_CONFIG *b,
-                         vpx_bit_depth_t bit_depth) {
-  unsigned int sse;
-  int sum;
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b) {
   assert(a->y_crop_width == b->y_crop_width);
   assert(a->y_crop_height == b->y_crop_height);
   assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
   assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  switch (bit_depth) {
-    case VPX_BITS_8:
-      highbd_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                      a->y_crop_width, a->y_crop_height, &sse, &sum);
-      return (int) sse;
-    case VPX_BITS_10:
-      highbd_10_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                         a->y_crop_width, a->y_crop_height, &sse, &sum);
-      return (int) sse;
-    case VPX_BITS_12:
-      highbd_12_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                         a->y_crop_width, a->y_crop_height, &sse, &sum);
-      return (int) sse;
-    default:
-      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
-      return -1;
-  }
+
+  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                        a->y_crop_width, a->y_crop_height);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 7872e2cc1..cf269c108 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -19,6 +19,7 @@
 
 #include "vp9/common/vp9_ppflags.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_loopfilter_thread.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_thread.h"
 
@@ -36,6 +37,7 @@
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_variance.h"
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #include "vp9/encoder/vp9_denoiser.h"
 #endif
@@ -231,6 +233,7 @@ typedef struct VP9EncoderConfig {
 #if CONFIG_VP9_HIGHBITDEPTH
   int use_highbitdepth;
 #endif
+  vpx_color_space_t color_space;
 } VP9EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
@@ -261,6 +264,8 @@ typedef struct ThreadData {
   PC_TREE *pc_root;
 } ThreadData;
 
+struct EncWorkerData;
+
 typedef struct VP9_COMP {
   QUANTS quants;
   ThreadData td;
@@ -304,7 +309,7 @@ typedef struct VP9_COMP {
   unsigned int tok_count[4][1 << 6];
 
   // Ambient reconstruction err target for force key frames
-  int ambient_err;
+  int64_t ambient_err;
 
   RD_OPT rd;
 
@@ -446,6 +451,8 @@ typedef struct VP9_COMP {
   // Multi-threading
   int num_workers;
   VP9Worker *workers;
+  struct EncWorkerData *tile_thr_data;
+  VP9LfSync lf_row_sync;
 } VP9_COMP;
 
 void vp9_initialize_enc(void);
@@ -534,11 +541,10 @@ static INLINE int allocated_tokens(TileInfo tile) {
   return get_token_alloc(tile_mb_rows, tile_mb_cols);
 }
 
-int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 #if CONFIG_VP9_HIGHBITDEPTH
-int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                         const YV12_BUFFER_CONFIG *b,
-                         vpx_bit_depth_t bit_depth);
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_alloc_compressor_data(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index daf3da44c..12fb4d107 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -167,23 +167,24 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
     CHECK_MEM_ERROR(cm, cpi->workers,
                     vpx_malloc(num_workers * sizeof(*cpi->workers)));
 
+    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+                    vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
     for (i = 0; i < num_workers; i++) {
       VP9Worker *const worker = &cpi->workers[i];
-      EncWorkerData *thread_data;
+      EncWorkerData *thread_data = &cpi->tile_thr_data[i];
 
       ++cpi->num_workers;
-
       winterface->init(worker);
-      CHECK_MEM_ERROR(cm, worker->data1,
-                      (EncWorkerData*)vpx_calloc(1, sizeof(EncWorkerData)));
-      thread_data = (EncWorkerData*)worker->data1;
 
       if (i < num_workers - 1) {
       thread_data->cpi = cpi;
 
       // Allocate thread data.
       CHECK_MEM_ERROR(cm, thread_data->td,
-                      vpx_calloc(1, sizeof(*thread_data->td)));
+                      vpx_memalign(32, sizeof(*thread_data->td)));
+      vp9_zero(*thread_data->td);
+
       // Set up pc_tree.
       thread_data->td->leaf_tree = NULL;
       thread_data->td->pc_tree = NULL;
@@ -203,17 +204,18 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
         thread_data->td = &cpi->td;
       }
 
-      // data2 is unused.
-      worker->data2 = NULL;
-
       winterface->sync(worker);
-      worker->hook = (VP9WorkerHook)enc_worker_hook;
     }
   }
 
   for (i = 0; i < num_workers; i++) {
     VP9Worker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+    EncWorkerData *thread_data;
+
+    worker->hook = (VP9WorkerHook)enc_worker_hook;
+    worker->data1 = &cpi->tile_thr_data[i];
+    worker->data2 = NULL;
+    thread_data = (EncWorkerData*)worker->data1;
 
     // Before encoding a frame, copy the thread data from cpi.
     thread_data->td->mb = cpi->td.mb;
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 81334e448..a95f0f46d 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -33,16 +33,23 @@ static int get_max_filter_level(const VP9_COMP *cpi) {
 }
 
 
-static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
-                            int filt_level, int partial_frame) {
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+                                VP9_COMP *const cpi,
+                                int filt_level, int partial_frame) {
   VP9_COMMON *const cm = &cpi->common;
-  int filt_err;
+  int64_t filt_err;
+
+  if (cpi->num_workers > 1)
+    vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+                             filt_level, 1, partial_frame,
+                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+  else
+    vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                          1, partial_frame);
 
-  vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
-                        partial_frame);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
-    filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show, cm->bit_depth);
+    filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);
   } else {
     filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
   }
@@ -63,14 +70,15 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
   const int min_filter_level = 0;
   const int max_filter_level = get_max_filter_level(cpi);
   int filt_direction = 0;
-  int best_err, filt_best;
+  int64_t best_err;
+  int filt_best;
 
   // Start the search at the previous frame filter level unless it is now out of
   // range.
   int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
   int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
   // Sum squared error at each filter level
-  int ss_err[MAX_LOOP_FILTER + 1];
+  int64_t ss_err[MAX_LOOP_FILTER + 1];
 
   // Set each entry to -1
   vpx_memset(ss_err, 0xFF, sizeof(ss_err));
@@ -87,7 +95,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
     const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
 
     // Bias against raising loop filter in favor of lowering it.
-    int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+    int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
     if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
       bias = (bias * cpi->twopass.section_intra_rating) / 20;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 319a47833..5acfcc51d 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -540,8 +540,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
-  const int8_t segment_id = mbmi->segment_id;
-  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+  const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize];
   const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
   INTERP_FILTER filter_ref;
   const int bsl = mi_width_log2_lookup[bsize];
@@ -605,7 +604,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                       tx_mode_to_biggest_tx_size[cm->tx_mode]);
   mbmi->interp_filter = cm->interp_filter == SWITCHABLE ?
                         EIGHTTAP : cm->interp_filter;
-  mbmi->segment_id = segment_id;
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  vp9_denoiser_reset_frame_stats(ctx);
+#endif
 
   for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
@@ -662,6 +664,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
 
     mbmi->ref_frame[0] = ref_frame;
+    set_ref_ptrs(cm, xd, ref_frame, NONE);
 
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate_mv = 0;
@@ -946,3 +949,247 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   *rd_cost = best_rdc;
 }
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+                                TileDataEnc *tile_data,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
+                                BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  const struct segmentation *const seg = &cm->seg;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
+  MV_REFERENCE_FRAME best_ref_frame = NONE;
+  unsigned char segment_id = mbmi->segment_id;
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int64_t best_rd = INT64_MAX;
+  b_mode_info bsi[MAX_REF_FRAMES][4];
+  int ref_frame_skip_mask = 0;
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int idx, idy;
+
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  ctx->pred_pixel_ready = 0;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+    int_mv dummy_mv[2];
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+      int_mv *const candidates = mbmi->ref_mvs[ref_frame];
+      const struct scale_factors *const sf =
+                             &cm->frame_refs[ref_frame - 1].sf;
+      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+                           sf, sf);
+      vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0].src_mi, ref_frame,
+                       candidates, mi_row, mi_col);
+
+      vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+                            &dummy_mv[0], &dummy_mv[1]);
+    } else {
+      ref_frame_skip_mask |= (1 << ref_frame);
+    }
+  }
+
+  mbmi->sb_type = bsize;
+  mbmi->tx_size = TX_4X4;
+  mbmi->uv_mode = DC_PRED;
+  mbmi->ref_frame[0] = LAST_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                        : cm->interp_filter;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+    int64_t this_rd = 0;
+    int plane;
+
+    if (ref_frame_skip_mask & (1 << ref_frame))
+      continue;
+
+    // TODO(jingning, agrange): Scaling reference frame not supported for
+    // sub8x8 blocks. Is this supported now?
+    if (ref_frame > INTRA_FRAME &&
+        vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+      continue;
+
+    // If the segment reference frame feature is enabled....
+    // then do nothing if the current ref frame is not allowed..
+    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+      continue;
+
+    mbmi->ref_frame[0] = ref_frame;
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    // Select prediction reference frames.
+    for (plane = 0; plane < MAX_MB_PLANE; plane++)
+      xd->plane[plane].pre[0] = yv12_mb[ref_frame][plane];
+
+    for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+      for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+        int_mv b_mv[MB_MODE_COUNT];
+        int64_t b_best_rd = INT64_MAX;
+        const int i = idy * 2 + idx;
+        PREDICTION_MODE this_mode;
+        int b_rate = 0;
+        int64_t b_dist = 0;
+        RD_COST this_rdc;
+        unsigned int var_y, sse_y;
+
+        struct macroblock_plane *p = &x->plane[0];
+        struct macroblockd_plane *pd = &xd->plane[0];
+
+        const struct buf_2d orig_src = p->src;
+        const struct buf_2d orig_dst = pd->dst;
+        struct buf_2d orig_pre[2];
+        vpx_memcpy(orig_pre, xd->plane[0].pre, sizeof(orig_pre));
+
+        // set buffer pointers for sub8x8 motion search.
+        p->src.buf =
+            &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+        pd->dst.buf =
+            &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+        pd->pre[0].buf =
+            &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8,
+                                                    i, pd->pre[0].stride)];
+
+        b_mv[ZEROMV].as_int = 0;
+        b_mv[NEWMV].as_int = INVALID_MV;
+        vp9_append_sub8x8_mvs_for_idx(cm, xd, tile_info, i, 0, mi_row, mi_col,
+                                      &b_mv[NEARESTMV],
+                                      &b_mv[NEARMV]);
+
+        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          xd->mi[0].bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int;
+
+          if (this_mode == NEWMV) {
+            const int step_param = cpi->sf.mv.fullpel_search_step_param;
+            MV mvp_full;
+            MV tmp_mv;
+            int cost_list[5];
+            const int tmp_col_min = x->mv_col_min;
+            const int tmp_col_max = x->mv_col_max;
+            const int tmp_row_min = x->mv_row_min;
+            const int tmp_row_max = x->mv_row_max;
+            int dummy_dist;
+
+            if (i == 0) {
+              mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
+              mvp_full.col = b_mv[NEARESTMV].as_mv.col >> 3;
+            } else {
+              mvp_full.row = xd->mi[0].bmi[0].as_mv[0].as_mv.row >> 3;
+              mvp_full.col = xd->mi[0].bmi[0].as_mv[0].as_mv.col >> 3;
+            }
+
+            vp9_set_mv_search_range(x, &mbmi->ref_mvs[0]->as_mv);
+
+            vp9_full_pixel_search(
+                cpi, x, bsize, &mvp_full, step_param, x->sadperbit4,
+                cond_cost_list(cpi, cost_list),
+                &mbmi->ref_mvs[ref_frame][0].as_mv, &tmp_mv,
+                INT_MAX, 0);
+
+            x->mv_col_min = tmp_col_min;
+            x->mv_col_max = tmp_col_max;
+            x->mv_row_min = tmp_row_min;
+            x->mv_row_max = tmp_row_max;
+
+            // calculate the bit cost on motion vector
+            mvp_full.row = tmp_mv.row * 8;
+            mvp_full.col = tmp_mv.col * 8;
+
+            b_rate += vp9_mv_bit_cost(&mvp_full,
+                                      &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                      x->nmvjointcost, x->mvcost,
+                                      MV_COST_WEIGHT);
+
+            b_rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                          [INTER_OFFSET(NEWMV)];
+            if (RDCOST(x->rdmult, x->rddiv, b_rate, 0) > b_best_rd)
+              continue;
+
+            cpi->find_fractional_mv_step(x, &tmp_mv,
+                                         &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                         cpi->common.allow_high_precision_mv,
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[bsize],
+                                         cpi->sf.mv.subpel_force_stop,
+                                         cpi->sf.mv.subpel_iters_per_step,
+                                         cond_cost_list(cpi, cost_list),
+                                         x->nmvjointcost, x->mvcost,
+                                         &dummy_dist,
+                                         &x->pred_sse[ref_frame], NULL, 0, 0);
+
+            xd->mi[0].bmi[i].as_mv[0].as_mv = tmp_mv;
+          }
+
+          vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+                                    pd->dst.buf, pd->dst.stride,
+                                    &xd->mi[0].bmi[i].as_mv[0].as_mv,
+                                    &xd->block_refs[0]->sf,
+                                    4 * num_4x4_blocks_wide,
+                                    4 * num_4x4_blocks_high, 0,
+                                    vp9_get_interp_kernel(mbmi->interp_filter),
+                                    MV_PRECISION_Q3,
+                                    mi_col * MI_SIZE + 4 * (i & 0x01),
+                                    mi_row * MI_SIZE + 4 * (i >> 1));
+          model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                            &var_y, &sse_y);
+
+          this_rdc.rate += b_rate;
+          this_rdc.dist += b_dist;
+          this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                   this_rdc.rate, this_rdc.dist);
+          if (this_rdc.rdcost < b_best_rd) {
+            b_best_rd = this_rdc.rdcost;
+            bsi[ref_frame][i].as_mode = this_mode;
+            bsi[ref_frame][i].as_mv[0].as_mv = xd->mi[0].bmi[i].as_mv[0].as_mv;
+          }
+        }  // mode search
+
+        // restore source and prediction buffer pointers.
+        p->src = orig_src;
+        pd->pre[0] = orig_pre[0];
+        pd->dst = orig_dst;
+        this_rd += b_best_rd;
+
+        xd->mi[0].bmi[i] = bsi[ref_frame][i];
+        if (num_4x4_blocks_wide > 1)
+          xd->mi[0].bmi[i + 1] = xd->mi[0].bmi[i];
+        if (num_4x4_blocks_high > 1)
+          xd->mi[0].bmi[i + 2] = xd->mi[0].bmi[i];
+      }
+    }  // loop through sub8x8 blocks
+
+    if (this_rd < best_rd) {
+      best_rd = this_rd;
+      best_ref_frame = ref_frame;
+    }
+  }  // reference frames
+
+  mbmi->tx_size = TX_4X4;
+  mbmi->ref_frame[0] = best_ref_frame;
+  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      const int block = idy * 2 + idx;
+      xd->mi[0].bmi[block] = bsi[best_ref_frame][block];
+      if (num_4x4_blocks_wide > 1)
+        xd->mi[0].bmi[block + 1] = bsi[best_ref_frame][block];
+      if (num_4x4_blocks_high > 1)
+        xd->mi[0].bmi[block + 2] = bsi[best_ref_frame][block];
+    }
+  }
+  mbmi->mode = xd->mi[0].bmi[3].as_mode;
+  ctx->mic = *(xd->mi[0].src_mi);
+  ctx->skip_txfm[0] = 0;
+  ctx->skip = 0;
+  // Dummy assignment for speed -5. No effect in speed -6.
+  rd_cost->rdcost = best_rd;
+}
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index 57a1cf937..11f44099c 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -26,6 +26,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                          BLOCK_SIZE bsize,
                          PICK_MODE_CONTEXT *ctx);
 
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+                                TileDataEnc *tile_data,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
+                                BLOCK_SIZE bsize,
+                                PICK_MODE_CONTEXT *ctx);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 34d49f058..375407d44 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -516,6 +516,20 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
   }
 }
 
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+                            int raster_block, int stride) {
+  const int bw = b_width_log2_lookup[plane_bsize];
+  const int y = 4 * (raster_block >> bw);
+  const int x = 4 * (raster_block & ((1 << bw) - 1));
+  return y * stride + x;
+}
+
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                       int raster_block, int16_t *base) {
+  const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  return base + vp9_raster_block_offset(plane_bsize, raster_block, stride);
+}
+
 const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
                                                    int ref_frame) {
   const VP9_COMMON *const cm = &cpi->common;
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index e1593af5a..59a87cf98 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -141,6 +141,12 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
 int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
                             const MACROBLOCKD *const xd);
 
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+                            int raster_block, int stride);
+
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                       int raster_block, int16_t *base);
+
 const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
                                                    int ref_frame);
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index ded082f86..a183fdc69 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -129,19 +129,6 @@ static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
   {{INTRA_FRAME,  NONE}},
 };
 
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
-                               int raster_block, int stride) {
-  const int bw = b_width_log2_lookup[plane_bsize];
-  const int y = 4 * (raster_block >> bw);
-  const int x = 4 * (raster_block & ((1 << bw) - 1));
-  return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
-                                          int raster_block, int16_t *base) {
-  const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                            int m, int n, int min_plane, int max_plane) {
   int i;
@@ -360,6 +347,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
   uint8_t token_cache[32 * 32];
   int pt = combine_entropy_contexts(*A, *L);
   int c, cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+  const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+
   // Check for consistency of tx_size with mode info
   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
                               : get_uv_tx_size(mbmi, pd) == tx_size);
@@ -373,23 +366,29 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
 
     // dc token
     int v = qcoeff[0];
-    int prev_t = vp9_get_token(v);
-    cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
+    int16_t prev_t;
+    EXTRABIT e;
+    vp9_get_token_extra(v, &prev_t, &e);
+    cost = (*token_costs)[0][pt][prev_t] +
+        vp9_get_cost(prev_t, e, cat6_high_cost);
+
     token_cache[0] = vp9_pt_energy_class[prev_t];
     ++token_costs;
 
     // ac tokens
     for (c = 1; c < eob; c++) {
       const int rc = scan[c];
-      int t;
+      int16_t t;
 
       v = qcoeff[rc];
-      t = vp9_get_token(v);
+      vp9_get_token_extra(v, &t, &e);
       if (use_fast_coef_costing) {
-        cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
+        cost += (*token_costs)[!prev_t][!prev_t][t] +
+            vp9_get_cost(t, e, cat6_high_cost);
       } else {
         pt = get_coef_context(nb, token_cache, c);
-        cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
+        cost += (*token_costs)[!prev_t][pt][t] +
+            vp9_get_cost(t, e, cat6_high_cost);
         token_cache[rc] = vp9_pt_energy_class[t];
       }
       prev_t = t;
@@ -761,10 +760,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
-                                                            src_stride)];
-  uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
-                                                       dst_stride)];
+  const uint8_t *src_init = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
+                                                                src_stride)];
+  uint8_t *dst_init = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
+                                                           dst_stride)];
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
 
@@ -808,8 +807,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
           const int block = ib + idy * 2 + idx;
           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
-          int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
-                                                              p->src_diff);
+          int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8,
+                                                                  block,
+                                                                  p->src_diff);
           tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
           xd->mi[0].src_mi->bmi[block].as_mode = mode;
           vp9_predict_intra_block(xd, block, 1,
@@ -908,8 +908,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         const int block = ib + idy * 2 + idx;
         const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
-        int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
-                                                            p->src_diff);
+        int16_t *const src_diff =
+            vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
         tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         xd->mi[0].src_mi->bmi[block].as_mode = mode;
         vp9_predict_intra_block(xd, block, 1,
@@ -1341,10 +1341,10 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   int idx, idy;
 
-  const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
-                                                             p->src.stride)];
-  uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
-                                                        pd->dst.stride)];
+  const uint8_t *const src =
+      &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+  uint8_t *const dst = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+                                                            pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0, ref;
   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -1352,7 +1352,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
+    const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
                                                pd->pre[ref].stride)];
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1386,17 +1386,17 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     vp9_highbd_subtract_block(
-        height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
-        src, p->src.stride, dst, pd->dst.stride, xd->bd);
+        height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+        8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
   } else {
     vp9_subtract_block(
-        height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
-        src, p->src.stride, dst, pd->dst.stride);
+        height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+        8, src, p->src.stride, dst, pd->dst.stride);
   }
 #else
   vp9_subtract_block(height, width,
-                     raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
-                     src, p->src.stride, dst, pd->dst.stride);
+                     vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+                     8, src, p->src.stride, dst, pd->dst.stride);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   k = i;
@@ -1407,7 +1407,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
 
       k += (idy * 2 + idx);
       coeff = BLOCK_OFFSET(p->coeff, k);
-      x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+      x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1480,13 +1480,14 @@ static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
 
-  p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+  p->src.buf = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+                                                   p->src.stride)];
   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
-  pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
-                                                       pd->pre[0].stride)];
+  pd->pre[0].buf = &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+                                                           pd->pre[0].stride)];
   if (has_second_ref(mbmi))
-    pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
-                                                         pd->pre[1].stride)];
+    pd->pre[1].buf = &pd->pre[1].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+                                                           pd->pre[1].stride)];
 }
 
 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 31e93be65..82bce3780 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -279,6 +279,7 @@ static void get_layer_resolution(const int width_org, const int height_org,
 int vp9_svc_start_frame(VP9_COMP *const cpi) {
   int width = 0, height = 0;
   LAYER_CONTEXT *lc;
+  struct lookahead_entry *buf;
   int count = 1 << (cpi->svc.number_temporal_layers - 1);
 
   cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
@@ -339,8 +340,12 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
   // since its previous frame could be changed during decoding time. The idea is
   // we put a empty invisible frame in front of them, then we will not use
   // prev_mi when encoding these frames.
+
+  buf = vp9_lookahead_peek(cpi->lookahead, 0);
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
-      cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE) {
+      cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE &&
+      lc->rc.frames_to_key != 0 &&
+      !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) {
     if ((cpi->svc.number_temporal_layers > 1 &&
          cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
         (cpi->svc.number_spatial_layers > 1 &&
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 54b2594e7..4c8995356 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -23,19 +23,6 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
-static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
-const int16_t *vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static int16_t dct_value_cost_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const int16_t *vp9_dct_value_cost_high10_ptr =
-    dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10;
-
-static int16_t dct_value_cost_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const int16_t *vp9_dct_value_cost_high12_ptr =
-    dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12;
-#endif
-
 static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
   {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49},
   {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
@@ -98,6 +85,298 @@ static const vp9_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
 static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
     14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
 
+static const int16_t zero_cost[] = {0};
+static const int16_t one_cost[] = {255, 257};
+static const int16_t two_cost[] = {255, 257};
+static const int16_t three_cost[] = {255, 257};
+static const int16_t four_cost[] = {255, 257};
+static const int16_t cat1_cost[] = {429, 431, 616, 618};
+static const int16_t cat2_cost[] = {624, 626, 727, 729, 848, 850, 951, 953};
+static const int16_t cat3_cost[] = {
+  820, 822, 893, 895, 940, 942, 1013, 1015, 1096, 1098, 1169, 1171, 1216, 1218,
+  1289, 1291
+};
+static const int16_t cat4_cost[] = {
+  1032, 1034, 1075, 1077, 1105, 1107, 1148, 1150, 1194, 1196, 1237, 1239,
+  1267, 1269, 1310, 1312, 1328, 1330, 1371, 1373, 1401, 1403, 1444, 1446,
+  1490, 1492, 1533, 1535, 1563, 1565, 1606, 1608
+};
+static const int16_t cat5_cost[] = {
+  1269, 1271, 1283, 1285, 1306, 1308, 1320,
+  1322, 1347, 1349, 1361, 1363, 1384, 1386, 1398, 1400, 1443, 1445, 1457,
+  1459, 1480, 1482, 1494, 1496, 1521, 1523, 1535, 1537, 1558, 1560, 1572,
+  1574, 1592, 1594, 1606, 1608, 1629, 1631, 1643, 1645, 1670, 1672, 1684,
+  1686, 1707, 1709, 1721, 1723, 1766, 1768, 1780, 1782, 1803, 1805, 1817,
+  1819, 1844, 1846, 1858, 1860, 1881, 1883, 1895, 1897
+};
+const int16_t vp9_cat6_low_cost[256] = {
+  1638, 1640, 1646, 1648, 1652, 1654, 1660, 1662,
+  1670, 1672, 1678, 1680, 1684, 1686, 1692, 1694, 1711, 1713, 1719, 1721,
+  1725, 1727, 1733, 1735, 1743, 1745, 1751, 1753, 1757, 1759, 1765, 1767,
+  1787, 1789, 1795, 1797, 1801, 1803, 1809, 1811, 1819, 1821, 1827, 1829,
+  1833, 1835, 1841, 1843, 1860, 1862, 1868, 1870, 1874, 1876, 1882, 1884,
+  1892, 1894, 1900, 1902, 1906, 1908, 1914, 1916, 1940, 1942, 1948, 1950,
+  1954, 1956, 1962, 1964, 1972, 1974, 1980, 1982, 1986, 1988, 1994, 1996,
+  2013, 2015, 2021, 2023, 2027, 2029, 2035, 2037, 2045, 2047, 2053, 2055,
+  2059, 2061, 2067, 2069, 2089, 2091, 2097, 2099, 2103, 2105, 2111, 2113,
+  2121, 2123, 2129, 2131, 2135, 2137, 2143, 2145, 2162, 2164, 2170, 2172,
+  2176, 2178, 2184, 2186, 2194, 2196, 2202, 2204, 2208, 2210, 2216, 2218,
+  2082, 2084, 2090, 2092, 2096, 2098, 2104, 2106, 2114, 2116, 2122, 2124,
+  2128, 2130, 2136, 2138, 2155, 2157, 2163, 2165, 2169, 2171, 2177, 2179,
+  2187, 2189, 2195, 2197, 2201, 2203, 2209, 2211, 2231, 2233, 2239, 2241,
+  2245, 2247, 2253, 2255, 2263, 2265, 2271, 2273, 2277, 2279, 2285, 2287,
+  2304, 2306, 2312, 2314, 2318, 2320, 2326, 2328, 2336, 2338, 2344, 2346,
+  2350, 2352, 2358, 2360, 2384, 2386, 2392, 2394, 2398, 2400, 2406, 2408,
+  2416, 2418, 2424, 2426, 2430, 2432, 2438, 2440, 2457, 2459, 2465, 2467,
+  2471, 2473, 2479, 2481, 2489, 2491, 2497, 2499, 2503, 2505, 2511, 2513,
+  2533, 2535, 2541, 2543, 2547, 2549, 2555, 2557, 2565, 2567, 2573, 2575,
+  2579, 2581, 2587, 2589, 2606, 2608, 2614, 2616, 2620, 2622, 2628, 2630,
+  2638, 2640, 2646, 2648, 2652, 2654, 2660, 2662
+};
+const int16_t vp9_cat6_high_cost[128] = {
+  72, 892, 1183, 2003, 1448, 2268, 2559, 3379,
+  1709, 2529, 2820, 3640, 3085, 3905, 4196, 5016, 2118, 2938, 3229, 4049,
+  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+  2118, 2938, 3229, 4049, 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686,
+  5131, 5951, 6242, 7062, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 2118, 2938, 3229, 4049,
+  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+  4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, 5801, 6621, 6912, 7732,
+  7177, 7997, 8288, 9108, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 6210, 7030, 7321, 8141,
+  7586, 8406, 8697, 9517, 7847, 8667, 8958, 9778, 9223, 10043, 10334, 11154
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const int16_t vp9_cat6_high10_high_cost[512] = {
+  74, 894, 1185, 2005, 1450, 2270, 2561,
+  3381, 1711, 2531, 2822, 3642, 3087, 3907, 4198, 5018, 2120, 2940, 3231,
+  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+  7064, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 2120, 2940, 3231,
+  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+  7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914,
+  7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+  11156, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277,
+  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+  9780, 9225, 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+  11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+  9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 2120,
+  2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133,
+  5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+  6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542,
+  6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212,
+  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+  10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+  6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, 8143, 7588,
+  8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 6212,
+  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+  10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, 10745, 11565,
+  9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 4166, 4986, 5277,
+  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+  9780, 9225, 10045, 10336, 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699,
+  9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369,
+  10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091,
+  12382, 13202, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669,
+  8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 8258,
+  9078, 9369, 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826,
+  11271, 12091, 12382, 13202, 10304, 11124, 11415, 12235, 11680, 12500, 12791,
+  13611, 11941, 12761, 13052, 13872, 13317, 14137, 14428, 15248,
+};
+const int16_t vp9_cat6_high12_high_cost[2048] = {
+  76, 896, 1187, 2007, 1452, 2272, 2563,
+  3383, 1713, 2533, 2824, 3644, 3089, 3909, 4200, 5020, 2122, 2942, 3233,
+  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+  7066, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 2122, 2942, 3233,
+  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+  7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+  7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 2122,
+  2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135,
+  5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+  6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544,
+  6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214,
+  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+  10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+  6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590,
+  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214,
+  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+  12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+  8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 2122, 2942,
+  3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955,
+  6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+  6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364,
+  6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+  10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, 6099,
+  5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112,
+  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+  9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521,
+  7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191,
+  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+  13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 4168, 4988,
+  5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001,
+  8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+  8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410,
+  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080,
+  9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273,
+  12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 6214,
+  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371,
+  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+  12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+  12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191,
+  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+  13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682,
+  12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250,
+  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+  15920, 15365, 16185, 16476, 17296, 2122, 2942, 3233, 4053, 3498, 4318, 4609,
+  5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+  12384, 13204, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+  10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145,
+  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+  11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+  13319, 14139, 14430, 15250, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475,
+  5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145,
+  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+  9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+  10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191,
+  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+  13054, 13874, 13319, 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590,
+  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+  14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+  17296, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+  11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590,
+  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+  14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126,
+  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+  14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659,
+  13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+  11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636,
+  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+  10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054,
+  13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+  13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172,
+  13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365,
+  16185, 16476, 17296, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+  17296, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+  13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728,
+  14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296,
+  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+  15920, 15365, 16185, 16476, 17296, 14398, 15218, 15509, 16329, 15774, 16594,
+  16885, 17705, 16035, 16855, 17146, 17966, 17411, 18231, 18522, 19342
+};
+#endif
+
 #if CONFIG_VP9_HIGHBITDEPTH
 static const vp9_tree_index cat1_high10[2] = {0, 0};
 static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0};
@@ -117,61 +396,49 @@ static const vp9_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
   30, 30, 32, 32, 34, 34, 0, 0};
 #endif
 
-static void init_bit_tree(vp9_tree_index *p, int n) {
-  int i = 0;
-
-  while (++i < n) {
-    p[0] = p[1] = i << 1;
-    p += 2;
-  }
-
-  p[0] = p[1] = 0;
-}
-
-
 const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
-  {0, 0, 0, 0},                              // ZERO_TOKEN
-  {0, 0, 0, 1},                              // ONE_TOKEN
-  {0, 0, 0, 2},                              // TWO_TOKEN
-  {0, 0, 0, 3},                              // THREE_TOKEN
-  {0, 0, 0, 4},                              // FOUR_TOKEN
-  {cat1, vp9_cat1_prob, 1,  CAT1_MIN_VAL},   // CATEGORY1_TOKEN
-  {cat2, vp9_cat2_prob, 2,  CAT2_MIN_VAL},   // CATEGORY2_TOKEN
-  {cat3, vp9_cat3_prob, 3,  CAT3_MIN_VAL},   // CATEGORY3_TOKEN
-  {cat4, vp9_cat4_prob, 4,  CAT4_MIN_VAL},   // CATEGORY4_TOKEN
-  {cat5, vp9_cat5_prob, 5,  CAT5_MIN_VAL},   // CATEGORY5_TOKEN
-  {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL},   // CATEGORY6_TOKEN
-  {0, 0, 0, 0}                               // EOB_TOKEN
+  {0, 0, 0, 0, zero_cost},                             // ZERO_TOKEN
+  {0, 0, 0, 1, one_cost},                              // ONE_TOKEN
+  {0, 0, 0, 2, two_cost},                              // TWO_TOKEN
+  {0, 0, 0, 3, three_cost},                            // THREE_TOKEN
+  {0, 0, 0, 4, four_cost},                             // FOUR_TOKEN
+  {cat1, vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},  // CATEGORY1_TOKEN
+  {cat2, vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},  // CATEGORY2_TOKEN
+  {cat3, vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},  // CATEGORY3_TOKEN
+  {cat4, vp9_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},  // CATEGORY4_TOKEN
+  {cat5, vp9_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},  // CATEGORY5_TOKEN
+  {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL, 0},          // CATEGORY6_TOKEN
+  {0, 0, 0, 0, zero_cost}                              // EOB_TOKEN
 };
 
 #if CONFIG_VP9_HIGHBITDEPTH
 const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = {
-  {0, 0, 0, 0},                                            // ZERO_TOKEN
-  {0, 0, 0, 1},                                            // ONE_TOKEN
-  {0, 0, 0, 2},                                            // TWO_TOKEN
-  {0, 0, 0, 3},                                            // THREE_TOKEN
-  {0, 0, 0, 4},                                            // FOUR_TOKEN
-  {cat1_high10, vp9_cat1_prob_high10, 1,  CAT1_MIN_VAL},   // CATEGORY1_TOKEN
-  {cat2_high10, vp9_cat2_prob_high10, 2,  CAT2_MIN_VAL},   // CATEGORY2_TOKEN
-  {cat3_high10, vp9_cat3_prob_high10, 3,  CAT3_MIN_VAL},   // CATEGORY3_TOKEN
-  {cat4_high10, vp9_cat4_prob_high10, 4,  CAT4_MIN_VAL},   // CATEGORY4_TOKEN
-  {cat5_high10, vp9_cat5_prob_high10, 5,  CAT5_MIN_VAL},   // CATEGORY5_TOKEN
-  {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL},   // CATEGORY6_TOKEN
-  {0, 0, 0, 0}                                             // EOB_TOKEN
+  {0, 0, 0, 0, zero_cost},                                           // ZERO
+  {0, 0, 0, 1, one_cost},                                            // ONE
+  {0, 0, 0, 2, two_cost},                                            // TWO
+  {0, 0, 0, 3, three_cost},                                          // THREE
+  {0, 0, 0, 4, four_cost},                                           // FOUR
+  {cat1_high10, vp9_cat1_prob_high10, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {cat2_high10, vp9_cat2_prob_high10, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {cat3_high10, vp9_cat3_prob_high10, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {cat4_high10, vp9_cat4_prob_high10, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {cat5_high10, vp9_cat5_prob_high10, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL, 0},          // CAT6
+  {0, 0, 0, 0, zero_cost}                                            // EOB
 };
 const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
-  {0, 0, 0, 0},                                            // ZERO_TOKEN
-  {0, 0, 0, 1},                                            // ONE_TOKEN
-  {0, 0, 0, 2},                                            // TWO_TOKEN
-  {0, 0, 0, 3},                                            // THREE_TOKEN
-  {0, 0, 0, 4},                                            // FOUR_TOKEN
-  {cat1_high12, vp9_cat1_prob_high12, 1,  CAT1_MIN_VAL},   // CATEGORY1_TOKEN
-  {cat2_high12, vp9_cat2_prob_high12, 2,  CAT2_MIN_VAL},   // CATEGORY2_TOKEN
-  {cat3_high12, vp9_cat3_prob_high12, 3,  CAT3_MIN_VAL},   // CATEGORY3_TOKEN
-  {cat4_high12, vp9_cat4_prob_high12, 4,  CAT4_MIN_VAL},   // CATEGORY4_TOKEN
-  {cat5_high12, vp9_cat5_prob_high12, 5,  CAT5_MIN_VAL},   // CATEGORY5_TOKEN
-  {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL},   // CATEGORY6_TOKEN
-  {0, 0, 0, 0}                                             // EOB_TOKEN
+  {0, 0, 0, 0, zero_cost},                                           // ZERO
+  {0, 0, 0, 1, one_cost},                                            // ONE
+  {0, 0, 0, 2, two_cost},                                            // TWO
+  {0, 0, 0, 3, three_cost},                                          // THREE
+  {0, 0, 0, 4, four_cost},                                           // FOUR
+  {cat1_high12, vp9_cat1_prob_high12, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {cat2_high12, vp9_cat2_prob_high12, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {cat3_high12, vp9_cat3_prob_high12, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {cat4_high12, vp9_cat4_prob_high12, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {cat5_high12, vp9_cat5_prob_high12, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0},          // CAT6
+  {0, 0, 0, 0, zero_cost}                                            // EOB
 };
 #endif
 
@@ -180,46 +447,6 @@ const struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS] = {
   {125, 7}, {126, 7}, {127, 7}, {0, 1}
 };
 
-static void tokenize_init_one(const vp9_extra_bit *const e,
-                              int16_t *value_cost, int max_value) {
-  int i = -max_value;
-
-  TOKENVALUE t;
-  do {
-
-    vp9_get_token_extra(i, &t.token, &t.extra);
-    // initialize the cost for extra bits for all possible coefficient value.
-    {
-      int cost = 0;
-      const vp9_extra_bit *p = &e[t.token];
-
-      if (p->base_val) {
-        const int extra = t.extra;
-        const int length = p->len;
-
-        if (length)
-          cost += treed_cost(p->tree, p->prob, extra >> 1, length);
-
-        cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
-        value_cost[i] = cost;
-      }
-    }
-  } while (++i < max_value);
-}
-
-void vp9_tokenize_initialize() {
-  tokenize_init_one(vp9_extra_bits,
-                    dct_value_cost + DCT_MAX_VALUE, DCT_MAX_VALUE);
-#if CONFIG_VP9_HIGHBITDEPTH
-  tokenize_init_one(vp9_extra_bits_high10,
-                    dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10,
-                    DCT_MAX_VALUE_HIGH10);
-
-  tokenize_init_one(vp9_extra_bits_high12,
-                    dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12,
-                    DCT_MAX_VALUE_HIGH12);
-#endif
-}
 
 struct tokenize_b_args {
   VP9_COMP *cpi;
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 49dc4a718..81cc2e13f 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -20,8 +20,6 @@
 extern "C" {
 #endif
 
-void vp9_tokenize_initialize();
-
 #define EOSB_TOKEN 127     // Not signalled, encoder only
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -63,11 +61,29 @@ extern const int16_t *vp9_dct_value_cost_ptr;
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
 extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int16_t vp9_cat6_low_cost[256];
+extern const int16_t vp9_cat6_high_cost[128];
+extern const int16_t vp9_cat6_high10_high_cost[512];
+extern const int16_t vp9_cat6_high12_high_cost[2048];
+static INLINE int16_t vp9_get_cost(int16_t token, EXTRABIT extrabits,
+                                   const int16_t *cat6_high_table) {
+  if (token != CATEGORY6_TOKEN)
+    return vp9_extra_bits[token].cost[extrabits];
+  return vp9_cat6_low_cost[extrabits & 0xff]
+      + cat6_high_table[extrabits >> 8];
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
-extern const int16_t *vp9_dct_value_cost_high10_ptr;
-extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
-extern const int16_t *vp9_dct_value_cost_high12_ptr;
-extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+  return bit_depth == 8 ? vp9_cat6_high_cost
+      : (bit_depth == 10 ? vp9_cat6_high10_high_cost :
+         vp9_cat6_high12_high_cost);
+}
+#else
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+  (void) bit_depth;
+  return vp9_cat6_high_cost;
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index f5e6e3190..06096a6b1 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -33,6 +33,7 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
 VP9_COMMON_SRCS-yes += common/vp9_enums.h
 VP9_COMMON_SRCS-yes += common/vp9_idct.h
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
+VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.h
 VP9_COMMON_SRCS-yes += common/vp9_mv.h
 VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
@@ -56,6 +57,7 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
+VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
@@ -133,11 +135,14 @@ ifeq ($(ARCH_X86_64), yes)
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
 endif
 
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
 
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
 
 # neon with assembly and intrinsics implementations. If both are available
 # prefer assembly.
@@ -156,9 +161,7 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
 else
 ifeq ($(HAVE_NEON), yes)
@@ -176,8 +179,11 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
+# TODO(johannkoenig): re-enable when chromium build is fixed
+# # https://code.google.com/p/chromium/issues/detail?id=443839
+#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 7b4b17809..46e6e919c 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -42,6 +42,7 @@ struct vp9_extracfg {
   unsigned int                frame_periodic_boost;
   vpx_bit_depth_t             bit_depth;
   vp9e_tune_content           content;
+  vpx_color_space_t           color_space;
 };
 
 static struct vp9_extracfg default_extra_cfg = {
@@ -64,7 +65,8 @@ static struct vp9_extracfg default_extra_cfg = {
   NO_AQ,                      // aq_mode
   0,                          // frame_periodic_delta_q
   VPX_BITS_8,                 // Bit depth
-  VP9E_CONTENT_DEFAULT        // content
+  VP9E_CONTENT_DEFAULT,       // content
+  VPX_CS_UNKNOWN,             // color space
 };
 
 struct vpx_codec_alg_priv {
@@ -294,7 +296,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
       cfg->g_bit_depth == VPX_BITS_8) {
     ERROR("Codec bit-depth 8 not supported in profile > 1");
   }
-
+  RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
   return VPX_CODEC_OK;
 }
 
@@ -351,9 +353,9 @@ static int get_image_bps(const vpx_image_t *img) {
 }
 
 static vpx_codec_err_t set_encoder_config(
-    VP9EncoderConfig *oxcf,
-    const vpx_codec_enc_cfg_t *cfg,
-    const struct vp9_extracfg *extra_cfg) {
+  VP9EncoderConfig *oxcf,
+  const vpx_codec_enc_cfg_t *cfg,
+  const struct vp9_extracfg *extra_cfg) {
   const int is_vbr = cfg->rc_end_usage == VPX_VBR;
   oxcf->profile = cfg->g_profile;
   oxcf->max_threads = (int)cfg->g_threads;
@@ -437,6 +439,7 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->firstpass_mb_stats_in  = cfg->rc_firstpass_mb_stats_in;
 #endif
 
+  oxcf->color_space = extra_cfg->color_space;
   oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
   oxcf->arnr_strength   = extra_cfg->arnr_strength;
 
@@ -1322,6 +1325,13 @@ static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.color_space = CAST(VP9E_SET_COLOR_SPACE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,                ctrl_copy_reference},
   {VP8E_UPD_ENTROPY,                  ctrl_update_entropy},
@@ -1357,6 +1367,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP9E_REGISTER_CX_CALLBACK,         ctrl_register_cx_callback},
   {VP9E_SET_SVC_LAYER_ID,             ctrl_set_svc_layer_id},
   {VP9E_SET_TUNE_CONTENT,             ctrl_set_tune_content},
+  {VP9E_SET_COLOR_SPACE,              ctrl_set_color_space},
   {VP9E_SET_NOISE_SENSITIVITY,        ctrl_set_noise_sensitivity},
 
   // Getters
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 43bf35f9c..c0e429736 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -95,12 +95,11 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
 
 static int parse_bitdepth_colorspace_sampling(
     BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) {
-  const int sRGB = 7;
-  int colorspace;
+  vpx_color_space_t color_space;
   if (profile >= PROFILE_2)
     rb->bit_offset += 1;  // Bit-depth 10 or 12.
-  colorspace = vp9_rb_read_literal(rb, 3);
-  if (colorspace != sRGB) {
+  color_space = (vpx_color_space_t)vp9_rb_read_literal(rb, 3);
+  if (color_space != VPX_CS_SRGB) {
     rb->bit_offset += 1;  // [16,235] (including xvycc) vs [0,255] range.
     if (profile == PROFILE_1 || profile == PROFILE_3) {
       rb->bit_offset += 2;  // subsampling x/y.
@@ -148,7 +147,11 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
     if (frame_marker != VP9_FRAME_MARKER)
       return VPX_CODEC_UNSUP_BITSTREAM;
 
-    if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM;
+    if (profile >= MAX_PROFILES)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
+      return VPX_CODEC_UNSUP_BITSTREAM;
 
     if (vp9_rb_read_bit(&rb)) {  // show an existing frame
       vp9_rb_read_literal(&rb, 3);  // Frame buffer to show.
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index 00fbfdd7d..e585aa147 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -34,6 +34,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
       bps = 12;
     }
   }
+  img->cs = yv12->color_space;
   img->bit_depth = 8;
   img->w = yv12->y_stride;
   img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
@@ -92,6 +93,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
 
   yv12->y_stride = img->stride[VPX_PLANE_Y];
   yv12->uv_stride = img->stride[VPX_PLANE_U];
+  yv12->color_space = img->cs;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index c75fd8a01..33a1e6735 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -150,6 +150,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 1fcb36f66..603158a9c 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -21,8 +21,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.h
 VP9_DX_SRCS-yes += decoder/vp9_reader.h
 VP9_DX_SRCS-yes += decoder/vp9_reader.c
 VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 4fc0fd62f..a920ee3f9 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -10,7 +10,7 @@
 #ifndef VPX_VP8CX_H_
 #define VPX_VP8CX_H_
 
-/*!\defgroup vp8_encoder WebM VP8 Encoder
+/*!\defgroup vp8_encoder WebM VP8/VP9 Encoder
  * \ingroup vp8
  *
  * @{
@@ -18,7 +18,7 @@
 #include "./vp8.h"
 
 /*!\file
- * \brief Provides definitions for using the VP8 encoder algorithm within the
+ * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the
  *        vpx Codec Interface.
  */
 
@@ -28,17 +28,20 @@ extern "C" {
 
 /*!\name Algorithm interface for VP8
  *
- * This interface provides the capability to encode raw VP8 streams, as would
- * be found in AVI files.
+ * This interface provides the capability to encode raw VP8 streams.
  * @{
  */
 extern vpx_codec_iface_t  vpx_codec_vp8_cx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
+/*!@} - end algorithm interface member group*/
 
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to encode raw VP9 streams.
+ * @{
+ */
 extern vpx_codec_iface_t  vpx_codec_vp9_cx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
-
 /*!@} - end algorithm interface member group*/
 
 
@@ -234,20 +237,111 @@ enum vp8e_enc_control_id {
 
   VP8E_SET_SCREEN_CONTENT_MODE,  /**<control function to set encoder screen content mode */
 
-  /* TODO(jkoleszar): Move to vp9cx.h */
+  /*!\brief Codec control function to set lossless encoding mode
+   *
+   * VP9 can operate in lossless encoding mode, in which the bitstream
+   * produced will be able to decode and reconstruct a perfect copy of
+   * input source. This control function provides a mean to switch encoder
+   * into lossless coding mode(1) or normal coding mode(0) that may be lossy.
+   *                          0 = lossy coding mode
+   *                          1 = lossless coding mode
+   *
+   *  By default, encoder operates in normal coding mode (maybe lossy).
+   */
   VP9E_SET_LOSSLESS,
+
+  /*!\brief Codec control function to set number of tile columns
+   *
+   * In encoding and decoding, VP9 allows an input image frame be partitioned
+   * into separated vertical tile columns, which can be encoded or decoded
+   * independently. This enables easy implementation of parallel encoding and
+   * decoding. This control requests the encoder to use column tiles in
+   * encoding an input frame, with number of tile columns (in Log2 unit) as
+   * the parameter:
+   *             0 = 1 tile column
+   *             1 = 2 tile columns
+   *             2 = 4 tile columns
+   *             .....
+   *             n = 2**n tile columns
+   * The requested tile columns will be capped by encoder based on image size
+   * limitation (The minimum width of a tile column is 256 pixel, the maximum
+   * is 4096).
+   *
+   * By default, the value is 0, i.e. one single column tile for entire image.
+   */
   VP9E_SET_TILE_COLUMNS,
+
+  /*!\brief Codec control function to set number of tile rows
+   *
+   * In encoding and decoding, VP9 allows an input image frame be partitioned
+   * into separated horizontal tile rows. Tile rows are encoded or decoded
+   * sequentially. Even though encoding/decoding of later tile rows depends on
+   * earlier ones, this allows the encoder to output data packets for tile rows
+   * prior to completely processing all tile rows in a frame, thereby reducing
+   * the latency in processing between input and output. The parameter
+   * for this control describes the number of tile rows, which has a valid
+   * range [0, 2]:
+   *            0 = 1 tile row
+   *            1 = 2 tile rows
+   *            2 = 4 tile rows
+   *
+   * By default, the value is 0, i.e. one single row tile for entire image.
+   */
   VP9E_SET_TILE_ROWS,
+
+  /*!\brief Codec control function to enable frame parallel decoding feature
+   *
+   * VP9 has a bitstream feature to reduce decoding dependency between frames
+   * by turning off backward update of probability context used in encoding
+   * and decoding. This allows staged parallel processing of more than one
+   * video frames in the decoder. This control function provides a mean to
+   * turn this feature on or off for bitstreams produced by encoder.
+   *
+   * By default, this feature is off.
+   */
   VP9E_SET_FRAME_PARALLEL_DECODING,
+
+  /*!\brief Codec control function to set adaptive quantization mode
+   *
+   * VP9 has a segment based feature that allows encoder to adaptively change
+   * quantization parameter for each segment within a frame to improve the
+   * subjective quality. This control makes encoder operate in one of the
+   * several AQ_modes supported.
+   *
+   * By default, encoder operates with AQ_Mode 0(adaptive quantization off).
+   */
   VP9E_SET_AQ_MODE,
+
+  /*!\brief Codec control function to enable/disable periodic Q boost
+   *
+   * One VP9 encoder speed feature is to enable quality boost by lowering
+   * frame level Q periodically. This control function provides a mean to
+   * turn on/off this feature.
+   *               0 = off
+   *               1 = on
+   *
+   * By default, the encoder is allowed to use this feature for appropriate
+   * encoding modes.
+   */
   VP9E_SET_FRAME_PERIODIC_BOOST,
+
   /*!\brief control function to set noise sensitivity
    *
    *  0: off, 1: OnYOnly
    */
   VP9E_SET_NOISE_SENSITIVITY,
 
+  /*!\brief control function to turn on/off SVC in encoder.
+   * \note Return value is VPX_CODEC_INVALID_PARAM if the encoder does not
+   *       support SVC in its current encoding mode
+   *  0: off, 1: on
+   */
   VP9E_SET_SVC,
+
+  /*!\brief control function to set parameters for SVC.
+   * \note Parameters contain min_q, max_q, scaling factor for each of the
+   *       SVC layers.
+   */
   VP9E_SET_SVC_PARAMETERS,
 
   /*!\brief control function to set svc layer for spatial and temporal.
@@ -256,9 +350,38 @@ enum vp8e_enc_control_id {
    *                     temporal layer.
    */
   VP9E_SET_SVC_LAYER_ID,
+
+  /*!\brief control function to set content type.
+   * \note Valid parameter range:
+   *              VP9E_CONTENT_DEFAULT = Regular video content (Default)
+   *              VP9E_CONTENT_SCREEN  = Screen capture content
+   */
   VP9E_SET_TUNE_CONTENT,
+
+  /*!\brief control function to get svc layer ID.
+   * \note The layer ID returned is for the data packet from the registered
+   *       callback function.
+   */
   VP9E_GET_SVC_LAYER_ID,
+
+  /*!\brief control function to register callback for getting per layer packet.
+   * \note Parameter for this control function is a structure with a callback
+   *       function and a pointer to private data used by the callback.
+   */
   VP9E_REGISTER_CX_CALLBACK,
+
+  /*!\brief control function to set color space info.
+   * \note Valid ranges: 0..7, default is "UNKNOWN".
+   *                     0 = UNKNOWN,
+   *                     1 = BT_601
+   *                     2 = BT_709
+   *                     3 = SMPTE_170
+   *                     4 = SMPTE_240
+   *                     5 = BT_2020
+   *                     6 = RESERVED
+   *                     7 = SRGB
+   */
+  VP9E_SET_COLOR_SPACE,
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -423,6 +546,8 @@ VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
 VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY,  unsigned int)
 
 VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
+
+VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 5cc25cd6a..3ba72fe3c 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -9,13 +9,13 @@
  */
 
 
-/*!\defgroup vp8_decoder WebM VP8 Decoder
+/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder
  * \ingroup vp8
  *
  * @{
  */
 /*!\file
- * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder
+ * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder
  *        interface.
  */
 #ifndef VPX_VP8DX_H_
@@ -30,14 +30,18 @@ extern "C" {
 
 /*!\name Algorithm interface for VP8
  *
- * This interface provides the capability to decode raw VP8 streams, as would
- * be found in AVI files and other non-Flash uses.
+ * This interface provides the capability to decode VP8 streams.
  * @{
  */
 extern vpx_codec_iface_t  vpx_codec_vp8_dx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
+/*!@} - end algorithm interface member group*/
 
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to decode VP9 streams.
+ * @{
+ */
 extern vpx_codec_iface_t  vpx_codec_vp9_dx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp9_dx(void);
 /*!@} - end algorithm interface member group*/
@@ -85,7 +89,14 @@ enum vp8_dec_control_id {
    */
   VP9_SET_BYTE_ALIGNMENT,
 
-  /** For testing. */
+  /** control function to invert the decoding order to from right to left. The
+   * function is used in a test to confirm the decoding independence of tile
+   * columns. The function may be used in application where this order
+   * of decoding is desired.
+   *
+   * TODO(yaowu): Rework the unit test that uses this control, and in a future
+   *              release, this test-only control shall be removed.
+   */
   VP9_INVERT_TILE_DECODE_ORDER,
 
   VP8_DECODER_CTRL_ID_MAX
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index da5bd0659..8f7bff518 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -232,8 +232,8 @@ extern "C" {
 
   /*!\brief Callback function pointer / user data pair storage */
   typedef struct vpx_codec_enc_output_cx_cb_pair {
-    vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt;
-    void                            *user_priv;
+    vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */
+    void                            *user_priv; /**< Pointer to private data */
   } vpx_codec_priv_output_cx_pkt_cb_pair_t;
 
   /*!\brief Rational Number
@@ -737,10 +737,10 @@ extern "C" {
    *
    */
   typedef struct vpx_svc_parameters {
-    int max_quantizers[VPX_SS_MAX_LAYERS];
-    int min_quantizers[VPX_SS_MAX_LAYERS];
-    int scaling_factor_num[VPX_SS_MAX_LAYERS];
-    int scaling_factor_den[VPX_SS_MAX_LAYERS];
+    int max_quantizers[VPX_SS_MAX_LAYERS]; /**< Max Q for each layer */
+    int min_quantizers[VPX_SS_MAX_LAYERS]; /**< Min Q for each layer */
+    int scaling_factor_num[VPX_SS_MAX_LAYERS]; /**< Scaling factor-numerator*/
+    int scaling_factor_den[VPX_SS_MAX_LAYERS]; /**< Scaling factor-denominator*/
   } vpx_svc_extra_cfg_t;
 
 
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 337e4c4be..c06d35101 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -28,7 +28,7 @@ extern "C" {
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/
 
 
 #define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format. */
@@ -66,9 +66,22 @@ extern "C" {
     VPX_IMG_FMT_I44016    = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH
   } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
+  /*!\brief List of supported color spaces */
+  typedef enum vpx_color_space {
+    VPX_CS_UNKNOWN    = 0,  /**< Unknown */
+    VPX_CS_BT_601     = 1,  /**< BT.601 */
+    VPX_CS_BT_709     = 2,  /**< BT.709 */
+    VPX_CS_SMPTE_170  = 3,  /**< SMPTE.170 */
+    VPX_CS_SMPTE_240  = 4,  /**< SMPTE.240 */
+    VPX_CS_BT_2020    = 5,  /**< BT.2020 */
+    VPX_CS_RESERVED   = 6,  /**< Reserved */
+    VPX_CS_SRGB       = 7   /**< sRGB */
+  } vpx_color_space_t; /**< alias for enum vpx_color_space */
+
   /**\brief Image Descriptor */
   typedef struct vpx_image {
     vpx_img_fmt_t fmt; /**< Image Format */
+    vpx_color_space_t cs; /**< Color Space */
 
     /* Image storage dimensions */
     unsigned int  w;           /**< Stored image width */
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index f04dee1e8..6cdc235fe 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -55,6 +55,7 @@ typedef struct yv12_buffer_config {
   int subsampling_x;
   int subsampling_y;
   unsigned int bit_depth;
+  vpx_color_space_t color_space;
 
   int corrupted;
   int flags;
diff --git a/vpxenc.c b/vpxenc.c
index e14c5eaab..944dfa8b5 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -442,7 +442,7 @@ static const int vp9_arg_ctrl_map[] = {
   VP8E_SET_MAX_INTER_BITRATE_PCT, VP8E_SET_GF_CBR_BOOST_PCT,
   VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
   VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
-  VP9E_SET_TUNE_CONTENT,
+  VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
   0
 };
 #endif