59 files changed, 1322 insertions, 436 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh
index f98018027..c527cd527 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -666,6 +666,10 @@ process_common_toolchain() {
         tgt_isa=x86_64
         tgt_os=darwin13
         ;;
+      *darwin14*)
+        tgt_isa=x86_64
+        tgt_os=darwin14
+        ;;
       x86_64*mingw32*)
         tgt_os=win64
         ;;
@@ -775,6 +779,10 @@ process_common_toolchain() {
       add_cflags  "-mmacosx-version-min=10.9"
       add_ldflags "-mmacosx-version-min=10.9"
       ;;
+    *-darwin14-*)
+      add_cflags  "-mmacosx-version-min=10.10"
+      add_ldflags "-mmacosx-version-min=10.10"
+      ;;
     *-iphonesimulator-*)
       add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
       add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
@@ -1243,10 +1251,14 @@ EOF
   fi
 
   tgt_os_no_version=$(echo "${tgt_os}" | tr -d "[0-9]")
+  if [ "${tgt_os_no_version}" = "darwin" ] || \
+     [ "${tgt_os_no_version}" = "openbsd" ] || [ "`uname`" = "OpenBSD" ]; then
+    openbsd_like=yes
+  fi
   # Default use_x86inc to yes when we are 64 bit, non-pic, or on any
   # non-Darwin target.
   if [ "${tgt_isa}" = "x86_64" ] || [ "${pic}" != "yes" ] || \
-      [ "${tgt_os_no_version}" != "darwin" ]; then
+      [ "${openbsd_like}" != "yes" ]; then
     soft_enable use_x86inc
   fi
 
diff --git a/configure b/configure
index 450969747..80149c2f1 100755
--- a/configure
+++ b/configure
@@ -58,6 +58,7 @@ Advanced options:
   ${toggle_postproc_visualizer}   macro block / block level visualizers
   ${toggle_multi_res_encoding}    enable multiple-resolution encoding
   ${toggle_temporal_denoising}    enable temporal denoising and disable the spatial denoiser
+  ${toggle_vp9_highbitdepth}      enable 10/12 bit support in VP9
   ${toggle_vp9_temporal_denoising}
                                   enable vp9 temporal denoising
   ${toggle_webm_io}               enable input from and output to WebM container
@@ -128,6 +129,7 @@ all_platforms="${all_platforms} x86-darwin10-gcc"
 all_platforms="${all_platforms} x86-darwin11-gcc"
 all_platforms="${all_platforms} x86-darwin12-gcc"
 all_platforms="${all_platforms} x86-darwin13-gcc"
+all_platforms="${all_platforms} x86-darwin14-gcc"
 all_platforms="${all_platforms} x86-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
@@ -145,6 +147,7 @@ all_platforms="${all_platforms} x86_64-darwin10-gcc"
 all_platforms="${all_platforms} x86_64-darwin11-gcc"
 all_platforms="${all_platforms} x86_64-darwin12-gcc"
 all_platforms="${all_platforms} x86_64-darwin13-gcc"
+all_platforms="${all_platforms} x86_64-darwin14-gcc"
 all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
@@ -161,6 +164,7 @@ all_platforms="${all_platforms} universal-darwin10-gcc"
 all_platforms="${all_platforms} universal-darwin11-gcc"
 all_platforms="${all_platforms} universal-darwin12-gcc"
 all_platforms="${all_platforms} universal-darwin13-gcc"
+all_platforms="${all_platforms} universal-darwin14-gcc"
 all_platforms="${all_platforms} generic-gnu"
 
 # all_targets is a list of all targets that can be configured
diff --git a/libs.doxy_template b/libs.doxy_template
index 02e290242..5a8f84728 100644
--- a/libs.doxy_template
+++ b/libs.doxy_template
@@ -36,7 +36,7 @@ DOXYFILE_ENCODING      = UTF-8
 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded
 # by quotes) that should identify the project.
 
-PROJECT_NAME           = "WebM VP8 Codec SDK"
+PROJECT_NAME           = "WebM Codec SDK"
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
@@ -415,12 +415,6 @@ MAX_INITIALIZER_LINES  = 30
 
 SHOW_USED_FILES        = YES
 
-# If the sources in your project are distributed over multiple directories
-# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
-# in the documentation. The default is NO.
-
-SHOW_DIRECTORIES       = NO
-
 # The FILE_VERSION_FILTER tag can be used to specify a program or script that
 # doxygen should invoke to get the current version for each file (typically from the
 # version control system). Doxygen will invoke the program by executing (via
@@ -715,12 +709,6 @@ HTML_FOOTER            =
 
 HTML_STYLESHEET        =
 
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
-# files or namespaces will be aligned in HTML using tables. If set to
-# NO a bullet list will be used.
-
-HTML_ALIGN_MEMBERS     = YES
-
 # If the GENERATE_HTMLHELP tag is set to YES, additional index files
 # will be generated that can be used as input for tools like the
 # Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
diff --git a/mainpage.dox b/mainpage.dox
index e2ec28002..ec202fa4f 100644
--- a/mainpage.dox
+++ b/mainpage.dox
@@ -1,4 +1,4 @@
-/*!\mainpage WebM VP8 Codec SDK
+/*!\mainpage WebM Codec SDK
 
   \section main_contents Page Contents
   - \ref main_intro
@@ -6,11 +6,11 @@
   - \ref main_support
 
   \section main_intro Introduction
-  Welcome to the WebM VP8 Codec SDK. This SDK allows you to integrate your
-  applications with the VP8 video codec, a high quality, royalty free, open
-  source codec deployed on millions of computers and devices worldwide.
+  Welcome to the WebM Codec SDK. This SDK allows you to integrate your
+  applications with the VP8 and VP9 video codecs, high quality, royalty free,
+  open source codecs deployed on billions of computers and devices worldwide.
 
-  This distribution of the WebM VP8 Codec SDK includes the following support:
+  This distribution of the WebM Codec SDK includes the following support:
 
   \if vp8_encoder
   - \ref vp8_encoder
@@ -28,12 +28,12 @@
   - Read the \ref samples "sample code" for examples of how to interact with the
     codec.
   - \ref codec reference
-    \if encoder
-    - \ref encoder reference
-    \endif
-    \if decoder
-    - \ref decoder reference
-    \endif
+  \if encoder
+  - \ref encoder reference
+  \endif
+  \if decoder
+  - \ref decoder reference
+  \endif
 
   \section main_support Support Options & FAQ
   The WebM project is an open source project supported by its community. For
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 573870e91..e52934771 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -38,6 +38,8 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
     first_drop_ = 0;
     bits_total_ = 0;
     duration_ = 0.0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
   }
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
@@ -45,6 +47,17 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
     if (video->frame() == 1) {
       encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
     }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    }
+
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
     duration_ = 0;
@@ -124,6 +137,8 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
   double effective_datarate_;
   size_t bits_in_last_frame_;
   int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
 };
 
 #if CONFIG_TEMPORAL_DENOISING
@@ -155,6 +170,29 @@ TEST_P(DatarateTestLarge, DenoiserLevels) {
         << " The datarate for the file missed the target!";
   }
 }
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestLarge, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+      << " The datarate for the file missed the target!";
+}
 #endif  // CONFIG_TEMPORAL_DENOISING
 
 TEST_P(DatarateTestLarge, BasicBufferModel) {
@@ -669,7 +707,7 @@ TEST_P(DatarateTestVP9Large, DenoiserLevels) {
 
 // Check basic datarate targeting, for a single bitrate, when denoiser is off
 // and on.
-TEST_P(DatarateTestVP9Large, DenoiserOffon) {
+TEST_P(DatarateTestVP9Large, DenoiserOffOn) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 77135ec6d..b03235b71 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -114,6 +114,7 @@ void EncoderTest::SetMode(TestMode mode) {
 static bool compare_img(const vpx_image_t *img1,
                         const vpx_image_t *img2) {
   bool match = (img1->fmt == img2->fmt) &&
+               (img1->cs == img2->cs) &&
                (img1->d_w == img2->d_w) &&
                (img1->d_h == img2->d_h);
 
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 182547bdf..9a99a80f8 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -37,6 +37,7 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
   void Reset() {
     error_nframes_ = 0;
     droppable_nframes_ = 0;
+    pattern_switch_ = 0;
   }
 
   virtual void SetUp() {
@@ -62,19 +63,37 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
   //   1     3
   // 0    2     .....
   // LAST is updated on base/layer 0, GOLDEN  updated on layer 1.
-  int SetFrameFlags(int frame_num, int num_temp_layers) {
+  // Non-zero pattern_switch parameter means pattern will switch to
+  // not using LAST for frame_num >= pattern_switch.
+  int SetFrameFlags(int frame_num,
+                    int num_temp_layers,
+                    int pattern_switch) {
     int frame_flags = 0;
     if (num_temp_layers == 2) {
-      if (frame_num % 2 == 0) {
-        // Layer 0: predict from L and ARF, update L.
-        frame_flags = VP8_EFLAG_NO_REF_GF |
-                      VP8_EFLAG_NO_UPD_GF |
-                      VP8_EFLAG_NO_UPD_ARF;
-      } else {
-        // Layer 1: predict from L, GF, and ARF, and update GF.
-        frame_flags = VP8_EFLAG_NO_UPD_ARF |
-                      VP8_EFLAG_NO_UPD_LAST;
-      }
+        if (frame_num % 2 == 0) {
+          if (frame_num < pattern_switch || pattern_switch == 0) {
+            // Layer 0: predict from LAST and ARF, update LAST.
+            frame_flags = VP8_EFLAG_NO_REF_GF |
+                          VP8_EFLAG_NO_UPD_GF |
+                          VP8_EFLAG_NO_UPD_ARF;
+          } else {
+            // Layer 0: predict from GF and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_REF_LAST |
+                          VP8_EFLAG_NO_UPD_LAST |
+                          VP8_EFLAG_NO_UPD_ARF;
+          }
+        } else {
+          if (frame_num < pattern_switch || pattern_switch == 0) {
+            // Layer 1: predict from L, GF, and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_UPD_ARF |
+                          VP8_EFLAG_NO_UPD_LAST;
+          } else {
+            // Layer 1: predict from GF and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_REF_LAST |
+                          VP8_EFLAG_NO_UPD_LAST |
+                          VP8_EFLAG_NO_UPD_ARF;
+          }
+        }
     }
     return frame_flags;
   }
@@ -86,7 +105,9 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
                       VP8_EFLAG_NO_UPD_ARF);
     // For temporal layer case.
     if (cfg_.ts_number_layers > 1) {
-      frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      frame_flags_ = SetFrameFlags(video->frame(),
+                                   cfg_.ts_number_layers,
+                                   pattern_switch_);
       for (unsigned int i = 0; i < droppable_nframes_; ++i) {
         if (droppable_frames_[i] == video->frame()) {
           std::cout << "Encoding droppable frame: "
@@ -168,11 +189,16 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
     return mismatch_nframes_;
   }
 
+  void SetPatternSwitch(int frame_switch) {
+     pattern_switch_ = frame_switch;
+   }
+
  private:
   double psnr_;
   unsigned int nframes_;
   unsigned int error_nframes_;
   unsigned int droppable_nframes_;
+  unsigned int pattern_switch_;
   double mismatch_psnr_;
   unsigned int mismatch_nframes_;
   unsigned int error_frames_[kMaxErrorFrames];
@@ -299,6 +325,7 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
   // Error resilient mode ON.
   cfg_.g_error_resilient = 1;
   cfg_.kf_mode = VPX_KF_DISABLED;
+  SetPatternSwitch(0);
 
   // The odd frames are the enhancement layer for 2 layer pattern, so set
   // those frames as droppable. Drop the last 7 frames.
@@ -316,6 +343,45 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
   Reset();
 }
 
+// Check for successful decoding and no encoder/decoder mismatch
+// for a two layer temporal pattern, where at some point in the
+// sequence, the LAST ref is not used anymore.
+TEST_P(ErrorResilienceTestLarge, 2LayersNoRefLast) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = 0;
+
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 Temporal layers, no spatial layers, CBR mode.
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+  cfg_.ts_periodicity = 2;
+  cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 100);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  SetPatternSwitch(60);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Mismatch frames: "
+            << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+
+  // Reset previously set of error/droppable frames.
+  Reset();
+}
+
 class ErrorResilienceTestLargeCodecControls : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 9d0c570ae..5c25dc11c 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -144,6 +144,7 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
 
 TEST_P(ResizeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
+  cfg_.g_lag_in_frames = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
@@ -153,9 +154,9 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
     const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight);
 
     EXPECT_EQ(expected_w, info->w)
-        << "Frame " << frame << "had unexpected width";
+        << "Frame " << frame << " had unexpected width";
     EXPECT_EQ(expected_h, info->h)
-        << "Frame " << frame << "had unexpected height";
+        << "Frame " << frame << " had unexpected height";
   }
 }
 
@@ -261,6 +262,8 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
 }
 
 VP8_INSTANTIATE_TEST_CASE(ResizeTest, ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(ResizeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
 VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
                           ::testing::Values(::libvpx_test::kOnePassBest));
 }  // namespace
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 4d279f686..a8dd7de13 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1914,11 +1914,17 @@ INSTANTIATE_TEST_CASE_P(
 const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon;
 const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
 const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon;
+const vp9_variance_fn_t variance32x64_neon = vp9_variance32x64_neon;
+const vp9_variance_fn_t variance64x32_neon = vp9_variance64x32_neon;
+const vp9_variance_fn_t variance64x64_neon = vp9_variance64x64_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP9VarianceTest,
     ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
                       make_tuple(4, 4, variance16x16_neon, 0),
-                      make_tuple(5, 5, variance32x32_neon, 0)));
+                      make_tuple(5, 5, variance32x32_neon, 0),
+                      make_tuple(5, 6, variance32x64_neon, 0),
+                      make_tuple(6, 5, variance64x32_neon, 0),
+                      make_tuple(6, 6, variance64x64_neon, 0)));
 
 const vp9_subpixvariance_fn_t subpel_variance8x8_neon =
     vp9_sub_pixel_variance8x8_neon;
@@ -1926,11 +1932,14 @@ const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
     vp9_sub_pixel_variance16x16_neon;
 const vp9_subpixvariance_fn_t subpel_variance32x32_neon =
     vp9_sub_pixel_variance32x32_neon;
+const vp9_subpixvariance_fn_t subpel_variance64x64_neon =
+    vp9_sub_pixel_variance64x64_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP9SubpelVarianceTest,
     ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0),
                       make_tuple(4, 4, subpel_variance16x16_neon, 0),
-                      make_tuple(5, 5, subpel_variance32x32_neon, 0)));
+                      make_tuple(5, 5, subpel_variance32x32_neon, 0),
+                      make_tuple(6, 6, subpel_variance64x64_neon, 0)));
 #endif  // HAVE_NEON
 #endif  // CONFIG_VP9_ENCODER
 
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index fa04528a2..252ed4efa 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -165,4 +165,14 @@ INSTANTIATE_TEST_CASE_P(
 
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
+        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
+
+#endif
+
 }  // namespace
diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc
index 6c354fd38..34e7854a9 100644
--- a/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/test/vp9_encoder_parms_get_to_decoder.cc
@@ -65,14 +65,15 @@ struct EncodeParameters {
   int32_t lossless;
   int32_t error_resilient;
   int32_t frame_parallel;
+  vpx_color_space_t cs;
   // TODO(JBB): quantizers / bitrate
 };
 
 const EncodeParameters kVP9EncodeParameterSet[] = {
-    {0, 0, 0, 1, 0},
-    {0, 0, 0, 0, 0},
-    {0, 0, 1, 0, 0},
-    {0, 2, 0, 0, 1},
+    {0, 0, 0, 1, 0, VPX_CS_BT_601},
+    {0, 0, 0, 0, 0, VPX_CS_BT_709},
+    {0, 0, 1, 0, 0, VPX_CS_BT_2020},
+    {0, 2, 0, 0, 1, VPX_CS_UNKNOWN},
     // TODO(JBB): Test profiles (requires more work).
 };
 
@@ -109,6 +110,7 @@ class Vp9EncoderParmsGetToDecoder
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
+      encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
       encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
                        encode_parms.frame_parallel);
@@ -147,7 +149,7 @@ class Vp9EncoderParmsGetToDecoder
       EXPECT_EQ(common->frame_parallel_decoding_mode,
                 encode_parms.frame_parallel);
     }
-
+    EXPECT_EQ(common->color_space, encode_parms.cs);
     EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols);
     EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows);
 
diff --git a/usage.dox b/usage.dox
index 237b8dc42..6bfca3bc8 100644
--- a/usage.dox
+++ b/usage.dox
@@ -80,8 +80,11 @@
 
 
     The available initialization methods are:
-    \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif
-    \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif
+    \if encoder
+     - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver())
+     - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver())
+     .
+    \endif
     \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif
 
 
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index fc026aa9c..d02cd30b9 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -753,45 +753,46 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     int ref_frame_map[4];
     int sign_bias = 0;
     int dot_artifact_candidate = 0;
-    // For detecting dot artifact.
-    unsigned char* target = x->src.y_buffer;
-    unsigned char* target_u = x->block[16].src + *x->block[16].base_src;
-    unsigned char* target_v = x->block[20].src + *x->block[20].base_src;
-    int stride = x->src.y_stride;
-    int stride_uv = x->block[16].src_stride;
+    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
+
+    // If the current frame is using LAST as a reference, check for
+    // biasing the mode selection for dot artifacts.
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+      unsigned char* target_y = x->src.y_buffer;
+      unsigned char* target_u = x->block[16].src + *x->block[16].base_src;
+      unsigned char* target_v = x->block[20].src + *x->block[20].base_src;
+      int stride = x->src.y_stride;
+      int stride_uv = x->block[16].src_stride;
 #if CONFIG_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity) {
-      int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
-      target =
-          cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset;
-      stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride;
-      if (uv_denoise) {
-        target_u =
-            cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer + recon_uvoffset;
-        target_v =
-            cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer + recon_uvoffset;
-        stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride;
+      if (cpi->oxcf.noise_sensitivity) {
+        const int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
+        target_y =
+            cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset;
+        stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride;
+        if (uv_denoise) {
+          target_u =
+              cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer +
+                  recon_uvoffset;
+          target_v =
+              cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer +
+                  recon_uvoffset;
+          stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride;
+        }
       }
-    }
 #endif
-
-    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
-
-    dot_artifact_candidate =
-        check_dot_artifact_candidate(cpi, x,
-            target, stride,
-            plane[LAST_FRAME][0], mb_row, mb_col, 0);
-    // If not found in Y channel, check UV channel.
-    if (!dot_artifact_candidate) {
       dot_artifact_candidate =
-          check_dot_artifact_candidate(cpi, x,
-              target_u, stride_uv,
-              plane[LAST_FRAME][1], mb_row, mb_col, 1);
+          check_dot_artifact_candidate(cpi, x, target_y, stride,
+              plane[LAST_FRAME][0], mb_row, mb_col, 0);
+      // If not found in Y channel, check UV channel.
       if (!dot_artifact_candidate) {
         dot_artifact_candidate =
-            check_dot_artifact_candidate(cpi, x,
-                target_v, stride_uv,
-                plane[LAST_FRAME][2], mb_row, mb_col, 2);
+            check_dot_artifact_candidate(cpi, x, target_u, stride_uv,
+                plane[LAST_FRAME][1], mb_row, mb_col, 1);
+        if (!dot_artifact_candidate) {
+          dot_artifact_candidate =
+              check_dot_artifact_candidate(cpi, x, target_v, stride_uv,
+                  plane[LAST_FRAME][2], mb_row, mb_col, 2);
+        }
       }
     }
 
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 4557e19bf..47e5164d7 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -334,20 +334,6 @@ const vp9_tree_index vp9_switchable_interp_tree
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
 };
 
-#define COUNT_SAT 20
-#define MAX_UPDATE_FACTOR 128
-
-static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
-  return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree,
-                        const vp9_prob *pre_probs, const unsigned int *counts,
-                        vp9_prob *probs) {
-  vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
-                   probs);
-}
-
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   int i, j;
   FRAME_CONTEXT *fc = cm->fc;
@@ -355,39 +341,41 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   const FRAME_COUNTS *counts = &cm->counts;
 
   for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-    fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i],
-                                         counts->intra_inter[i]);
+    fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i],
+                                                  counts->intra_inter[i]);
   for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-    fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i],
-                                        counts->comp_inter[i]);
+    fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
+                                                 counts->comp_inter[i]);
   for (i = 0; i < REF_CONTEXTS; i++)
-    fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i],
-                                      counts->comp_ref[i]);
+    fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i],
+                                               counts->comp_ref[i]);
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
-      fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j],
-                                             counts->single_ref[i][j]);
+      fc->single_ref_prob[i][j] = mode_mv_merge_probs(
+          pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
 
   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
-    adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+    vp9_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
                 counts->inter_mode[i], fc->inter_mode_probs[i]);
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
-    adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+    vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
                 counts->y_mode[i], fc->y_mode_prob[i]);
 
   for (i = 0; i < INTRA_MODES; ++i)
-    adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
-                counts->uv_mode[i], fc->uv_mode_prob[i]);
+    vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+                         counts->uv_mode[i], fc->uv_mode_prob[i]);
 
   for (i = 0; i < PARTITION_CONTEXTS; i++)
-    adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
-                counts->partition[i], fc->partition_prob[i]);
+    vp9_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+                         counts->partition[i], fc->partition_prob[i]);
 
   if (cm->interp_filter == SWITCHABLE) {
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-      adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
-                  counts->switchable_interp[i], fc->switchable_interp_prob[i]);
+      vp9_tree_merge_probs(vp9_switchable_interp_tree,
+                           pre_fc->switchable_interp_prob[i],
+                           counts->switchable_interp[i],
+                           fc->switchable_interp_prob[i]);
   }
 
   if (cm->tx_mode == TX_MODE_SELECT) {
@@ -399,23 +387,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
     for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
       tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
       for (j = 0; j < TX_SIZES - 3; ++j)
-        fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
-                                             branch_ct_8x8p[j]);
+        fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
 
       tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
       for (j = 0; j < TX_SIZES - 2; ++j)
-        fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
-                                               branch_ct_16x16p[j]);
+        fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
 
       tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
       for (j = 0; j < TX_SIZES - 1; ++j)
-        fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
-                                               branch_ct_32x32p[j]);
+        fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
     }
   }
 
   for (i = 0; i < SKIP_CONTEXTS; ++i)
-    fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]);
+    fc->skip_probs[i] = mode_mv_merge_probs(
+        pre_fc->skip_probs[i], counts->skip[i]);
 }
 
 static void set_default_lf_deltas(struct loopfilter *lf) {
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 922c03947..2477e6ef3 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -11,9 +11,6 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_entropymv.h"
 
-#define MV_COUNT_SAT 20
-#define MV_MAX_UPDATE_FACTOR 128
-
 // Integer pel reference mv threshold for use of high-precision 1/8 mv
 #define COMPANDED_MVREF_THRESH 8
 
@@ -183,16 +180,6 @@ void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
   }
 }
 
-static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
-  return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                        const unsigned int *counts, vp9_prob *probs) {
-  vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT,
-                       MV_MAX_UPDATE_FACTOR, probs);
-}
-
 void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
   int i, j;
 
@@ -200,30 +187,32 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
   const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
   const nmv_context_counts *counts = &cm->counts.mv;
 
-  adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints);
+  vp9_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+                       fc->joints);
 
   for (i = 0; i < 2; ++i) {
     nmv_component *comp = &fc->comps[i];
     const nmv_component *pre_comp = &pre_fc->comps[i];
     const nmv_component_counts *c = &counts->comps[i];
 
-    comp->sign = adapt_prob(pre_comp->sign, c->sign);
-    adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
-                comp->classes);
-    adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0);
+    comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
+    vp9_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+                         comp->classes);
+    vp9_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0,
+                         comp->class0);
 
     for (j = 0; j < MV_OFFSET_BITS; ++j)
-      comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]);
+      comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
 
     for (j = 0; j < CLASS0_SIZE; ++j)
-      adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j],
-                  comp->class0_fp[j]);
+      vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j],
+                           c->class0_fp[j], comp->class0_fp[j]);
 
-    adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+    vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
 
     if (allow_hp) {
-      comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp);
-      comp->hp = adapt_prob(pre_comp->hp, c->hp);
+      comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
+      comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp);
     }
   }
 }
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 7454dd439..7938fc10a 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -99,17 +99,6 @@ typedef enum {
 } TX_TYPE;
 
 typedef enum {
-  UNKNOWN    = 0,
-  BT_601     = 1,  // YUV
-  BT_709     = 2,  // YUV
-  SMPTE_170  = 3,  // YUV
-  SMPTE_240  = 4,  // YUV
-  BT_2020    = 5,  // YUV
-  RESERVED_2 = 6,
-  SRGB       = 7   // RGB
-} COLOR_SPACE;
-
-typedef enum {
   VP9_LAST_FLAG = 1 << 0,
   VP9_GOLD_FLAG = 1 << 1,
   VP9_ALT_FLAG = 1 << 2,
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 58b2da75f..2101ec58c 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1149,10 +1149,10 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-static void filter_block_plane_non420(VP9_COMMON *cm,
-                                      struct macroblockd_plane *plane,
-                                      MODE_INFO *mi_8x8,
-                                      int mi_row, int mi_col) {
+void vp9_filter_block_plane_non420(VP9_COMMON *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO *mi_8x8,
+                                   int mi_row, int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
   const int row_step = 1 << ss_y;
@@ -1598,8 +1598,8 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
         if (use_420)
           vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
         else
-          filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                    mi_row, mi_col);
+          vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                        mi_row, mi_col);
       }
     }
   }
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 4c15e6bd4..6d7cabf7c 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -97,6 +97,11 @@ void vp9_filter_block_plane(struct VP9Common *const cm,
                             int mi_row,
                             LOOP_FILTER_MASK *lfm);
 
+void vp9_filter_block_plane_non420(struct VP9Common *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO *mi_8x8,
+                                   int mi_row, int mi_col);
+
 void vp9_loop_filter_init(struct VP9Common *cm);
 
 // Update the loop filter for the current frame.
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/common/vp9_loopfilter_thread.c
index 3d2d0dd2e..2d47daeaf 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/common/vp9_loopfilter_thread.c
@@ -9,14 +9,10 @@
  */
 
 #include "./vpx_config.h"
-
 #include "vpx_mem/vpx_mem.h"
-
+#include "vp9/common/vp9_loopfilter_thread.h"
 #include "vp9/common/vp9_reconinter.h"
 
-#include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_decoder.h"
-
 #if CONFIG_MULTITHREAD
 static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
   const int kMaxTryLocks = 4000;
@@ -88,31 +84,43 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
 }
 
 // Implement row loopfiltering for each thread.
-static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
-                                VP9_COMMON *const cm,
-                                struct macroblockd_plane planes[MAX_MB_PLANE],
-                                int start, int stop, int y_only,
-                                VP9LfSync *const lf_sync) {
+static INLINE
+void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
+                             VP9_COMMON *const cm,
+                             struct macroblockd_plane planes[MAX_MB_PLANE],
+                             int start, int stop, int y_only,
+                             VP9LfSync *const lf_sync) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  int r, c;  // SB row and col
+  const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
+                                 planes[1].subsampling_x == 1);
   const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  int mi_row, mi_col;
 
-  for (r = start; r < stop; r += lf_sync->num_workers) {
-    const int mi_row = r << MI_BLOCK_SIZE_LOG2;
+  for (mi_row = start; mi_row < stop;
+       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
     MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
 
-    for (c = 0; c < sb_cols; ++c) {
-      const int mi_col = c << MI_BLOCK_SIZE_LOG2;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
+      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
       LOOP_FILTER_MASK lfm;
       int plane;
 
       sync_read(lf_sync, r, c);
 
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+      // TODO(JBB): Make setup_mask work for non 420.
+      if (use_420)
+        vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+                       &lfm);
 
       for (plane = 0; plane < num_planes; ++plane) {
-        vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
+        if (use_420)
+          vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
+        else
+          vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                        mi_row, mi_col);
       }
 
       sync_write(lf_sync, r, c, sb_cols);
@@ -123,37 +131,33 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
 // Row-based multi-threaded loopfilter hook
 static int loop_filter_row_worker(VP9LfSync *const lf_sync,
                                   LFWorkerData *const lf_data) {
-  loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                      lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);
+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                          lf_data->start, lf_data->stop, lf_data->y_only,
+                          lf_sync);
   return 1;
 }
 
-// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
-// threads.
-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
-                              YV12_BUFFER_CONFIG *frame,
-                              struct macroblockd_plane planes[MAX_MB_PLANE],
-                              VP9_COMMON *cm,
-                              VP9Worker *workers, int nworkers,
-                              int frame_filter_level,
-                              int y_only) {
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
+                                VP9_COMMON *cm,
+                                struct macroblockd_plane planes[MAX_MB_PLANE],
+                                int start, int stop, int y_only,
+                                VP9Worker *workers, int nworkers,
+                                VP9LfSync *lf_sync) {
   const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
   // Number of superblock rows and cols
   const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  // Decoder may allocate more threads than number of tiles based on user's
+  // input.
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int num_workers = MIN(nworkers, tile_cols);
   int i;
 
-  if (!frame_filter_level) return;
-
   if (!lf_sync->sync_range || cm->last_height != cm->height ||
       num_workers > lf_sync->num_workers) {
     vp9_loop_filter_dealloc(lf_sync);
     vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
 
-  vp9_loop_filter_frame_init(cm, frame_filter_level);
-
   // Initialize cur_sb_col to -1 for all SB rows.
   vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
 
@@ -175,8 +179,8 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
 
     // Loopfilter data
     vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
-    lf_data->start = i;
-    lf_data->stop = sb_rows;
+    lf_data->start = start + i * MI_BLOCK_SIZE;
+    lf_data->stop = stop;
     lf_data->y_only = y_only;
 
     // Start loopfiltering
@@ -193,8 +197,33 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
   }
 }
 
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                              VP9_COMMON *cm,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              int frame_filter_level,
+                              int y_only, int partial_frame,
+                              VP9Worker *workers, int num_workers,
+                              VP9LfSync *lf_sync) {
+  int start_mi_row, end_mi_row, mi_rows_to_filter;
+
+  if (!frame_filter_level) return;
+
+  start_mi_row = 0;
+  mi_rows_to_filter = cm->mi_rows;
+  if (partial_frame && cm->mi_rows > 8) {
+    start_mi_row = cm->mi_rows >> 1;
+    start_mi_row &= 0xfffffff8;
+    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
+  }
+  end_mi_row = start_mi_row + mi_rows_to_filter;
+  vp9_loop_filter_frame_init(cm, frame_filter_level);
+
+  loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,
+                      y_only, workers, num_workers, lf_sync);
+}
+
 // Set up nsync by width.
-static int get_sync_range(int width) {
+static INLINE int get_sync_range(int width) {
   // nsync numbers are picked by testing. For example, for 4k
   // video, using 4 gives best performance.
   if (width < 640)
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/common/vp9_loopfilter_thread.h
index 664aaa32a..bca357e52 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/common/vp9_loopfilter_thread.h
@@ -8,23 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_DTHREAD_H_
-#define VP9_DECODER_VP9_DTHREAD_H_
-
+#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
 #include "./vpx_config.h"
+#include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_thread.h"
-#include "vp9/decoder/vp9_reader.h"
-#include "vpx/internal/vpx_codec_internal.h"
 
 struct VP9Common;
-struct VP9Decoder;
-
-typedef struct TileWorkerData {
-  struct VP9Common *cm;
-  vp9_reader bit_reader;
-  DECLARE_ALIGNED(16, struct macroblockd, xd);
-  struct vpx_internal_error_info error_info;
-} TileWorkerData;
 
 // Loopfilter row synchronization
 typedef struct VP9LfSyncData {
@@ -45,19 +35,19 @@ typedef struct VP9LfSyncData {
 } VP9LfSync;
 
 // Allocate memory for loopfilter row synchronization.
-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows,
                            int width, int num_workers);
 
 // Deallocate loopfilter synchronization related mutex and data.
 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
 
 // Multi-threaded loopfilter that uses the tile threads.
-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
-                              YV12_BUFFER_CONFIG *frame,
-                              struct macroblockd_plane planes[MAX_MB_PLANE],
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
                               struct VP9Common *cm,
-                              VP9Worker *workers, int num_workers,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
                               int frame_filter_level,
-                              int y_only);
+                              int y_only, int partial_frame,
+                              VP9Worker *workers, int num_workers,
+                              VP9LfSync *lf_sync);
 
-#endif  // VP9_DECODER_VP9_DTHREAD_H_
+#endif  // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index e7ee903c6..1494c3fd7 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -35,14 +35,26 @@ static void filter_by_weight(const uint8_t *src, int src_stride,
   }
 }
 
+void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride, int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
 static void filter_by_weight32x32(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int weight) {
-  filter_by_weight(src, src_stride, dst, dst_stride, 16, weight);
-  filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight);
-  filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16,
-                   dst_stride, 16, weight);
-  filter_by_weight(src + src_stride * 16 + 16, src_stride,
-                   dst + dst_stride * 16 + 16, dst_stride, 16, weight);
+  vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
+  vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
+                            weight);
+  vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
+                            dst + dst_stride * 16, dst_stride, weight);
+  vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
+                            dst + dst_stride * 16 + 16, dst_stride, weight);
 }
 
 static void filter_by_weight64x64(const uint8_t *src, int src_stride,
@@ -62,13 +74,13 @@ static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
                           int uvd_stride, BLOCK_SIZE block_size,
                           int weight) {
   if (block_size == BLOCK_16X16) {
-    filter_by_weight(y, y_stride, yd, yd_stride, 16, weight);
-    filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight);
-    filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight);
+    vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
+    vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
+    vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
   } else if (block_size == BLOCK_32X32) {
     filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
-    filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight);
-    filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight);
+    vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
+    vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
   } else if (block_size == BLOCK_64X64) {
     filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
     filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index ff1ab9fa2..1a957bc99 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -76,7 +76,7 @@ typedef struct VP9Common {
   DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
 
-  COLOR_SPACE color_space;
+  vpx_color_space_t color_space;
 
   int width;
   int height;
diff --git a/vp9/common/vp9_prob.c b/vp9/common/vp9_prob.c
index a1befc63e..3b7b9bf3b 100644
--- a/vp9/common/vp9_prob.c
+++ b/vp9/common/vp9_prob.c
@@ -29,33 +29,25 @@ const uint8_t vp9_norm[256] = {
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-
 static unsigned int tree_merge_probs_impl(unsigned int i,
                                           const vp9_tree_index *tree,
                                           const vp9_prob *pre_probs,
                                           const unsigned int *counts,
-                                          unsigned int count_sat,
-                                          unsigned int max_update,
                                           vp9_prob *probs) {
   const int l = tree[i];
   const unsigned int left_count = (l <= 0)
                  ? counts[-l]
-                 : tree_merge_probs_impl(l, tree, pre_probs, counts,
-                                         count_sat, max_update, probs);
+                 : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
   const int r = tree[i + 1];
   const unsigned int right_count = (r <= 0)
                  ? counts[-r]
-                 : tree_merge_probs_impl(r, tree, pre_probs, counts,
-                                         count_sat, max_update, probs);
+                 : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
   const unsigned int ct[2] = { left_count, right_count };
-  probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
-                              count_sat, max_update);
+  probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
   return left_count + right_count;
 }
 
 void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                          const unsigned int *counts, unsigned int count_sat,
-                          unsigned int max_update_factor, vp9_prob *probs) {
-  tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat,
-                        max_update_factor, probs);
+                          const unsigned int *counts, vp9_prob *probs) {
+  tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
 }
diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h
index bc1511a5e..c69c62c81 100644
--- a/vp9/common/vp9_prob.h
+++ b/vp9/common/vp9_prob.h
@@ -33,6 +33,8 @@ typedef int8_t vp9_tree_index;
 
 #define vp9_complement(x) (255 - x)
 
+#define MODE_MV_COUNT_SAT 20
+
 /* We build coding trees compactly in arrays.
    Each node of the tree is a pair of vp9_tree_indices.
    Array index often references a corresponding probability table.
@@ -69,9 +71,28 @@ static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
   return weighted_prob(pre_prob, prob, factor);
 }
 
+// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
+static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
+  0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+  70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+};
+
+static INLINE vp9_prob mode_mv_merge_probs(vp9_prob pre_prob,
+                                           const unsigned int ct[2]) {
+  const unsigned int den = ct[0] + ct[1];
+  if (den == 0) {
+    return pre_prob;
+  } else {
+    const unsigned int count = MIN(den, MODE_MV_COUNT_SAT);
+    const unsigned int factor = count_to_update_factor[count];
+    const vp9_prob prob =
+        clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
+    return weighted_prob(pre_prob, prob, factor);
+  }
+}
+
 void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                          const unsigned int *counts, unsigned int count_sat,
-                          unsigned int max_update_factor, vp9_prob *probs);
+                          const unsigned int *counts, vp9_prob *probs);
 
 
 DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 88f85a86d..12f076fed 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -274,6 +274,12 @@ $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
 specialize qw/vp9_plane_add_noise sse2/;
 $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
+
+add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight16x16 sse2/;
+
+add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight8x8 sse2/;
 }
 
 #
@@ -798,16 +804,16 @@ add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int sourc
 specialize qw/vp9_variance16x32/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64/, "$sse2_x86inc";
+specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
@@ -837,7 +843,7 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_
 specialize qw/vp9_variance4x4/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1098,7 +1104,7 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
 specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_avg_8x8 sse2/;
+specialize qw/vp9_avg_8x8 sse2 neon/;
 
 add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
 specialize qw/vp9_avg_4x4 sse2/;
@@ -1160,7 +1166,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
 
   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_fdct8x8_quant sse2 ssse3/;
+  specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
 }
 
 #
diff --git a/vp9/common/x86/vp9_mfqe_sse2.asm b/vp9/common/x86/vp9_mfqe_sse2.asm
new file mode 100644
index 000000000..6029420d1
--- /dev/null
+++ b/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -0,0 +1,287 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+;  This file is a duplicate of mfqe_sse2.asm in VP8.
+;  TODO(jackychen): Find a way to fix the duplicate.
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_filter_by_weight16x16_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
+sym(vp9_filter_by_weight16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 16                     ; loop count
+    pxor        xmm6, xmm6
+
+.combine
+    movdqa      xmm2, [rax]
+    movdqa      xmm4, [rdx]
+    add         rax, rsi
+
+    ; src * src_weight
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm6
+    punpckhbw   xmm3, xmm6
+    pmullw      xmm2, xmm0
+    pmullw      xmm3, xmm0
+
+    ; dst * dst_weight
+    movdqa      xmm5, xmm4
+    punpcklbw   xmm4, xmm6
+    punpckhbw   xmm5, xmm6
+    pmullw      xmm4, xmm1
+    pmullw      xmm5, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    paddw       xmm3, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+    psrlw       xmm3, 4
+
+    packuswb    xmm2, xmm3
+    movdqa      [rdx], xmm2
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp9_filter_by_weight8x8_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
+sym(vp9_filter_by_weight8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 8                      ; loop count
+    pxor        xmm4, xmm4
+
+.combine
+    movq        xmm2, [rax]
+    movq        xmm3, [rdx]
+    add         rax, rsi
+
+    ; src * src_weight
+    punpcklbw   xmm2, xmm4
+    pmullw      xmm2, xmm0
+
+    ; dst * dst_weight
+    punpcklbw   xmm3, xmm4
+    pmullw      xmm3, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm3
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+
+    packuswb    xmm2, xmm4
+    movq        [rdx], xmm2
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp9_variance_and_sad_16x16_sse2 | arg
+;(
+;    unsigned char *src1,          0
+;    int            stride1,       1
+;    unsigned char *src2,          2
+;    int            stride2,       3
+;    unsigned int  *variance,      4
+;    unsigned int  *sad,           5
+;)
+global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp9_variance_and_sad_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax,        arg(0)          ; src1
+    mov         rcx,        arg(1)          ; stride1
+    mov         rdx,        arg(2)          ; src2
+    mov         rdi,        arg(3)          ; stride2
+
+    mov         rsi,        16              ; block height
+
+    ; Prep accumulator registers
+    pxor        xmm3, xmm3                  ; SAD
+    pxor        xmm4, xmm4                  ; sum of src2
+    pxor        xmm5, xmm5                  ; sum of src2^2
+
+    ; Because we're working with the actual output frames
+    ; we can't depend on any kind of data alignment.
+.accumulate
+    movdqa      xmm0, [rax]                 ; src1
+    movdqa      xmm1, [rdx]                 ; src2
+    add         rax, rcx                    ; src1 + stride1
+    add         rdx, rdi                    ; src2 + stride2
+
+    ; SAD(src1, src2)
+    psadbw      xmm0, xmm1
+    paddusw     xmm3, xmm0
+
+    ; SUM(src2)
+    pxor        xmm2, xmm2
+    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
+    paddusw     xmm4, xmm2
+
+    ; pmaddubsw would be ideal if it took two unsigned values. instead,
+    ; it expects a signed and an unsigned value. so instead we zero extend
+    ; and operate on words.
+    pxor        xmm2, xmm2
+    movdqa      xmm0, xmm1
+    punpcklbw   xmm0, xmm2
+    punpckhbw   xmm1, xmm2
+    pmaddwd     xmm0, xmm0
+    pmaddwd     xmm1, xmm1
+    paddd       xmm5, xmm0
+    paddd       xmm5, xmm1
+
+    sub         rsi,        1
+    jnz         .accumulate
+
+    ; phaddd only operates on adjacent double words.
+    ; Finalize SAD and store
+    movdqa      xmm0, xmm3
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm3
+    paddd       xmm0, [GLOBAL(t128)]
+    psrld       xmm0, 8
+
+    mov         rax,  arg(5)
+    movd        [rax], xmm0
+
+    ; Accumulate sum of src2
+    movdqa      xmm0, xmm4
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm4
+    ; Square src2. Ignore high value
+    pmuludq     xmm0, xmm0
+    psrld       xmm0, 8
+
+    ; phaddw could be used to sum adjacent values but we want
+    ; all the values summed. promote to doubles, accumulate,
+    ; shift and sum
+    pxor        xmm2, xmm2
+    movdqa      xmm1, xmm5
+    punpckldq   xmm1, xmm2
+    punpckhdq   xmm5, xmm2
+    paddd       xmm1, xmm5
+    movdqa      xmm2, xmm1
+    psrldq      xmm1, 8
+    paddd       xmm1, xmm2
+
+    psubd       xmm1, xmm0
+
+    ; (variance + 128) >> 8
+    paddd       xmm1, [GLOBAL(t128)]
+    psrld       xmm1, 8
+    mov         rax,  arg(4)
+
+    movd        [rax], xmm1
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t128:
+%ifndef __NASM_VER__
+    ddq 128
+%elif CONFIG_BIG_ENDIAN
+    dq  0, 128
+%else
+    dq  128, 0
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+    times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+    times 8 dw 0x08
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index ffc59a6ed..ea4edbffe 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -36,7 +36,6 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_dthread.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/decoder/vp9_reader.h"
 
@@ -727,6 +726,8 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   }
   cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
   cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+  cm->frame_bufs[cm->new_fb_idx].buf.color_space =
+      (vpx_color_space_t)cm->color_space;
   cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
 }
 
@@ -1225,8 +1226,8 @@ static void read_bitdepth_colorspace_sampling(
     cm->use_highbitdepth = 0;
 #endif
   }
-  cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
-  if (cm->color_space != SRGB) {
+  cm->color_space = vp9_rb_read_literal(rb, 3);
+  if (cm->color_space != VPX_CS_SRGB) {
     vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       cm->subsampling_x = vp9_rb_read_bit(rb);
@@ -1326,7 +1327,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
         // of either the color format or color sub-sampling in profile 0. VP9
         // specifies that the default color format should be YUV 4:2:0 in this
         // case (normative).
-        cm->color_space = BT_601;
+        cm->color_space = VPX_CS_BT_601;
         cm->subsampling_y = cm->subsampling_x = 1;
         cm->bit_depth = VPX_BITS_8;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1589,9 +1590,9 @@ void vp9_decode_frame(VP9Decoder *pbi,
     if (!xd->corrupted) {
       // If multiple threads are used to decode tiles, then we use those threads
       // to do parallel loopfiltering.
-      vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm,
-                               pbi->tile_workers, pbi->num_tile_workers,
-                               cm->lf.filter_level, 0);
+      vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,
+                               0, 0, pbi->tile_workers, pbi->num_tile_workers,
+                               &pbi->lf_row_sync);
     } else {
       vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                          "Decode failed. Frame data is corrupted.");
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 1d254d2db..7bef265b8 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -32,7 +32,6 @@
 #include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_detokenize.h"
-#include "vp9/decoder/vp9_dthread.h"
 
 static void initialize_dec(void) {
   static volatile int init_done = 0;
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 25b7339ed..1415019a1 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -15,12 +15,11 @@
 
 #include "vpx/vpx_codec.h"
 #include "vpx_scale/yv12config.h"
-
+#include "vp9/common/vp9_loopfilter_thread.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
 #include "vp9/common/vp9_thread.h"
-
-#include "vp9/decoder/vp9_dthread.h"
+#include "vp9/decoder/vp9_reader.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,6 +32,13 @@ typedef struct TileData {
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
 } TileData;
 
+typedef struct TileWorkerData {
+  VP9_COMMON *cm;
+  vp9_reader bit_reader;
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  struct vpx_internal_error_info error_info;
+} TileWorkerData;
+
 typedef struct VP9Decoder {
   DECLARE_ALIGNED(16, MACROBLOCKD, mb);
 
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
new file mode 100644
index 000000000..f505fcb7a
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+  const uint32x4_t a = vpaddlq_u16(v_16x8);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
+  uint8x8_t v_s0 = vld1_u8(s);
+  const uint8x8_t v_s1 = vld1_u8(s + p);
+  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
+
+  v_s0 = vld1_u8(s + 2 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 3 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 4 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 5 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 6 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 7 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+}
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index 6c66f5d5b..a6d4797ad 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -32,6 +32,24 @@ void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
   }
 }
 
+void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
+                            int16_t* coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t* zbin_ptr,
+                            const int16_t* round_ptr, const int16_t* quant_ptr,
+                            const int16_t* quant_shift_ptr,
+                            int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr,
+                            const int16_t* dequant_ptr, uint16_t* eob_ptr,
+                            const int16_t* scan_ptr,
+                            const int16_t* iscan_ptr) {
+  int16_t temp_buffer[64];
+  (void)coeff_ptr;
+
+  vp9_fdct8x8_neon(input, temp_buffer, stride);
+  vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
+                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+                       dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
+}
+
 void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
   int i;
   // stage 1
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
index 816fbda1f..b1ad83731 100644
--- a/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 #include "./vp9_rtcd.h"
+#include "./vpx_config.h"
 
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
@@ -28,6 +29,9 @@ enum { kHeight16PlusOne = 17 };
 enum { kWidth32 = 32 };
 enum { kHeight32 = 32 };
 enum { kHeight32PlusOne = 33 };
+enum { kWidth64 = 64 };
+enum { kHeight64 = 64 };
+enum { kHeight64PlusOne = 65 };
 enum { kPixelStepOne = 1 };
 enum { kAlign16 = 16 };
 
@@ -46,9 +50,10 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
   return vget_lane_s32(c, 0);
 }
 
+// w * h must be less than 2048 or local variable v_sum may overflow.
 static void variance_neon_w8(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
-                             int w, int h, unsigned int *sse, int *sum) {
+                             int w, int h, uint32_t *sse, int *sum) {
   int i, j;
   int16x8_t v_sum = vdupq_n_s16(0);
   int32x4_t v_sse_lo = vdupq_n_s32(0);
@@ -88,7 +93,7 @@ unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
                                   unsigned int *sse) {
   int sum;
   variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+  return *sse - (((int64_t)sum * sum) >> 6);  //  >> 6 = / 8 * 8
 }
 
 void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
@@ -103,7 +108,7 @@ unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
                                     unsigned int *sse) {
   int sum;
   variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));
+  return *sse - (((int64_t)sum * sum) >> 8);  //  >> 8 = / 16 * 16
 }
 
 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -205,7 +210,62 @@ unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
                                     unsigned int *sse) {
   int sum;
   variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32));
+  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / 32 * 32
+}
+
+unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, &sse1, &sum1);
+  variance_neon_w8(a + (kHeight32 * a_stride), a_stride,
+                   b + (kHeight32 * b_stride), b_stride, kWidth32, kHeight32,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
+}
+
+unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);
+  variance_neon_w8(a + (kHeight16 * a_stride), a_stride,
+                   b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
+}
+
+unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);
+  variance_neon_w8(a + (kHeight16 * a_stride), a_stride,
+                   b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,
+                   &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (kHeight16 * 2 * a_stride), a_stride,
+                   b + (kHeight16 * 2 * b_stride), b_stride,
+                   kWidth64, kHeight16, &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (kHeight16 * 3 * a_stride), a_stride,
+                   b + (kHeight16 * 3 * b_stride), b_stride,
+                   kWidth64, kHeight16, &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
 }
 
 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
@@ -225,3 +285,21 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
                              kWidth32, BILINEAR_FILTERS_2TAP(yoffset));
   return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);
 }
+
+unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
+                                              int src_stride,
+                                              int xoffset,
+                                              int yoffset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight64 * kWidth64);
+  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight64PlusOne * kWidth64);
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
+                             kHeight64PlusOne, kWidth64,
+                             BILINEAR_FILTERS_2TAP(xoffset));
+  var_filter_block2d_bil_w16(fdata3, temp2, kWidth64, kWidth64, kHeight64,
+                             kWidth64, BILINEAR_FILTERS_2TAP(yoffset));
+  return vp9_variance64x64_neon(temp2, kWidth64, dst, dst_stride, sse);
+}
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 19bcfd2b6..3f4ed94d6 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1055,7 +1055,7 @@ static void write_bitdepth_colorspace_sampling(
     vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
   }
   vp9_wb_write_literal(wb, cm->color_space, 3);
-  if (cm->color_space != SRGB) {
+  if (cm->color_space != VPX_CS_SRGB) {
     vp9_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index ab8533703..7d4e26aaf 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -352,6 +352,7 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
   int r;
   const uint8_t *srcbuf = src.y_buffer;
   uint8_t *destbuf = dest.y_buffer;
+
   assert(dest.y_width == src.y_width);
   assert(dest.y_height == src.y_height);
 
@@ -362,13 +363,13 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
   }
 }
 
-static void swap_frame_buffer(YV12_BUFFER_CONFIG dest,
-                              YV12_BUFFER_CONFIG src) {
-  uint8_t *tmp_buf = dest.y_buffer;
-  assert(dest.y_width == src.y_width);
-  assert(dest.y_height == src.y_height);
-  dest.y_buffer = src.y_buffer;
-  src.y_buffer = tmp_buf;
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest,
+                              YV12_BUFFER_CONFIG *src) {
+  uint8_t *tmp_buf = dest->y_buffer;
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+  dest->y_buffer = src->y_buffer;
+  src->y_buffer = tmp_buf;
 }
 
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
@@ -387,16 +388,16 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
 
   /* For non key frames */
   if (refresh_alt_ref_frame) {
-    swap_frame_buffer(denoiser->running_avg_y[ALTREF_FRAME],
-                      denoiser->running_avg_y[INTRA_FRAME]);
+    swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
   }
   if (refresh_golden_frame) {
-    swap_frame_buffer(denoiser->running_avg_y[GOLDEN_FRAME],
-                      denoiser->running_avg_y[INTRA_FRAME]);
+    swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
   }
   if (refresh_last_frame) {
-    swap_frame_buffer(denoiser->running_avg_y[LAST_FRAME],
-                      denoiser->running_avg_y[INTRA_FRAME]);
+    swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
   }
 }
 
@@ -477,13 +478,9 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
     return;
   }
   for (i = 0; i < MAX_REF_FRAMES; ++i) {
-    if (&denoiser->running_avg_y[i] != NULL) {
-      vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
-    }
-  }
-  if (&denoiser->mc_running_avg_y != NULL) {
-    vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
+    vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
   }
+  vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
 }
 
 #ifdef OUTPUT_YUV_DENOISED
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 756052771..e142a3181 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -401,55 +401,42 @@ static int set_vt_partitioning(VP9_COMP *cpi,
                                void *data,
                                BLOCK_SIZE bsize,
                                int mi_row,
-                               int mi_col) {
+                               int mi_col,
+                               int threshold,
+                               BLOCK_SIZE bsize_min) {
   VP9_COMMON * const cm = &cpi->common;
   variance_node vt;
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
   const int block_height = num_8x8_blocks_high_lookup[bsize];
-  // TODO(marpan): Adjust/tune these thresholds.
-  const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4;
-  int64_t threshold =
-      (int64_t)(threshold_multiplier *
-                vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
-  int64_t threshold_bsize_ref = threshold << 6;
-  int64_t threshold_low = threshold;
-  BLOCK_SIZE bsize_ref = BLOCK_16X16;
 
   assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
 
-  if (cm->frame_type == KEY_FRAME) {
-    bsize_ref = BLOCK_8X8;
-    // Choose lower thresholds for key frame variance to favor split, but keep
-    // threshold for splitting to 4x4 block still fairly high for now.
-    threshold_bsize_ref = threshold << 2;
-    threshold_low = threshold >> 2;
-  }
-
-  // For bsize=bsize_ref (16x16/8x8 for 8x8/4x4 downsampling), select if
+  // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
   // variance is below threshold, otherwise split will be selected.
   // No check for vert/horiz split as too few samples for variance.
-  if (bsize == bsize_ref) {
+  if (bsize == bsize_min) {
     get_variance(&vt.part_variances->none);
     if (mi_col + block_width / 2 < cm->mi_cols &&
         mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < threshold_bsize_ref) {
+        vt.part_variances->none.variance < threshold) {
       set_block_size(cpi, xd, mi_row, mi_col, bsize);
       return 1;
     }
     return 0;
-  } else if (bsize > bsize_ref) {
+  } else if (bsize > bsize_min) {
     get_variance(&vt.part_variances->none);
-    // For key frame, for bsize above 32X32, or very high variance, take split.
+    // For key frame or low_res: for bsize above 32X32 or very high variance,
+    // take split.
     if (cm->frame_type == KEY_FRAME &&
         (bsize > BLOCK_32X32 ||
-        vt.part_variances->none.variance > (threshold << 2))) {
+        vt.part_variances->none.variance > (threshold << 4))) {
       return 0;
     }
     // If variance is low, take the bsize (no split).
     if (mi_col + block_width / 2 < cm->mi_cols &&
         mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < threshold_low) {
+        vt.part_variances->none.variance < threshold) {
       set_block_size(cpi, xd, mi_row, mi_col, bsize);
       return 1;
     }
@@ -458,8 +445,8 @@ static int set_vt_partitioning(VP9_COMP *cpi,
     if (mi_row + block_height / 2 < cm->mi_rows) {
       get_variance(&vt.part_variances->vert[0]);
       get_variance(&vt.part_variances->vert[1]);
-      if (vt.part_variances->vert[0].variance < threshold_low &&
-          vt.part_variances->vert[1].variance < threshold_low) {
+      if (vt.part_variances->vert[0].variance < threshold &&
+          vt.part_variances->vert[1].variance < threshold) {
         BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
         set_block_size(cpi, xd, mi_row, mi_col, subsize);
         set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
@@ -470,8 +457,8 @@ static int set_vt_partitioning(VP9_COMP *cpi,
     if (mi_col + block_width / 2 < cm->mi_cols) {
       get_variance(&vt.part_variances->horz[0]);
       get_variance(&vt.part_variances->horz[1]);
-      if (vt.part_variances->horz[0].variance < threshold_low &&
-          vt.part_variances->horz[1].variance < threshold_low) {
+      if (vt.part_variances->horz[0].variance < threshold &&
+          vt.part_variances->horz[1].variance < threshold) {
         BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
         set_block_size(cpi, xd, mi_row, mi_col, subsize);
         set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
@@ -485,8 +472,7 @@ static int set_vt_partitioning(VP9_COMP *cpi,
 }
 
 // This function chooses partitioning based on the variance between source and
-// reconstructed last, where variance is computed for downsampled inputs.
-// Currently 8x8 downsampling is used for delta frames, 4x4 for key frames.
+// reconstructed last, where variance is computed for downs-sampled inputs.
 static void choose_partitioning(VP9_COMP *cpi,
                                 const TileInfo *const tile,
                                 MACROBLOCK *x,
@@ -496,6 +482,7 @@ static void choose_partitioning(VP9_COMP *cpi,
 
   int i, j, k, m;
   v64x64 vt;
+  v16x16 vt2[16];
   uint8_t *s;
   const uint8_t *d;
   int sp;
@@ -504,6 +491,27 @@ static void choose_partitioning(VP9_COMP *cpi,
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
   const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
 
+  // Always use 4x4 partition for key frame.
+  int use_4x4_partition = (cm->frame_type == KEY_FRAME);
+
+  int variance4x4downsample[16];
+  int low_res = (cm->width <= 352 && cm->height <= 288) ? 1 : 0;
+  const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4;
+  int64_t threshold_base = (int64_t)(threshold_multiplier *
+      vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
+  int64_t threshold = threshold_base;
+  int64_t threshold_bsize_min = threshold_base << 6;
+  int64_t threshold_bsize_max = threshold_base;
+  // Modify thresholds for key frame and for low-resolutions (set lower
+  // thresholds to favor split).
+  if (cm->frame_type == KEY_FRAME) {
+    threshold = threshold_base >> 2;
+    threshold_bsize_min = threshold_base << 2;
+  } else if (low_res) {
+    threshold_bsize_min = threshold_base << 3;
+    threshold_bsize_max = threshold_base >> 2;
+  }
+
   vp9_clear_system_state();
   set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
 
@@ -546,130 +554,169 @@ static void choose_partitioning(VP9_COMP *cpi,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
-  // Fill in the entire tree of 8x8 variances for splits.
+  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+  // for splits.
   for (i = 0; i < 4; i++) {
     const int x32_idx = ((i & 1) << 5);
     const int y32_idx = ((i >> 1) << 5);
+    const int i2 = i << 2;
     for (j = 0; j < 4; j++) {
       const int x16_idx = x32_idx + ((j & 1) << 4);
       const int y16_idx = y32_idx + ((j >> 1) << 4);
       v16x16 *vst = &vt.split[i].split[j];
-      for (k = 0; k < 4; k++) {
-        int x8_idx = x16_idx + ((k & 1) << 3);
-        int y8_idx = y16_idx + ((k >> 1) << 3);
-        if (cm->frame_type != KEY_FRAME) {
-          unsigned int sse = 0;
-          int sum = 0;
-          if (x8_idx < pixels_wide && y8_idx < pixels_high) {
-            int s_avg, d_avg;
+      variance4x4downsample[i2 + j] = 0;
+      if (cm->frame_type != KEY_FRAME) {
+        for (k = 0; k < 4; k++) {
+          int x8_idx = x16_idx + ((k & 1) << 3);
+          int y8_idx = y16_idx + ((k >> 1) << 3);
+            unsigned int sse = 0;
+            int sum = 0;
+            if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+              int s_avg, d_avg;
 #if CONFIG_VP9_HIGHBITDEPTH
-            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-              s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-              d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-            } else {
+              if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+                s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+                d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+              } else {
+                s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+                d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+             }
+#else
               s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
               d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-           }
-#else
-            s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-            d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
-            sum = s_avg - d_avg;
-            sse = sum * sum;
-          }
-          // If variance is based on 8x8 downsampling, we stop here and have
-          // one sample for 8x8 block (so use 1 for count in fill_variance),
-          // which of course means variance = 0 for 8x8 block.
-          fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
-        } else {
-          // For key frame, go down to 4x4.
-          v8x8 *vst2 = &vst->split[k];
+              sum = s_avg - d_avg;
+              sse = sum * sum;
+            }
+            // If variance is based on 8x8 downsampling, we stop here and have
+            // one sample for 8x8 block (so use 1 for count in fill_variance),
+            // which of course means variance = 0 for 8x8 block.
+            fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+        }
+        fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+        // For low-resolution, compute the variance based on 8x8 down-sampling,
+        // and if it is large (above the threshold) we go down for 4x4.
+        // For key frame we always go down to 4x4.
+        if (low_res)
+          get_variance(&vt.split[i].split[j].part_variances.none);
+      }
+      if (cm->frame_type == KEY_FRAME || (low_res &&
+          vt.split[i].split[j].part_variances.none.variance >
+          (threshold << 1))) {
+        // Go down to 4x4 down-sampling for variance.
+        variance4x4downsample[i2 + j] = 1;
+        for (k = 0; k < 4; k++) {
+          int x8_idx = x16_idx + ((k & 1) << 3);
+          int y8_idx = y16_idx + ((k >> 1) << 3);
+          v8x8 *vst2 = (cm->frame_type == KEY_FRAME) ? &vst->split[k] :
+              &vt2[i2 + j].split[k];
           for (m = 0; m < 4; m++) {
             int x4_idx = x8_idx + ((m & 1) << 2);
             int y4_idx = y8_idx + ((m >> 1) << 2);
             unsigned int sse = 0;
             int sum = 0;
             if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+              int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
               int s_avg;
               if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
                 s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+                if (cm->frame_type != KEY_FRAME)
+                  d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
               } else {
                 s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+                if (cm->frame_type != KEY_FRAME)
+                  d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
               }
 #else
               int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+              if (cm->frame_type != KEY_FRAME)
+                d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 #endif
-              // For key frame, reference is set to 128.
-              sum = s_avg - 128;
+              sum = s_avg - d_avg;
               sse = sum * sum;
             }
-            // If variance is based on 4x4 downsampling, we stop here and have
+            // If variance is based on 4x4 down-sampling, we stop here and have
             // one sample for 4x4 block (so use 1 for count in fill_variance),
             // which of course means variance = 0 for 4x4 block.
-           fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none);
+            fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none);
           }
         }
       }
     }
   }
+
   // Fill the rest of the variance tree by summing split partition values.
   for (i = 0; i < 4; i++) {
+    const int i2 = i << 2;
     for (j = 0; j < 4; j++) {
-      if (cm->frame_type == KEY_FRAME) {
+      if (variance4x4downsample[i2 + j] == 1) {
+        v16x16 *vtemp = (cm->frame_type != KEY_FRAME) ? &vt2[i2 + j] :
+            &vt.split[i].split[j];
         for (m = 0; m < 4; m++) {
-          fill_variance_tree(&vt.split[i].split[j].split[m], BLOCK_8X8);
+          fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
         }
+        fill_variance_tree(vtemp, BLOCK_16X16);
       }
-      fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
     }
     fill_variance_tree(&vt.split[i], BLOCK_32X32);
   }
   fill_variance_tree(&vt, BLOCK_64X64);
 
+
   // Now go through the entire structure,  splitting every block size until
   // we get to one that's got a variance lower than our threshold.
   if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
-      !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col)) {
+      !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
+                           threshold_bsize_max, BLOCK_16X16)) {
     for (i = 0; i < 4; ++i) {
       const int x32_idx = ((i & 1) << 2);
       const int y32_idx = ((i >> 1) << 2);
+      const int i2 = i << 2;
       if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32,
-                               (mi_row + y32_idx), (mi_col + x32_idx))) {
+                               (mi_row + y32_idx), (mi_col + x32_idx),
+                               threshold, BLOCK_16X16)) {
         for (j = 0; j < 4; ++j) {
           const int x16_idx = ((j & 1) << 1);
           const int y16_idx = ((j >> 1) << 1);
-          // Note: If 8x8 downsampling is used for variance calculation we
-          // cannot really select block size 8x8 (or even 8x16/16x8), since we
-          // don't have sufficient samples for variance. So on delta frames,
-          // 8x8 partition is only set if variance of the 16x16 block is very
-          // high. For key frames, 4x4 downsampling is used, so we can better
-          // select 8x16/16x8 and 8x8. 4x4 partition can potentially be set
-          // used here too, but for now 4x4 is not allowed.
-          if (!set_vt_partitioning(cpi, xd, &vt.split[i].split[j],
-                                   BLOCK_16X16,
+          // TODO(marpan): Allow 4x4 partitions for inter-frames.
+          // use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
+          // If 4x4 partition is not used, then 8x8 partition will be selected
+          // if variance of 16x16 block is very high, so use larger threshold
+          // for 16x16 (threshold_bsize_min) in that case.
+          uint64_t threshold_16x16 = (use_4x4_partition) ? threshold :
+                                      threshold_bsize_min;
+          BLOCK_SIZE bsize_min = (use_4x4_partition) ? BLOCK_8X8 : BLOCK_16X16;
+          // For inter frames: if variance4x4downsample[] == 1 for this 16x16
+          // block, then the variance is based on 4x4 down-sampling, so use vt2
+          // in set_vt_partioning(), otherwise use vt.
+          v16x16 *vtemp = (cm->frame_type != KEY_FRAME &&
+                           variance4x4downsample[i2 + j] == 1) ?
+                           &vt2[i2 + j] : &vt.split[i].split[j];
+          if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
                                    mi_row + y32_idx + y16_idx,
-                                   mi_col + x32_idx + x16_idx)) {
+                                   mi_col + x32_idx + x16_idx,
+                                   threshold_16x16, bsize_min)) {
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
-              if (cm->frame_type == KEY_FRAME) {
-                if (!set_vt_partitioning(cpi, xd,
-                                         &vt.split[i].split[j].split[k],
+              if (use_4x4_partition) {
+                if (!set_vt_partitioning(cpi, xd, &vtemp->split[k],
                                          BLOCK_8X8,
                                          mi_row + y32_idx + y16_idx + y8_idx,
-                                         mi_col + x32_idx + x16_idx + x8_idx)) {
-                    set_block_size(cpi, xd,
-                                  (mi_row + y32_idx + y16_idx + y8_idx),
-                                  (mi_col + x32_idx + x16_idx + x8_idx),
-                                   BLOCK_4X4);
+                                         mi_col + x32_idx + x16_idx + x8_idx,
+                                         threshold_bsize_min, BLOCK_8X8)) {
+                  set_block_size(cpi, xd,
+                                 (mi_row + y32_idx + y16_idx + y8_idx),
+                                 (mi_col + x32_idx + x16_idx + x8_idx),
+                                 BLOCK_4X4);
                 }
               } else {
                 set_block_size(cpi, xd,
                                (mi_row + y32_idx + y16_idx + y8_idx),
                                (mi_col + x32_idx + x16_idx + x8_idx),
                                BLOCK_8X8);
-               }
+              }
             }
           }
         }
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index da9f34253..c85bf2a0e 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -607,7 +607,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
 #if CONFIG_VP9_HIGHBITDEPTH
   cm->use_highbitdepth = oxcf->use_highbitdepth;
 #endif
-  cm->color_space = UNKNOWN;
+  cm->color_space = oxcf->color_space;
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
@@ -1264,6 +1264,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   if (cm->profile != oxcf->profile)
     cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
+  cm->color_space = oxcf->color_space;
 
   if (cm->profile <= PROFILE_1)
     assert(cm->bit_depth == VPX_BITS_8);
@@ -1311,6 +1312,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
 
   cm->display_width = cpi->oxcf.width;
   cm->display_height = cpi->oxcf.height;
+  cm->width = cpi->oxcf.width;
+  cm->height = cpi->oxcf.height;
 
   if (cpi->initial_width) {
     // Increasing the size of the frame beyond the first seen frame, or some
@@ -1785,7 +1788,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
 
   for (t = 0; t < cpi->num_workers; ++t) {
     VP9Worker *const worker = &cpi->workers[t];
-    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
 
     // Deallocate allocated threads.
     vp9_get_worker_interface()->end(worker);
@@ -1796,11 +1799,13 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
       vp9_free_pc_tree(thread_data->td);
       vpx_free(thread_data->td);
     }
-
-    vpx_free(worker->data1);
   }
+  vpx_free(cpi->tile_thr_data);
   vpx_free(cpi->workers);
 
+  if (cpi->num_workers > 1)
+    vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+
   dealloc_compressor_data(cpi);
 
   for (i = 0; i < sizeof(cpi->mbgraph_stats) /
@@ -2436,7 +2441,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   }
 
   if (lf->filter_level > 0) {
-    vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+    if (cpi->num_workers > 1)
+      vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
+                               lf->filter_level, 0, 0,
+                               cpi->workers, cpi->num_workers,
+                               &cpi->lf_row_sync);
+    else
+      vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
   }
 
   vp9_extend_frame_inner_borders(cm->frame_to_show);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index d256935f1..cf269c108 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -19,6 +19,7 @@
 
 #include "vp9/common/vp9_ppflags.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_loopfilter_thread.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_thread.h"
 
@@ -36,6 +37,7 @@
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_variance.h"
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #include "vp9/encoder/vp9_denoiser.h"
 #endif
@@ -231,6 +233,7 @@ typedef struct VP9EncoderConfig {
 #if CONFIG_VP9_HIGHBITDEPTH
   int use_highbitdepth;
 #endif
+  vpx_color_space_t color_space;
 } VP9EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
@@ -261,6 +264,8 @@ typedef struct ThreadData {
   PC_TREE *pc_root;
 } ThreadData;
 
+struct EncWorkerData;
+
 typedef struct VP9_COMP {
   QUANTS quants;
   ThreadData td;
@@ -446,6 +451,8 @@ typedef struct VP9_COMP {
   // Multi-threading
   int num_workers;
   VP9Worker *workers;
+  struct EncWorkerData *tile_thr_data;
+  VP9LfSync lf_row_sync;
 } VP9_COMP;
 
 void vp9_initialize_enc(void);
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index daf3da44c..12fb4d107 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -167,23 +167,24 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
     CHECK_MEM_ERROR(cm, cpi->workers,
                     vpx_malloc(num_workers * sizeof(*cpi->workers)));
 
+    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+                    vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
     for (i = 0; i < num_workers; i++) {
       VP9Worker *const worker = &cpi->workers[i];
-      EncWorkerData *thread_data;
+      EncWorkerData *thread_data = &cpi->tile_thr_data[i];
 
       ++cpi->num_workers;
-
       winterface->init(worker);
-      CHECK_MEM_ERROR(cm, worker->data1,
-                      (EncWorkerData*)vpx_calloc(1, sizeof(EncWorkerData)));
-      thread_data = (EncWorkerData*)worker->data1;
 
       if (i < num_workers - 1) {
       thread_data->cpi = cpi;
 
       // Allocate thread data.
       CHECK_MEM_ERROR(cm, thread_data->td,
-                      vpx_calloc(1, sizeof(*thread_data->td)));
+                      vpx_memalign(32, sizeof(*thread_data->td)));
+      vp9_zero(*thread_data->td);
+
       // Set up pc_tree.
       thread_data->td->leaf_tree = NULL;
       thread_data->td->pc_tree = NULL;
@@ -203,17 +204,18 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
         thread_data->td = &cpi->td;
       }
 
-      // data2 is unused.
-      worker->data2 = NULL;
-
       winterface->sync(worker);
-      worker->hook = (VP9WorkerHook)enc_worker_hook;
     }
   }
 
   for (i = 0; i < num_workers; i++) {
     VP9Worker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+    EncWorkerData *thread_data;
+
+    worker->hook = (VP9WorkerHook)enc_worker_hook;
+    worker->data1 = &cpi->tile_thr_data[i];
+    worker->data2 = NULL;
+    thread_data = (EncWorkerData*)worker->data1;
 
     // Before encoding a frame, copy the thread data from cpi.
     thread_data->td->mb = cpi->td.mb;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 74f5efbec..3b0f2f012 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1843,6 +1843,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
+  // Was the group length constrained by the requirement for a new KF?
+  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
   // Set the interval until the next gf.
   if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
     rc->baseline_gf_interval = i - 1;
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 01cc519f9..a95f0f46d 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -39,8 +39,14 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
   VP9_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
-  vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
-                        partial_frame);
+  if (cpi->num_workers > 1)
+    vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+                             filt_level, 1, partial_frame,
+                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+  else
+    vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                          1, partial_frame);
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
     filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 3cc9d9a7b..21f4cce03 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1254,7 +1254,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   // better than that already stored.
   // This is used to help set quality in forced key frames to reduce popping
   if ((qindex < rc->last_boosted_qindex) ||
-      (((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
+      (cm->frame_type == KEY_FRAME) ||
+      (!rc->constrained_gf_group &&
+       (cpi->refresh_alt_ref_frame ||
         (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
     rc->last_boosted_qindex = qindex;
   }
@@ -1358,8 +1360,12 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
     rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     // NOTE: frames_till_gf_update_due must be <= frames_to_key.
-    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+    if (rc->frames_till_gf_update_due > rc->frames_to_key) {
       rc->frames_till_gf_update_due = rc->frames_to_key;
+      rc->constrained_gf_group = 1;
+    } else {
+      rc->constrained_gf_group = 0;
+    }
     cpi->refresh_golden_frame = 1;
     rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
     rc->gfu_boost = DEFAULT_GF_BOOST;
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index cf709274d..a53f4e0a2 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -55,6 +55,7 @@ typedef struct {
   int max_gf_interval;
   int static_scene_max_gf_interval;
   int baseline_gf_interval;
+  int constrained_gf_group;
   int frames_to_key;
   int frames_since_key;
   int this_key_frame_forced;
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 375407d44..adbe0244d 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -570,10 +570,6 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
   rd->thresh_mult[THR_NEWA] += 1000;
   rd->thresh_mult[THR_NEWG] += 1000;
 
-  // Adjust threshold only in real time mode, which only uses last
-  // reference frame.
-  rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh;
-
   rd->thresh_mult[THR_NEARMV] += 1000;
   rd->thresh_mult[THR_NEARA] += 1000;
   rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 15831fbbe..81f3195fe 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -446,7 +446,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->always_this_block_size = BLOCK_16X16;
   sf->search_type_check_frequency = 50;
   sf->encode_breakout_thresh = 0;
-  sf->elevate_newmv_thresh = 0;
   // Recode loop tolerance %.
   sf->recode_tolerance = 25;
   sf->default_interp_filter = SWITCHABLE;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index c2cfd62da..eaa0accdb 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -390,9 +390,6 @@ typedef struct SPEED_FEATURES {
   // enabled in real time mode.
   int encode_breakout_thresh;
 
-  // In real time encoding, increase the threshold for NEWMV.
-  int elevate_newmv_thresh;
-
   // default interp filter choice
   INTERP_FILTER default_interp_filter;
 
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 31e93be65..82bce3780 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -279,6 +279,7 @@ static void get_layer_resolution(const int width_org, const int height_org,
 int vp9_svc_start_frame(VP9_COMP *const cpi) {
   int width = 0, height = 0;
   LAYER_CONTEXT *lc;
+  struct lookahead_entry *buf;
   int count = 1 << (cpi->svc.number_temporal_layers - 1);
 
   cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
@@ -339,8 +340,12 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
   // since its previous frame could be changed during decoding time. The idea is
   // we put a empty invisible frame in front of them, then we will not use
   // prev_mi when encoding these frames.
+
+  buf = vp9_lookahead_peek(cpi->lookahead, 0);
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
-      cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE) {
+      cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE &&
+      lc->rc.frames_to_key != 0 &&
+      !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) {
     if ((cpi->svc.number_temporal_layers > 1 &&
          cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
         (cpi->svc.number_spatial_layers > 1 &&
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 76602c2d7..58920e247 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -33,6 +33,7 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
 VP9_COMMON_SRCS-yes += common/vp9_enums.h
 VP9_COMMON_SRCS-yes += common/vp9_idct.h
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
+VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.h
 VP9_COMMON_SRCS-yes += common/vp9_mv.h
 VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
@@ -56,6 +57,7 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
+VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
@@ -82,6 +84,7 @@ VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 endif
 
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 531a03ebe..589f0b1bf 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -42,6 +42,7 @@ struct vp9_extracfg {
   unsigned int                frame_periodic_boost;
   vpx_bit_depth_t             bit_depth;
   vp9e_tune_content           content;
+  vpx_color_space_t           color_space;
 };
 
 static struct vp9_extracfg default_extra_cfg = {
@@ -64,7 +65,8 @@ static struct vp9_extracfg default_extra_cfg = {
   NO_AQ,                      // aq_mode
   0,                          // frame_periodic_delta_q
   VPX_BITS_8,                 // Bit depth
-  VP9E_CONTENT_DEFAULT        // content
+  VP9E_CONTENT_DEFAULT,       // content
+  VPX_CS_UNKNOWN,             // color space
 };
 
 struct vpx_codec_alg_priv {
@@ -81,6 +83,7 @@ struct vpx_codec_alg_priv {
   size_t                  pending_frame_sizes[8];
   size_t                  pending_frame_magnitude;
   vpx_image_t             preview_img;
+  vpx_enc_frame_flags_t   next_frame_flags;
   vp8_postproc_cfg_t      preview_ppcfg;
   vpx_codec_pkt_list_decl(256) pkt_list;
   unsigned int                 fixed_kf_cntr;
@@ -294,7 +297,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
       cfg->g_bit_depth == VPX_BITS_8) {
     ERROR("Codec bit-depth 8 not supported in profile > 1");
   }
-
+  RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
   return VPX_CODEC_OK;
 }
 
@@ -437,6 +440,7 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->firstpass_mb_stats_in  = cfg->rc_firstpass_mb_stats_in;
 #endif
 
+  oxcf->color_space = extra_cfg->color_space;
   oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
   oxcf->arnr_strength   = extra_cfg->arnr_strength;
 
@@ -521,9 +525,17 @@ static vpx_codec_err_t set_encoder_config(
 static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
                                           const vpx_codec_enc_cfg_t  *cfg) {
   vpx_codec_err_t res;
-
-  if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
-    ERROR("Cannot change width or height after initialization");
+  int force_key = 0;
+
+  if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+    if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+      ERROR("Cannot change width or height after initialization");
+    if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+        (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+      ERROR("Cannot increase width or height larger than their initial values");
+    if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h))
+      force_key = 1;
+  }
 
   // Prevent increasing lag_in_frames. This check is stricter than it needs
   // to be -- the limit is not increasing past the first lag_in_frames
@@ -540,6 +552,9 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
 
+  if (force_key)
+    ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF;
+
   return res;
 }
 
@@ -952,10 +967,11 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
 
       // Store the original flags in to the frame buffer. Will extract the
       // key frame flag when we actually encode this frame.
-      if (vp9_receive_raw_frame(cpi, flags,
+      if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags,
                                 &sd, dst_time_stamp, dst_end_time_stamp)) {
         res = update_error_state(ctx, &cpi->common.error);
       }
+      ctx->next_frame_flags = 0;
     }
 
     cx_data = ctx->cx_data;
@@ -1322,6 +1338,13 @@ static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.color_space = CAST(VP9E_SET_COLOR_SPACE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,                ctrl_copy_reference},
   {VP8E_UPD_ENTROPY,                  ctrl_update_entropy},
@@ -1357,6 +1380,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP9E_REGISTER_CX_CALLBACK,         ctrl_register_cx_callback},
   {VP9E_SET_SVC_LAYER_ID,             ctrl_set_svc_layer_id},
   {VP9E_SET_TUNE_CONTENT,             ctrl_set_tune_content},
+  {VP9E_SET_COLOR_SPACE,              ctrl_set_color_space},
   {VP9E_SET_NOISE_SENSITIVITY,        ctrl_set_noise_sensitivity},
 
   // Getters
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index b9177876e..c0e429736 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -95,12 +95,11 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
 
 static int parse_bitdepth_colorspace_sampling(
     BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) {
-  const int sRGB = 7;
-  int colorspace;
+  vpx_color_space_t color_space;
   if (profile >= PROFILE_2)
     rb->bit_offset += 1;  // Bit-depth 10 or 12.
-  colorspace = vp9_rb_read_literal(rb, 3);
-  if (colorspace != sRGB) {
+  color_space = (vpx_color_space_t)vp9_rb_read_literal(rb, 3);
+  if (color_space != VPX_CS_SRGB) {
     rb->bit_offset += 1;  // [16,235] (including xvycc) vs [0,255] range.
     if (profile == PROFILE_1 || profile == PROFILE_3) {
       rb->bit_offset += 2;  // subsampling x/y.
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index 00fbfdd7d..e585aa147 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -34,6 +34,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
       bps = 12;
     }
   }
+  img->cs = yv12->color_space;
   img->bit_depth = 8;
   img->w = yv12->y_stride;
   img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
@@ -92,6 +93,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
 
   yv12->y_stride = img->stride[VPX_PLANE_Y];
   yv12->uv_stride = img->stride[VPX_PLANE_U];
+  yv12->color_space = img->cs;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index c75fd8a01..33a1e6735 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -150,6 +150,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 1fcb36f66..603158a9c 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -21,8 +21,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.h
 VP9_DX_SRCS-yes += decoder/vp9_reader.h
 VP9_DX_SRCS-yes += decoder/vp9_reader.c
 VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 4fc0fd62f..a920ee3f9 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -10,7 +10,7 @@
 #ifndef VPX_VP8CX_H_
 #define VPX_VP8CX_H_
 
-/*!\defgroup vp8_encoder WebM VP8 Encoder
+/*!\defgroup vp8_encoder WebM VP8/VP9 Encoder
  * \ingroup vp8
  *
  * @{
@@ -18,7 +18,7 @@
 #include "./vp8.h"
 
 /*!\file
- * \brief Provides definitions for using the VP8 encoder algorithm within the
+ * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the
  *        vpx Codec Interface.
  */
 
@@ -28,17 +28,20 @@ extern "C" {
 
 /*!\name Algorithm interface for VP8
  *
- * This interface provides the capability to encode raw VP8 streams, as would
- * be found in AVI files.
+ * This interface provides the capability to encode raw VP8 streams.
  * @{
  */
 extern vpx_codec_iface_t  vpx_codec_vp8_cx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
+/*!@} - end algorithm interface member group*/
 
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to encode raw VP9 streams.
+ * @{
+ */
 extern vpx_codec_iface_t  vpx_codec_vp9_cx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
-
 /*!@} - end algorithm interface member group*/
 
 
@@ -234,20 +237,111 @@ enum vp8e_enc_control_id {
 
   VP8E_SET_SCREEN_CONTENT_MODE,  /**<control function to set encoder screen content mode */
 
-  /* TODO(jkoleszar): Move to vp9cx.h */
+  /*!\brief Codec control function to set lossless encoding mode
+   *
+   * VP9 can operate in lossless encoding mode, in which the bitstream
+   * produced will be able to decode and reconstruct a perfect copy of
+   * input source. This control function provides a mean to switch encoder
+   * into lossless coding mode(1) or normal coding mode(0) that may be lossy.
+   *                          0 = lossy coding mode
+   *                          1 = lossless coding mode
+   *
+   *  By default, encoder operates in normal coding mode (maybe lossy).
+   */
   VP9E_SET_LOSSLESS,
+
+  /*!\brief Codec control function to set number of tile columns
+   *
+   * In encoding and decoding, VP9 allows an input image frame be partitioned
+   * into separated vertical tile columns, which can be encoded or decoded
+   * independently. This enables easy implementation of parallel encoding and
+   * decoding. This control requests the encoder to use column tiles in
+   * encoding an input frame, with number of tile columns (in Log2 unit) as
+   * the parameter:
+   *             0 = 1 tile column
+   *             1 = 2 tile columns
+   *             2 = 4 tile columns
+   *             .....
+   *             n = 2**n tile columns
+   * The requested tile columns will be capped by encoder based on image size
+   * limitation (The minimum width of a tile column is 256 pixel, the maximum
+   * is 4096).
+   *
+   * By default, the value is 0, i.e. one single column tile for entire image.
+   */
   VP9E_SET_TILE_COLUMNS,
+
+  /*!\brief Codec control function to set number of tile rows
+   *
+   * In encoding and decoding, VP9 allows an input image frame be partitioned
+   * into separated horizontal tile rows. Tile rows are encoded or decoded
+   * sequentially. Even though encoding/decoding of later tile rows depends on
+   * earlier ones, this allows the encoder to output data packets for tile rows
+   * prior to completely processing all tile rows in a frame, thereby reducing
+   * the latency in processing between input and output. The parameter
+   * for this control describes the number of tile rows, which has a valid
+   * range [0, 2]:
+   *            0 = 1 tile row
+   *            1 = 2 tile rows
+   *            2 = 4 tile rows
+   *
+   * By default, the value is 0, i.e. one single row tile for entire image.
+   */
   VP9E_SET_TILE_ROWS,
+
+  /*!\brief Codec control function to enable frame parallel decoding feature
+   *
+   * VP9 has a bitstream feature to reduce decoding dependency between frames
+   * by turning off backward update of probability context used in encoding
+   * and decoding. This allows staged parallel processing of more than one
+   * video frames in the decoder. This control function provides a mean to
+   * turn this feature on or off for bitstreams produced by encoder.
+   *
+   * By default, this feature is off.
+   */
   VP9E_SET_FRAME_PARALLEL_DECODING,
+
+  /*!\brief Codec control function to set adaptive quantization mode
+   *
+   * VP9 has a segment based feature that allows encoder to adaptively change
+   * quantization parameter for each segment within a frame to improve the
+   * subjective quality. This control makes encoder operate in one of the
+   * several AQ_modes supported.
+   *
+   * By default, encoder operates with AQ_Mode 0(adaptive quantization off).
+   */
   VP9E_SET_AQ_MODE,
+
+  /*!\brief Codec control function to enable/disable periodic Q boost
+   *
+   * One VP9 encoder speed feature is to enable quality boost by lowering
+   * frame level Q periodically. This control function provides a mean to
+   * turn on/off this feature.
+   *               0 = off
+   *               1 = on
+   *
+   * By default, the encoder is allowed to use this feature for appropriate
+   * encoding modes.
+   */
   VP9E_SET_FRAME_PERIODIC_BOOST,
+
   /*!\brief control function to set noise sensitivity
    *
    *  0: off, 1: OnYOnly
    */
   VP9E_SET_NOISE_SENSITIVITY,
 
+  /*!\brief control function to turn on/off SVC in encoder.
+   * \note Return value is VPX_CODEC_INVALID_PARAM if the encoder does not
+   *       support SVC in its current encoding mode
+   *  0: off, 1: on
+   */
   VP9E_SET_SVC,
+
+  /*!\brief control function to set parameters for SVC.
+   * \note Parameters contain min_q, max_q, scaling factor for each of the
+   *       SVC layers.
+   */
   VP9E_SET_SVC_PARAMETERS,
 
   /*!\brief control function to set svc layer for spatial and temporal.
@@ -256,9 +350,38 @@ enum vp8e_enc_control_id {
    *                     temporal layer.
    */
   VP9E_SET_SVC_LAYER_ID,
+
+  /*!\brief control function to set content type.
+   * \note Valid parameter range:
+   *              VP9E_CONTENT_DEFAULT = Regular video content (Default)
+   *              VP9E_CONTENT_SCREEN  = Screen capture content
+   */
   VP9E_SET_TUNE_CONTENT,
+
+  /*!\brief control function to get svc layer ID.
+   * \note The layer ID returned is for the data packet from the registered
+   *       callback function.
+   */
   VP9E_GET_SVC_LAYER_ID,
+
+  /*!\brief control function to register callback for getting per layer packet.
+   * \note Parameter for this control function is a structure with a callback
+   *       function and a pointer to private data used by the callback.
+   */
   VP9E_REGISTER_CX_CALLBACK,
+
+  /*!\brief control function to set color space info.
+   * \note Valid ranges: 0..7, default is "UNKNOWN".
+   *                     0 = UNKNOWN,
+   *                     1 = BT_601
+   *                     2 = BT_709
+   *                     3 = SMPTE_170
+   *                     4 = SMPTE_240
+   *                     5 = BT_2020
+   *                     6 = RESERVED
+   *                     7 = SRGB
+   */
+  VP9E_SET_COLOR_SPACE,
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -423,6 +546,8 @@ VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
 VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY,  unsigned int)
 
 VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
+
+VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 5cc25cd6a..3ba72fe3c 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -9,13 +9,13 @@
  */
 
 
-/*!\defgroup vp8_decoder WebM VP8 Decoder
+/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder
  * \ingroup vp8
  *
  * @{
  */
 /*!\file
- * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder
+ * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder
  *        interface.
  */
 #ifndef VPX_VP8DX_H_
@@ -30,14 +30,18 @@ extern "C" {
 
 /*!\name Algorithm interface for VP8
  *
- * This interface provides the capability to decode raw VP8 streams, as would
- * be found in AVI files and other non-Flash uses.
+ * This interface provides the capability to decode VP8 streams.
  * @{
  */
 extern vpx_codec_iface_t  vpx_codec_vp8_dx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
+/*!@} - end algorithm interface member group*/
 
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to decode VP9 streams.
+ * @{
+ */
 extern vpx_codec_iface_t  vpx_codec_vp9_dx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp9_dx(void);
 /*!@} - end algorithm interface member group*/
@@ -85,7 +89,14 @@ enum vp8_dec_control_id {
    */
   VP9_SET_BYTE_ALIGNMENT,
 
-  /** For testing. */
+  /** control function to invert the decoding order to from right to left. The
+   * function is used in a test to confirm the decoding independence of tile
+   * columns. The function may be used in application where this order
+   * of decoding is desired.
+   *
+   * TODO(yaowu): Rework the unit test that uses this control, and in a future
+   *              release, this test-only control shall be removed.
+   */
   VP9_INVERT_TILE_DECODE_ORDER,
 
   VP8_DECODER_CTRL_ID_MAX
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index da5bd0659..8f7bff518 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -232,8 +232,8 @@ extern "C" {
 
   /*!\brief Callback function pointer / user data pair storage */
   typedef struct vpx_codec_enc_output_cx_cb_pair {
-    vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt;
-    void                            *user_priv;
+    vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */
+    void                            *user_priv; /**< Pointer to private data */
   } vpx_codec_priv_output_cx_pkt_cb_pair_t;
 
   /*!\brief Rational Number
@@ -737,10 +737,10 @@ extern "C" {
    *
    */
   typedef struct vpx_svc_parameters {
-    int max_quantizers[VPX_SS_MAX_LAYERS];
-    int min_quantizers[VPX_SS_MAX_LAYERS];
-    int scaling_factor_num[VPX_SS_MAX_LAYERS];
-    int scaling_factor_den[VPX_SS_MAX_LAYERS];
+    int max_quantizers[VPX_SS_MAX_LAYERS]; /**< Max Q for each layer */
+    int min_quantizers[VPX_SS_MAX_LAYERS]; /**< Min Q for each layer */
+    int scaling_factor_num[VPX_SS_MAX_LAYERS]; /**< Scaling factor-numerator*/
+    int scaling_factor_den[VPX_SS_MAX_LAYERS]; /**< Scaling factor-denominator*/
   } vpx_svc_extra_cfg_t;
 
 
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 337e4c4be..c06d35101 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -28,7 +28,7 @@ extern "C" {
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/
 
 
 #define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format. */
@@ -66,9 +66,22 @@ extern "C" {
     VPX_IMG_FMT_I44016    = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH
   } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
+  /*!\brief List of supported color spaces */
+  typedef enum vpx_color_space {
+    VPX_CS_UNKNOWN    = 0,  /**< Unknown */
+    VPX_CS_BT_601     = 1,  /**< BT.601 */
+    VPX_CS_BT_709     = 2,  /**< BT.709 */
+    VPX_CS_SMPTE_170  = 3,  /**< SMPTE.170 */
+    VPX_CS_SMPTE_240  = 4,  /**< SMPTE.240 */
+    VPX_CS_BT_2020    = 5,  /**< BT.2020 */
+    VPX_CS_RESERVED   = 6,  /**< Reserved */
+    VPX_CS_SRGB       = 7   /**< sRGB */
+  } vpx_color_space_t; /**< alias for enum vpx_color_space */
+
   /**\brief Image Descriptor */
   typedef struct vpx_image {
     vpx_img_fmt_t fmt; /**< Image Format */
+    vpx_color_space_t cs; /**< Color Space */
 
     /* Image storage dimensions */
     unsigned int  w;           /**< Stored image width */
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index 81c2b8b87..ae349fb84 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -13,6 +13,7 @@
 #define VPX_PORTS_X86_H_
 #include <stdlib.h>
 #include "vpx_config.h"
+#include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -104,6 +105,37 @@ void __cpuid(int CPUInfo[4], int info_type);
 #endif
 #endif /* end others */
 
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile (
+    ".byte 0x0f, 0x01, 0xd0\n"
+    : "=a"(eax), "=d"(edx) : "c" (ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx  // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
+#endif
+
 #define HAS_MMX     0x01
 #define HAS_SSE     0x02
 #define HAS_SSE2    0x04
@@ -156,14 +188,17 @@ x86_simd_caps(void) {
 
   if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
 
-  if (reg_ecx & BIT(28)) flags |= HAS_AVX;
+  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+    if ((xgetbv() & 0x6) == 0x6) {
+      flags |= HAS_AVX;
 
-  /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
-  reg_eax = 7;
-  reg_ecx = 0;
-  cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+      /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+      cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
 
-  if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+      if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+    }
+  }
 
   return flags & mask;
 }
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index f04dee1e8..6cdc235fe 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -55,6 +55,7 @@ typedef struct yv12_buffer_config {
   int subsampling_x;
   int subsampling_y;
   unsigned int bit_depth;
+  vpx_color_space_t color_space;
 
   int corrupted;
   int flags;
diff --git a/vpxenc.c b/vpxenc.c
index e14c5eaab..944dfa8b5 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -442,7 +442,7 @@ static const int vp9_arg_ctrl_map[] = {
   VP8E_SET_MAX_INTER_BITRATE_PCT, VP8E_SET_GF_CBR_BOOST_PCT,
   VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
   VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
-  VP9E_SET_TUNE_CONTENT,
+  VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
   0
 };
 #endif