60 files changed, 1651 insertions, 1162 deletions
diff --git a/build/make/Android.mk b/build/make/Android.mk
index db0cebff5..cf6221017 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -112,12 +112,12 @@ endef
 # Use ads2gas script to convert from RVCT format to GAS format.  This passes
 #  puts the processed file under $(ASM_CNV_PATH).  Local clean rule
 #  to handle removing these
-ASM_CNV_OFFSETS_DEPEND = $(ASM_CNV_PATH)/asm_com_offsets.asm
+ASM_CNV_OFFSETS_DEPEND = $(ASM_CNV_PATH)/vp8_asm_com_offsets.asm
 ifeq ($(CONFIG_VP8_DECODER), yes)
-  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_dec_offsets.asm
+  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_dec_offsets.asm
 endif
 ifeq ($(CONFIG_VP8_ENCODER), yes)
-  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_enc_offsets.asm
+  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm
 endif
 
 .PRECIOUS: %.asm.s
@@ -190,19 +190,19 @@ clean:
 include $(BUILD_SHARED_LIBRARY)
 
 $(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/asm_com_offsets.asm, \
-    $(LIBVPX_PATH)/vp8/common/asm_com_offsets.c))
+    $(ASM_CNV_PATH)/vp8_asm_com_offsets.asm, \
+    $(LIBVPX_PATH)/vp8/common/vp8_asm_com_offsets.c))
 
 ifeq ($(CONFIG_VP8_DECODER), yes)
   $(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/asm_dec_offsets.asm, \
-    $(LIBVPX_PATH)/vp8/decoder/asm_dec_offsets.c))
+    $(ASM_CNV_PATH)/vp8_asm_dec_offsets.asm, \
+    $(LIBVPX_PATH)/vp8/decoder/vp8_asm_dec_offsets.c))
 endif
 
 ifeq ($(CONFIG_VP8_ENCODER), yes)
   $(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/asm_enc_offsets.asm, \
-    $(LIBVPX_PATH)/vp8/encoder/asm_enc_offsets.c))
+    $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm, \
+    $(LIBVPX_PATH)/vp8/encoder/vp8_asm_enc_offsets.c))
 endif
 
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
diff --git a/build/make/Makefile b/build/make/Makefile
index da7fb03a0..4ac5bcf1f 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -377,7 +377,7 @@ ifneq ($(call enabled,DIST-SRCS),)
     DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/yasm.rules
     DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/obj_int_extract.bat
     DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
-    # Include obj_int_extract if we use offsets from asm_*_offsets
+    # Include obj_int_extract if we use offsets from *_asm_*_offsets
     DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64)    += build/make/obj_int_extract.c
     DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
     DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas_apple.pl
diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c
index f86cec2ac..1604b5e68 100644
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -144,7 +144,7 @@ int parse_macho(uint8_t *base_buf, size_t sz) {
           /* Location of string is cacluated each time from the
            * start of the string buffer.  On darwin the symbols
            * are prefixed by "_", so we bump the pointer by 1.
-           * The target value is defined as an int in asm_*_offsets.c,
+           * The target value is defined as an int in *_asm_*_offsets.c,
            * which is 4 bytes on all targets we currently use.
            */
           if (bits == 32) {
@@ -446,7 +446,7 @@ int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
             if (strcmp(section_name, ".bss")) {
               if (sizeof(val) != sym.st_size) {
                 /* The target value is declared as an int in
-                 * asm_*_offsets.c, which is 4 bytes on all
+                 * *_asm_*_offsets.c, which is 4 bytes on all
                  * targets we currently use. Complain loudly if
                  * this is not true.
                  */
@@ -528,7 +528,7 @@ int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
             if ((strcmp(section_name, ".bss"))) {
               if (sizeof(val) != sym.st_size) {
                 /* The target value is declared as an int in
-                 * asm_*_offsets.c, which is 4 bytes on all
+                 * *_asm_*_offsets.c, which is 4 bytes on all
                  * targets we currently use. Complain loudly if
                  * this is not true.
                  */
diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat
index 70b39f68a..47fef974c 100644
--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -14,10 +14,10 @@ obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
 obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
 obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"
 
-cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c"
-obj_int_extract.exe rvds "asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
-obj_int_extract.exe rvds "asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
-obj_int_extract.exe rvds "asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/common/vp8_asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/vp8_asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c"
+obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
+obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
+obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
 
diff --git a/configure b/configure
index 8cd3b9391..5f2c39183 100755
--- a/configure
+++ b/configure
@@ -299,6 +299,7 @@ CONFIG_LIST="
     multi_res_encoding
     temporal_denoising
     experimental
+    decrypt
     ${EXPERIMENT_LIST}
 "
 CMDLINE_SELECT="
@@ -348,6 +349,7 @@ CMDLINE_SELECT="
     multi_res_encoding
     temporal_denoising
     experimental
+    decrypt
 "
 
 process_cmdline() {
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index da43310c1..1f6d54064 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -14,18 +14,13 @@
 #include "test/video_source.h"
 
 namespace libvpx_test {
-void Decoder::DecodeFrame(const uint8_t *cxdata, int size) {
-  if (!decoder_.priv) {
-    const vpx_codec_err_t res_init = vpx_codec_dec_init(&decoder_,
-                                                        CodecInterface(),
-                                                        &cfg_, 0);
-    ASSERT_EQ(VPX_CODEC_OK, res_init) << DecodeError();
-  }
 
+vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, int size) {
   vpx_codec_err_t res_dec;
+  InitOnce();  // Creates the codec context the first time through.
   REGISTER_STATE_CHECK(res_dec = vpx_codec_decode(&decoder_,
                                                   cxdata, size, NULL, 0));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << DecodeError();
+  return res_dec;  // Status is handed back; callers do the asserting.
 }
 
 void DecoderTest::RunLoop(CompressedVideoSource *video) {
@@ -35,7 +30,9 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) {
 
   // Decode frames.
   for (video->Begin(); video->cxdata(); video->Next()) {
-    decoder->DecodeFrame(video->cxdata(), video->frame_size());
+    vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
+                                                   video->frame_size());
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
 
     DxDataIterator dec_iter = decoder->GetDxData();
     const vpx_image_t *img = NULL;
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index ed7069004..49e7384f4 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -42,7 +42,7 @@ class DxDataIterator {
 class Decoder {
  public:
   Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
-      : cfg_(cfg), deadline_(deadline) {
+      : cfg_(cfg), deadline_(deadline), init_done_(false) {
     memset(&decoder_, 0, sizeof(decoder_));
   }
 
@@ -50,7 +50,7 @@ class Decoder {
     vpx_codec_destroy(&decoder_);
   }
 
-  void DecodeFrame(const uint8_t *cxdata, int size);
+  vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, int size);
 
   DxDataIterator GetDxData() {
     return DxDataIterator(&decoder_);
@@ -61,21 +61,39 @@ class Decoder {
   }
 
   void Control(int ctrl_id, int arg) {
+    InitOnce();
     const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
   }
 
- protected:
-  virtual const vpx_codec_iface_t* CodecInterface() const = 0;
+  void Control(int ctrl_id, const void *arg) {  // Pointer-valued control.
+    InitOnce();  // A control may be issued before any frame is decoded.
+    const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+  }
 
   const char* DecodeError() {
     const char *detail = vpx_codec_error_detail(&decoder_);
     return detail ? detail : vpx_codec_error(&decoder_);
   }
 
+ protected:
+  virtual const vpx_codec_iface_t* CodecInterface() const = 0;
+
+  void InitOnce() {  // Initializes decoder_ on the first call; no-op after.
+    if (!init_done_) {
+      const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
+                                                     CodecInterface(),
+                                                     &cfg_, 0);
+      ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+      init_done_ = true;  // Not reached if the fatal assertion above fires.
+    }
+  }
+
   vpx_codec_ctx_t     decoder_;
   vpx_codec_dec_cfg_t cfg_;
   unsigned int        deadline_;
+  bool                init_done_;
 };
 
 // Common test functionality for all Decoder tests.
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 75921aa02..eed3e33af 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -175,8 +175,9 @@ void EncoderTest::RunLoop(VideoSource *video) {
           case VPX_CODEC_CX_FRAME_PKT:
             has_cxdata = true;
             if (decoder && DoDecode()) {
-              decoder->DecodeFrame((const uint8_t*)pkt->data.frame.buf,
-                                   pkt->data.frame.sz);
+              vpx_codec_err_t res_dec = decoder->DecodeFrame(
+                  (const uint8_t*)pkt->data.frame.buf, pkt->data.frame.sz);
+              ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
               has_dxdata = true;
             }
             ASSERT_GE(pkt->data.frame.pts, last_pts_);
diff --git a/test/test.mk b/test/test.mk
index 793fbf8b2..0d069d026 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -31,6 +31,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
 
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += test_vector_test.cc
+
 ##
 ## WHITE BOX TESTS
 ##
@@ -55,6 +56,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 
 endif # VP8
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index acff0fdfb..711d0bd45 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -32,10 +32,9 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
     cfg.w = 704;
     cfg.h = 144;
     cfg.threads = 1;
-    cfg.inv_tile_order = 0;
     fw_dec_ = codec_->CreateDecoder(cfg, 0);
-    cfg.inv_tile_order = 1;
     inv_dec_ = codec_->CreateDecoder(cfg, 0);
+    inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1);
   }
 
   virtual ~TileIndependenceTest() {
diff --git a/test/vp8_boolcoder_test.cc b/test/vp8_boolcoder_test.cc
index 4e21be8c5..ab19c3412 100644
--- a/test/vp8_boolcoder_test.cc
+++ b/test/vp8_boolcoder_test.cc
@@ -26,6 +26,20 @@ extern "C" {
 
 namespace {
 const int num_tests = 10;
+
+void encrypt_buffer(uint8_t *buffer, int size, const uint8_t *key) {
+  for (int i = 0; i < size; ++i) {
+    buffer[i] ^= key[i % 32];  // XOR in place with the repeating 32-byte key.
+  }
+}
+
+const uint8_t secret_key[32] = {
+  234,  32,   2,  3,  4, 230,   6,  11,
+    0, 132,  22, 23, 45,  21, 124, 255,
+    0,  43,  52,  3, 23,  63,  99,   7,
+  120,   8, 252, 84,  4,  83,   6,  13
+};
+
 }  // namespace
 
 using libvpx_test::ACMRandom;
@@ -71,7 +85,12 @@ TEST(VP8, TestBitIO) {
         vp8_stop_encode(&bw);
 
         BOOL_DECODER br;
-        vp8dx_start_decode(&br, bw_buffer, buffer_size);
+
+#if CONFIG_DECRYPT
+        encrypt_buffer(bw_buffer, buffer_size, secret_key);
+#endif
+
+        vp8dx_start_decode(&br, bw_buffer, buffer_size, bw_buffer, secret_key);
         bit_rnd.Reset(random_seed);
         for (int i = 0; i < bits_to_test; ++i) {
           if (bit_method == 2) {
diff --git a/test/vp8_decrypt_test.cc b/test/vp8_decrypt_test.cc
new file mode 100644
index 000000000..ea7b92049
--- /dev/null
+++ b/test/vp8_decrypt_test.cc
@@ -0,0 +1,84 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+
+#if CONFIG_DECRYPT
+
+namespace {
+
+// Non-zero key used to show that an unencrypted stream no longer decodes.
+const uint8_t decrypt_key[32] = {
+  255, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+// Concrete VP8 decoder. libvpx_test::Decoder declares CodecInterface() as
+// pure virtual, so the base class cannot be instantiated directly.
+class VP8TestDecoder : public libvpx_test::Decoder {
+ public:
+  VP8TestDecoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+      : libvpx_test::Decoder(cfg, deadline) {}
+
+ protected:
+  virtual const vpx_codec_iface_t* CodecInterface() const {
+    return &vpx_codec_vp8_dx_algo;
+  }
+};
+
+}  // namespace
+
+namespace libvpx_test {
+
+// Setting a NULL decrypt key must be rejected with VPX_CODEC_INVALID_PARAM.
+TEST(TestDecrypt, NullKey) {
+  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_ctx_t decoder = {0};
+  vpx_codec_err_t res = vpx_codec_dec_init(&decoder, &vpx_codec_vp8_dx_algo,
+                                           &cfg, 0);
+  ASSERT_EQ(VPX_CODEC_OK, res);
+
+  res = vpx_codec_control(&decoder, VP8_SET_DECRYPT_KEY, NULL);
+  // Destroy the context before asserting so a failed check cannot leak it.
+  vpx_codec_destroy(&decoder);
+  ASSERT_EQ(VPX_CODEC_INVALID_PARAM, res);
+}
+
+// A clear stream decodes with the default (zero) key, but decoding must fail
+// once a non-zero key is installed.
+TEST(TestDecrypt, DecryptWorks) {
+  libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf");
+  video.Init();
+
+  vpx_codec_dec_cfg_t dec_cfg = {0};
+  VP8TestDecoder decoder(dec_cfg, 0);
+
+  // Zero decrypt key (by default)
+  video.Begin();
+  vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size());
+  ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+
+  // Non-zero decrypt key
+  video.Next();
+  decoder.Control(VP8_SET_DECRYPT_KEY, decrypt_key);
+  res = decoder.DecodeFrame(video.cxdata(), video.frame_size());
+  ASSERT_NE(VPX_CODEC_OK, res) << decoder.DecodeError();
+}
+
+}  // namespace libvpx_test
+
+#endif  // CONFIG_DECRYPT
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
index 8681b7a6a..19857a7e9 100644
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -156,39 +156,38 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm,
             continue;
         }
 
-        lvl_ref = lvl_seg;
-
         /* INTRA_FRAME */
         ref = INTRA_FRAME;
 
         /* Apply delta for reference frame */
-        lvl_ref += mbd->ref_lf_deltas[ref];
+        lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
 
         /* Apply delta for Intra modes */
         mode = 0; /* B_PRED */
         /* Only the split mode BPRED has a further special case */
-        lvl_mode = lvl_ref +  mbd->mode_lf_deltas[mode];
-        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+        lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+        /* clamp */
+        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
 
         lfi->lvl[seg][ref][mode] = lvl_mode;
 
         mode = 1; /* all the rest of Intra modes */
-        lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref)  : 0; /* clamp */
+        /* clamp */
+        lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0;
         lfi->lvl[seg][ref][mode] = lvl_mode;
 
         /* LAST, GOLDEN, ALT */
         for(ref = 1; ref < MAX_REF_FRAMES; ref++)
         {
-            int lvl_ref = lvl_seg;
-
             /* Apply delta for reference frame */
-            lvl_ref += mbd->ref_lf_deltas[ref];
+            lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
 
             /* Apply delta for Inter modes */
             for (mode = 1; mode < 4; mode++)
             {
                 lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
-                lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+                /* clamp */
+                lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
 
                 lfi->lvl[seg][ref][mode] = lvl_mode;
             }
diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c
index 8235f6e9f..1d51696ff 100644
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -54,7 +54,7 @@ static void vp8_filter(signed char mask, uc hev, uc *op1,
 {
     signed char ps0, qs0;
     signed char ps1, qs1;
-    signed char vp8_filter, Filter1, Filter2;
+    signed char filter_value, Filter1, Filter2;
     signed char u;
 
     ps1 = (signed char) * op1 ^ 0x80;
@@ -63,35 +63,35 @@ static void vp8_filter(signed char mask, uc hev, uc *op1,
     qs1 = (signed char) * oq1 ^ 0x80;
 
     /* add outer taps if we have high edge variance */
-    vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
-    vp8_filter &= hev;
+    filter_value = vp8_signed_char_clamp(ps1 - qs1);
+    filter_value &= hev;
 
     /* inner taps */
-    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
-    vp8_filter &= mask;
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+    filter_value &= mask;
 
     /* save bottom 3 bits so that we round one side +4 and the other +3
      * if it equals 4 we'll set to adjust by -1 to account for the fact
      * we'd round 3 the other way
      */
-    Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
-    Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+    Filter1 = vp8_signed_char_clamp(filter_value + 4);
+    Filter2 = vp8_signed_char_clamp(filter_value + 3);
     Filter1 >>= 3;
     Filter2 >>= 3;
     u = vp8_signed_char_clamp(qs0 - Filter1);
     *oq0 = u ^ 0x80;
     u = vp8_signed_char_clamp(ps0 + Filter2);
     *op0 = u ^ 0x80;
-    vp8_filter = Filter1;
+    filter_value = Filter1;
 
     /* outer tap adjustments */
-    vp8_filter += 1;
-    vp8_filter >>= 1;
-    vp8_filter &= ~hev;
+    filter_value += 1;
+    filter_value >>= 1;
+    filter_value &= ~hev;
 
-    u = vp8_signed_char_clamp(qs1 - vp8_filter);
+    u = vp8_signed_char_clamp(qs1 - filter_value);
     *oq1 = u ^ 0x80;
-    u = vp8_signed_char_clamp(ps1 + vp8_filter);
+    u = vp8_signed_char_clamp(ps1 + filter_value);
     *op1 = u ^ 0x80;
 
 }
@@ -162,7 +162,7 @@ static void vp8_mbfilter(signed char mask, uc hev,
                            uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
 {
     signed char s, u;
-    signed char vp8_filter, Filter1, Filter2;
+    signed char filter_value, Filter1, Filter2;
     signed char ps2 = (signed char) * op2 ^ 0x80;
     signed char ps1 = (signed char) * op1 ^ 0x80;
     signed char ps0 = (signed char) * op0 ^ 0x80;
@@ -171,11 +171,11 @@ static void vp8_mbfilter(signed char mask, uc hev,
     signed char qs2 = (signed char) * oq2 ^ 0x80;
 
     /* add outer taps if we have high edge variance */
-    vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
-    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
-    vp8_filter &= mask;
+    filter_value = vp8_signed_char_clamp(ps1 - qs1);
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+    filter_value &= mask;
 
-    Filter2 = vp8_filter;
+    Filter2 = filter_value;
     Filter2 &= hev;
 
     /* save bottom 3 bits so that we round one side +4 and the other +3 */
@@ -188,8 +188,8 @@ static void vp8_mbfilter(signed char mask, uc hev,
 
 
     /* only apply wider filter if not high edge variance */
-    vp8_filter &= ~hev;
-    Filter2 = vp8_filter;
+    filter_value &= ~hev;
+    Filter2 = filter_value;
 
     /* roughly 3/7th difference across boundary */
     u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
@@ -291,24 +291,24 @@ static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
 
 static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
 {
-    signed char vp8_filter, Filter1, Filter2;
+    signed char filter_value, Filter1, Filter2;
     signed char p1 = (signed char) * op1 ^ 0x80;
     signed char p0 = (signed char) * op0 ^ 0x80;
     signed char q0 = (signed char) * oq0 ^ 0x80;
     signed char q1 = (signed char) * oq1 ^ 0x80;
     signed char u;
 
-    vp8_filter = vp8_signed_char_clamp(p1 - q1);
-    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));
-    vp8_filter &= mask;
+    filter_value = vp8_signed_char_clamp(p1 - q1);
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (q0 - p0));
+    filter_value &= mask;
 
     /* save bottom 3 bits so that we round one side +4 and the other +3 */
-    Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+    Filter1 = vp8_signed_char_clamp(filter_value + 4);
     Filter1 >>= 3;
     u = vp8_signed_char_clamp(q0 - Filter1);
     *oq0  = u ^ 0x80;
 
-    Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+    Filter2 = vp8_signed_char_clamp(filter_value + 3);
     Filter2 >>= 3;
     u = vp8_signed_char_clamp(p0 + Filter2);
     *op0 = u ^ 0x80;
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c
index a85121579..ec51ffe40 100644
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -36,7 +36,6 @@ void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
     case DC_PRED:
     {
         int expected_dc;
-        int i;
         int shift;
         int average = 0;
 
@@ -168,7 +167,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
     {
         int expected_udc;
         int expected_vdc;
-        int i;
         int shift;
         int Uaverage = 0;
         int Vaverage = 0;
@@ -217,8 +215,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
     break;
     case V_PRED:
     {
-        int i;
-
         for (i = 0; i < 8; i++)
         {
             vpx_memcpy(upred_ptr, uabove_row, 8);
@@ -231,8 +227,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
     break;
     case H_PRED:
     {
-        int i;
-
         for (i = 0; i < 8; i++)
         {
             vpx_memset(upred_ptr, uleft_col[i], 8);
@@ -245,8 +239,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
     break;
     case TM_PRED:
     {
-        int i;
-
         for (i = 0; i < 8; i++)
         {
             for (j = 0; j < 8; j++)
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index 4eb96b743..ee892ded2 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -444,8 +444,9 @@ vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6
 # Quantizer
 #
 prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
-specialize vp8_regular_quantize_b sse2 sse4_1
-vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
+specialize vp8_regular_quantize_b sse2 #sse4_1
+# TODO(johann) Update sse4 implementation and re-enable
+#vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
 
 prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
 specialize vp8_fast_quantize_b sse2 ssse3 media neon
diff --git a/vp8/common/asm_com_offsets.c b/vp8/common/vp8_asm_com_offsets.c
index 7bab90f82..7bab90f82 100644
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/vp8_asm_com_offsets.c
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index 966c586e4..5cf110b53 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -61,7 +61,7 @@ sym(vp8_mbpost_proc_down_mmx):
             mov         rcx,        8
 .init_borderd                                                    ; initialize borders
             lea         rdi,        [rdi + rax]
-            movq        [rdi],      xmm1
+            movq        [rdi],      mm1
 
             dec         rcx
             jne         .init_borderd
@@ -193,7 +193,6 @@ sym(vp8_mbpost_proc_down_mmx):
             movq        mm4,        [sym(vp8_rv) + rcx*2]
 %endif
             paddw       mm1,        mm4
-            ;paddw     xmm1,       eight8s
             psraw       mm1,        4
 
             packuswb    mm1,        mm0
diff --git a/vp8/decoder/dboolhuff.c b/vp8/decoder/dboolhuff.c
index 7e7b05aa6..aa7a56a02 100644
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@@ -10,18 +10,20 @@
 
 
 #include "dboolhuff.h"
-#include "vpx_ports/mem.h"
-#include "vpx_mem/vpx_mem.h"
 
 int vp8dx_start_decode(BOOL_DECODER *br,
                        const unsigned char *source,
-                       unsigned int source_sz)
+                       unsigned int source_sz,
+                       const unsigned char *origin,
+                       const unsigned char *key)
 {
     br->user_buffer_end = source+source_sz;
     br->user_buffer     = source;
     br->value    = 0;
     br->count    = -8;
     br->range    = 255;
+    br->origin = origin;
+    br->key = key;
 
     if (source_sz && !source)
         return 1;
@@ -32,19 +34,34 @@ int vp8dx_start_decode(BOOL_DECODER *br,
     return 0;
 }
 
-
 void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
 {
-    const unsigned char *bufptr;
-    const unsigned char *bufend;
-    VP8_BD_VALUE         value;
-    int                  count;
-    bufend = br->user_buffer_end;
-    bufptr = br->user_buffer;
-    value = br->value;
-    count = br->count;
-
-    VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+    const unsigned char *bufptr = br->user_buffer;
+    const unsigned char *bufend = br->user_buffer_end;
+    VP8_BD_VALUE value = br->value;
+    int count = br->count;
+    int shift = VP8_BD_VALUE_SIZE - 8 - (count + 8);
+    size_t bits_left = (bufend - bufptr)*CHAR_BIT;
+    int x = (int)(shift + CHAR_BIT - bits_left);
+    int loop_end = 0;
+
+    if(x >= 0)
+    {
+        count += VP8_LOTS_OF_BITS;
+        loop_end = x;
+    }
+
+    if (x < 0 || bits_left)
+    {
+        while(shift >= loop_end)
+        {
+            count += CHAR_BIT;
+            value |= ((VP8_BD_VALUE)decrypt_byte(bufptr, br->origin,
+                                                 br->key)) << shift;
+            ++bufptr;
+            shift -= CHAR_BIT;
+        }
+    }
 
     br->user_buffer = bufptr;
     br->value = value;
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index 1a08c057b..46a4dd60e 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -9,21 +9,36 @@
  */
 
 
-#ifndef DBOOLHUFF_H
-#define DBOOLHUFF_H
+#ifndef DBOOLHUFF_H_
+#define DBOOLHUFF_H_
+
 #include <stddef.h>
 #include <limits.h>
+
 #include "vpx_config.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
 
 typedef size_t VP8_BD_VALUE;
 
-# define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+#define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+
 /*This is meant to be a large, positive constant that can still be efficiently
    loaded as an immediate (on platforms like ARM, for example).
   Even relatively modest values like 100 would work fine.*/
-# define VP8_LOTS_OF_BITS (0x40000000)
+#define VP8_LOTS_OF_BITS (0x40000000)
+
+static unsigned char decrypt_byte(const unsigned char *ch,
+                                  const unsigned char *origin,
+                                  const unsigned char *key)
+{
+#if CONFIG_DECRYPT
+    const int offset = (int)(ch - origin);  /* byte position from origin */
+    return *ch ^ key[offset % 32];  /* 32: presumably VP8_DECRYPT_KEY_SIZE */
+#else
+    return *ch;  /* decryption compiled out: byte is returned unchanged */
+#endif
+}
 
 typedef struct
 {
@@ -32,46 +47,20 @@ typedef struct
     VP8_BD_VALUE         value;
     int                  count;
     unsigned int         range;
+    const unsigned char *origin;
+    const unsigned char *key;
 } BOOL_DECODER;
 
 DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
 
 int vp8dx_start_decode(BOOL_DECODER *br,
                        const unsigned char *source,
-                       unsigned int source_sz);
+                       unsigned int source_sz,
+                       const unsigned char *origin,
+                       const unsigned char *key);
 
 void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
 
-/*The refill loop is used in several places, so define it in a macro to make
-   sure they're all consistent.
-  An inline function would be cleaner, but has a significant penalty, because
-   multiple BOOL_DECODER fields must be modified, and the compiler is not smart
-   enough to eliminate the stores to those fields and the subsequent reloads
-   from them when inlining the function.*/
-#define VP8DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
-    do \
-    { \
-        int shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); \
-        int loop_end, x; \
-        size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
-        \
-        x = (int)(shift + CHAR_BIT - bits_left); \
-        loop_end = 0; \
-        if(x >= 0) \
-        { \
-            (_count) += VP8_LOTS_OF_BITS; \
-            loop_end = x; \
-            if(!bits_left) break; \
-        } \
-        while(shift >= loop_end) \
-        { \
-            (_count) += CHAR_BIT; \
-            (_value) |= (VP8_BD_VALUE)*(_bufptr)++ << shift; \
-            shift -= CHAR_BIT; \
-        } \
-    } \
-    while(0) \
-
 
 static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
     unsigned int bit = 0;
@@ -151,4 +140,5 @@ static int vp8dx_bool_error(BOOL_DECODER *br)
     /* No error. */
     return 0;
 }
-#endif
+
+#endif  // DBOOLHUFF_H_
diff --git a/vp8/decoder/decodemv.h b/vp8/decoder/decodemv.h
index 940342447..05a33d27f 100644
--- a/vp8/decoder/decodemv.h
+++ b/vp8/decoder/decodemv.h
@@ -8,7 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef DECODEMV_H_
+#define DECODEMV_H_
 
 #include "onyxd_int.h"
 
 void vp8_decode_mode_mvs(VP8D_COMP *);
+
+#endif  // DECODEMV_H_
diff --git a/vp8/decoder/decoderthreading.h b/vp8/decoder/decoderthreading.h
index 60c39d1e1..bc716e489 100644
--- a/vp8/decoder/decoderthreading.h
+++ b/vp8/decoder/decoderthreading.h
@@ -8,19 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-
-
-
-#ifndef _DECODER_THREADING_H
-#define _DECODER_THREADING_H
+#ifndef DECODERTHREADING_H_
+#define DECODERTHREADING_H_
 
 #if CONFIG_MULTITHREAD
-extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
-extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
-extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
-extern void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
-extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
+void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
+void vp8_decoder_remove_threads(VP8D_COMP *pbi);
+void vp8_decoder_create_threads(VP8D_COMP *pbi);
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
 #endif
 
-#endif
+#endif  // DECODERTHREADING_H_
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 6f8282a64..7060005a9 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -893,7 +893,9 @@ static void setup_token_decoder(VP8D_COMP *pbi,
     {
         if (vp8dx_start_decode(bool_decoder,
                                pbi->fragments.ptrs[partition_idx],
-                               pbi->fragments.sizes[partition_idx]))
+                               pbi->fragments.sizes[partition_idx],
+                               pbi->fragments.ptrs[0],
+                               pbi->decrypt_key))
             vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate bool decoder %d",
                                partition_idx);
@@ -980,10 +982,11 @@ static void init_frame(VP8D_COMP *pbi)
 
 int vp8_decode_frame(VP8D_COMP *pbi)
 {
-    vp8_reader *const bc = & pbi->mbc[8];
-    VP8_COMMON *const pc = & pbi->common;
-    MACROBLOCKD *const xd  = & pbi->mb;
+    vp8_reader *const bc = &pbi->mbc[8];
+    VP8_COMMON *const pc = &pbi->common;
+    MACROBLOCKD *const xd  = &pbi->mb;
     const unsigned char *data = pbi->fragments.ptrs[0];
+    const unsigned char *const origin = data;
     const unsigned char *data_end =  data + pbi->fragments.sizes[0];
     ptrdiff_t first_partition_length_in_bytes;
 
@@ -1016,13 +1019,21 @@ int vp8_decode_frame(VP8D_COMP *pbi)
     }
     else
     {
-        pc->frame_type = (FRAME_TYPE)(data[0] & 1);
-        pc->version = (data[0] >> 1) & 7;
-        pc->show_frame = (data[0] >> 4) & 1;
+        const unsigned char data0 = decrypt_byte(data + 0, origin,
+                                                 pbi->decrypt_key);
+        const unsigned char data1 = decrypt_byte(data + 1, origin,
+                                                 pbi->decrypt_key);
+        const unsigned char data2 = decrypt_byte(data + 2, origin,
+                                                 pbi->decrypt_key);
+
+        pc->frame_type = (FRAME_TYPE)(data0 & 1);
+        pc->version = (data0 >> 1) & 7;
+        pc->show_frame = (data0 >> 4) & 1;
         first_partition_length_in_bytes =
-            (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
+            (data0 | (data1 << 8) | (data2 << 16)) >> 5;
 
-        if (!pbi->ec_active && (data + first_partition_length_in_bytes > data_end
+        if (!pbi->ec_active &&
+            (data + first_partition_length_in_bytes > data_end
             || data + first_partition_length_in_bytes < data))
             vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                                "Truncated packet or corrupt partition 0 length");
@@ -1040,7 +1051,13 @@ int vp8_decode_frame(VP8D_COMP *pbi)
              */
             if (!pbi->ec_active || data + 3 < data_end)
             {
-                if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
+                const unsigned char data0 = decrypt_byte(data + 0, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data1 = decrypt_byte(data + 1, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data2 = decrypt_byte(data + 2, origin,
+                                                         pbi->decrypt_key);
+                if (data0 != 0x9d || data1 != 0x01 || data2 != 0x2a)
                     vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
                                    "Invalid frame sync code");
             }
@@ -1051,10 +1068,19 @@ int vp8_decode_frame(VP8D_COMP *pbi)
              */
             if (!pbi->ec_active || data + 6 < data_end)
             {
-                pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
-                pc->horiz_scale = data[4] >> 6;
-                pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
-                pc->vert_scale = data[6] >> 6;
+                const unsigned char data3 = decrypt_byte(data + 3, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data4 = decrypt_byte(data + 4, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data5 = decrypt_byte(data + 5, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data6 = decrypt_byte(data + 6, origin,
+                                                         pbi->decrypt_key);
+
+                pc->Width = (data3 | (data4 << 8)) & 0x3fff;
+                pc->horiz_scale = data4 >> 6;
+                pc->Height = (data5 | (data6 << 8)) & 0x3fff;
+                pc->vert_scale = data6 >> 6;
             }
             data += 7;
 
@@ -1072,7 +1098,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)
 
     init_frame(pbi);
 
-    if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data)))
+    if (vp8dx_start_decode(bc,
+                           data,
+                           (unsigned int)(data_end - data),
+                           pbi->fragments.ptrs[0],
+                           pbi->decrypt_key))
         vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate bool decoder 0");
     if (pc->frame_type == KEY_FRAME) {
diff --git a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h
index 8640bda4c..f2130b361 100644
--- a/vp8/decoder/detokenize.h
+++ b/vp8/decoder/detokenize.h
@@ -8,13 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#ifndef DETOKENIZE_H
-#define DETOKENIZE_H
+#ifndef DETOKENIZE_H_
+#define DETOKENIZE_H_
 
 #include "onyxd_int.h"
 
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
 int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
 
-#endif /* DETOKENIZE_H */
+#endif  // DETOKENIZE_H_
diff --git a/vp8/decoder/ec_types.h b/vp8/decoder/ec_types.h
index ccb5ddbb9..b24bfd943 100644
--- a/vp8/decoder/ec_types.h
+++ b/vp8/decoder/ec_types.h
@@ -14,7 +14,6 @@
 #define MAX_OVERLAPS 16
 
 
-
 /* The area (pixel area in Q6) the block pointed to by bmi overlaps
  * another block with.
  */
@@ -48,4 +47,4 @@ typedef struct
     MV_REFERENCE_FRAME ref_frame;
 } EC_BLOCK;
 
-#endif /* VP8_DEC_EC_TYPES_H */
+#endif  // VP8_DEC_EC_TYPES_H
diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c
index 8b2e32be6..0b58c98fd 100644
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c
@@ -8,14 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+
 #include "error_concealment.h"
 #include "onyxd_int.h"
 #include "decodemv.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/findnearmv.h"
 
-#include <assert.h>
-
 #define MIN(x,y) (((x)<(y))?(x):(y))
 #define MAX(x,y) (((x)>(y))?(x):(y))
 
diff --git a/vp8/decoder/error_concealment.h b/vp8/decoder/error_concealment.h
index 65ae9d9be..fb96b3605 100644
--- a/vp8/decoder/error_concealment.h
+++ b/vp8/decoder/error_concealment.h
@@ -9,8 +9,8 @@
  */
 
 
-#ifndef ERROR_CONCEALMENT_H
-#define ERROR_CONCEALMENT_H
+#ifndef ERROR_CONCEALMENT_H_
+#define ERROR_CONCEALMENT_H_
 
 #include "onyxd_int.h"
 #include "ec_types.h"
@@ -38,4 +38,4 @@ void vp8_interpolate_motion(MACROBLOCKD *mb,
  */
 void vp8_conceal_corrupt_mb(MACROBLOCKD *xd);
 
-#endif
+#endif  // ERROR_CONCEALMENT_H_
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index fb2dde852..c2325ebef 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -9,8 +9,9 @@
  */
 
 
-#ifndef __INC_VP8D_INT_H
-#define __INC_VP8D_INT_H
+#ifndef ONYXD_INT_H_
+#define ONYXD_INT_H_
+
 #include "vpx_config.h"
 #include "vp8/common/onyxd.h"
 #include "treereader.h"
@@ -121,6 +122,7 @@ typedef struct VP8D_COMP
     int independent_partitions;
     int frame_corrupt_residual;
 
+    const unsigned char *decrypt_key;
 } VP8D_COMP;
 
 int vp8_decode_frame(VP8D_COMP *cpi);
@@ -145,4 +147,4 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb);
     } while(0)
 #endif
 
-#endif
+#endif  // ONYXD_INT_H_
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index b18cb5065..73f9a8356 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -36,7 +36,7 @@
 } while (0)
 
 
-extern void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
 
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
diff --git a/vp8/decoder/treereader.h b/vp8/decoder/treereader.h
index 238ff8536..9393bb478 100644
--- a/vp8/decoder/treereader.h
+++ b/vp8/decoder/treereader.h
@@ -9,18 +9,17 @@
  */
 
 
-#ifndef tree_reader_h
-#define tree_reader_h 1
+#ifndef TREEREADER_H_
+#define TREEREADER_H_
 
 #include "vp8/common/treecoder.h"
-
 #include "dboolhuff.h"
 
 typedef BOOL_DECODER vp8_reader;
 
 #define vp8_read vp8dx_decode_bool
 #define vp8_read_literal vp8_decode_value
-#define vp8_read_bit( R) vp8_read( R, vp8_prob_half)
+#define vp8_read_bit(R) vp8_read(R, vp8_prob_half)
 
 
 /* Intent of tree data structure is to make decoding trivial. */
@@ -38,4 +37,4 @@ static int vp8_treed_read(
     return -i;
 }
 
-#endif /* tree_reader_h */
+#endif  // TREEREADER_H_
diff --git a/vp8/decoder/asm_dec_offsets.c b/vp8/decoder/vp8_asm_dec_offsets.c
index 842a0d574..842a0d574 100644
--- a/vp8/decoder/asm_dec_offsets.c
+++ b/vp8/decoder/vp8_asm_dec_offsets.c
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 383196904..ca680f9a5 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -980,6 +980,12 @@ void vp8_calc_ref_frame_costs(int *ref_frame_cost,
                               int prob_garf
                              )
 {
+    assert(prob_intra >= 0);
+    assert(prob_intra <= 255);
+    assert(prob_last >= 0);
+    assert(prob_last <= 255);
+    assert(prob_garf >= 0);
+    assert(prob_garf <= 255);
     ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(prob_intra);
     ref_frame_cost[LAST_FRAME]    = vp8_cost_one(prob_intra)
                                     + vp8_cost_zero(prob_last);
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index a30f88816..cf74c7aaf 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -37,7 +37,7 @@ typedef struct block
     /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
     short *quant;
     short *quant_fast;
-    unsigned char *quant_shift;
+    short *quant_shift;
     short *zbin;
     short *zrun_zbin_boost;
     short *round;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 124b1cb35..916137b49 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -641,7 +641,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
     for (i = 0; i < MAX_MODES; i ++)
     {
         cpi->mode_check_freq[i] = 0;
-        cpi->mode_chosen_counts[i] = 0;
     }
 
     cpi->mb.mbs_tested_so_far = 0;
@@ -2816,6 +2815,8 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
         if (cpi->common.refresh_alt_ref_frame)
         {
             cpi->prob_intra_coded += 40;
+            if (cpi->prob_intra_coded > 255)
+                cpi->prob_intra_coded = 255;
             cpi->prob_last_coded = 200;
             cpi->prob_gf_coded = 1;
         }
@@ -4598,9 +4599,6 @@ static void encode_frame_to_data_rate
                         cm->frame_type, cm->refresh_golden_frame,
                         cm->refresh_alt_ref_frame);
 
-            for (i = 0; i < MAX_MODES; i++)
-                fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
             fprintf(fmodes, "\n");
 
             fclose(fmodes);
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 378731d0a..c79531c5d 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -282,17 +282,17 @@ typedef struct VP8_COMP
 {
 
     DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
 
@@ -349,7 +349,6 @@ typedef struct VP8_COMP
     int ambient_err;
 
     unsigned int mode_check_freq[MAX_MODES];
-    unsigned int mode_chosen_counts[MAX_MODES];
 
     int rd_baseline_thresh[MAX_MODES];
 
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 33c8ef055..4e2fef793 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -50,8 +50,8 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
         if (x >= zbin)
         {
             x += round_ptr[rc];
-            y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];             /* quantize (x) */
+            y  = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;       /* quantize (x) */
             x  = (y ^ sz) - sz;                      /* get the sign back */
             qcoeff_ptr[rc] = x;                      /* write to destination */
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
@@ -113,7 +113,7 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
     short *zbin_ptr        = b->zbin;
     short *round_ptr       = b->round;
     short *quant_ptr       = b->quant;
-    unsigned char *quant_shift_ptr = b->quant_shift;
+    short *quant_shift_ptr = b->quant_shift;
     short *qcoeff_ptr      = d->qcoeff;
     short *dqcoeff_ptr     = d->dqcoeff;
     short *dequant_ptr     = d->dequant;
@@ -138,8 +138,8 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
         if (x >= zbin)
         {
             x += round_ptr[rc];
-            y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];             /* quantize (x) */
+            y  = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;       /* quantize (x) */
             x  = (y ^ sz) - sz;                      /* get the sign back */
             qcoeff_ptr[rc]  = x;                     /* write to destination */
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
@@ -167,7 +167,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
     int sz;
     short *coeff_ptr;
     short *quant_ptr;
-    unsigned char *quant_shift_ptr;
+    short *quant_shift_ptr;
     short *qcoeff_ptr;
     short *dqcoeff_ptr;
     short *dequant_ptr;
@@ -198,7 +198,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
         if (x >= dq)
         {
             /* Quantize x. */
-            y  = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+            y  = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16;
             /* Put the sign back. */
             x = (y + sz) ^ sz;
             /* Save the coefficient and its dequantized value. */
@@ -406,7 +406,7 @@ static const int qzbin_factors_y2[129] =
 #define EXACT_QUANT
 #ifdef EXACT_QUANT
 static void invert_quant(int improved_quant, short *quant,
-                               unsigned char *shift, short d)
+                         short *shift, short d)
 {
     if(improved_quant)
     {
@@ -418,11 +418,15 @@ static void invert_quant(int improved_quant, short *quant,
         t = 1 + (1<<(16+l))/d;
         *quant = (short)(t - (1<<16));
         *shift = l;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
     }
     else
     {
         *quant = (1 << 16) / d;
         *shift = 0;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
     }
 }
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 3d60bebda..9080c627c 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2512,9 +2512,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                 x->rd_thresh_mult[best_mode_index];
     }
 
-    /* Note how often each mode chosen as best */
-    cpi->mode_chosen_counts[best_mode_index] ++;
-
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/vp8_asm_enc_offsets.c
index a4169b32f..a4169b32f 100644
--- a/vp8/encoder/asm_enc_offsets.c
+++ b/vp8/encoder/vp8_asm_enc_offsets.c
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
deleted file mode 100644
index b41768ce0..000000000
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ /dev/null
@@ -1,245 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse2) PRIVATE
-sym(vp8_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp8_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp8_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
diff --git a/vp8/encoder/x86/quantize_sse2_intrinsics.c b/vp8/encoder/x86/quantize_sse2_intrinsics.c
index 55d57ad62..f495bf287 100644
--- a/vp8/encoder/x86/quantize_sse2_intrinsics.c
+++ b/vp8/encoder/x86/quantize_sse2_intrinsics.c
@@ -9,13 +9,139 @@
  */
 
 
-#include "vp8/common/blockd.h"
-#include "vp8/common/entropy.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vp8/encoder/block.h"
-
-#include <mmintrin.h> //MMX
-#include <xmmintrin.h> //SSE
-#include <emmintrin.h> //SSE2
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#include <mmintrin.h> /* MMX */
+#include <xmmintrin.h> /* SSE */
+#include <emmintrin.h> /* SSE2 */
+
+#define SELECT_EOB(i, z) \
+    do { \
+        short boost = *zbin_boost_ptr; \
+        int cmp = (x[z] < boost) | (y[z] == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            goto select_eob_end_##i; \
+        qcoeff_ptr[z] = y[z]; \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+        select_eob_end_##i:; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+    char eob = 0;
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+    short *qcoeff_ptr      = d->qcoeff;
+    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
+    DECLARE_ALIGNED_ARRAY(16, short, y, 16);
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+    vpx_memset(qcoeff_ptr, 0, 32);
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
+    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    _mm_store_si128((__m128i *)(y), y0);
+    _mm_store_si128((__m128i *)(y + 8), y1);
+
+    zbin_boost_ptr = b->zrun_zbin_boost;
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
+    SELECT_EOB(1, 0);
+    SELECT_EOB(2, 1);
+    SELECT_EOB(3, 4);
+    SELECT_EOB(4, 8);
+    SELECT_EOB(5, 5);
+    SELECT_EOB(6, 2);
+    SELECT_EOB(7, 3);
+    SELECT_EOB(8, 6);
+    SELECT_EOB(9, 9);
+    SELECT_EOB(10, 12);
+    SELECT_EOB(11, 13);
+    SELECT_EOB(12, 10);
+    SELECT_EOB(13, 7);
+    SELECT_EOB(14, 11);
+    SELECT_EOB(15, 14);
+    SELECT_EOB(16, 15);
+
+    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
+    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));
+
+    /* dqcoeff = qcoeff * dequant */
+    y0 = _mm_mullo_epi16(y0, dequant0);
+    y1 = _mm_mullo_epi16(y1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
+
+    *d->eob = eob;
+}
 
 void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 2f73420a5..cde2651b5 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -14,7 +14,6 @@ VP8_COMMON_SRCS-yes += common/ppflags.h
 VP8_COMMON_SRCS-yes += common/onyx.h
 VP8_COMMON_SRCS-yes += common/onyxd.h
 VP8_COMMON_SRCS-yes += common/alloccommon.c
-VP8_COMMON_SRCS-yes += common/asm_com_offsets.c
 VP8_COMMON_SRCS-yes += common/blockd.c
 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
 VP8_COMMON_SRCS-yes += common/debugmodes.c
@@ -67,6 +66,7 @@ VP8_COMMON_SRCS-yes += common/setupintrarecon.c
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
 VP8_COMMON_SRCS-yes += common/variance_c.c
 VP8_COMMON_SRCS-yes += common/variance.h
+VP8_COMMON_SRCS-yes += common/vp8_asm_com_offsets.c
 VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
 
 
@@ -193,6 +193,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16_neon$
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
 
 $(eval $(call asm_offsets_template,\
-         vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/asm_com_offsets.c))
+         vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/vp8_asm_com_offsets.c))
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 0b58b0eaa..b985cb1b7 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -1238,7 +1238,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
         {1, 30},            /* g_timebase */
 
         0,                  /* g_error_resilient */
-        0,                  /* g_frame_parallel_decoding */
 
         VPX_RC_ONE_PASS,    /* g_pass */
 
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 1db61f161..f3834b063 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -29,6 +29,8 @@
 #define VP8_CAP_ERROR_CONCEALMENT (CONFIG_ERROR_CONCEALMENT ? \
                                     VPX_CODEC_CAP_ERROR_CONCEALMENT : 0)
 
+#define VP8_DECRYPT_KEY_SIZE 32
+
 typedef vpx_codec_stream_info_t  vp8_stream_info_t;
 
 /* Structures for handling memory allocations */
@@ -73,6 +75,7 @@ struct vpx_codec_alg_priv
     int                     dbg_color_b_modes_flag;
     int                     dbg_display_mv_flag;
 #endif
+    unsigned char           decrypt_key[VP8_DECRYPT_KEY_SIZE];
     vpx_image_t             img;
     int                     img_setup;
     struct frame_buffers    yv12_frame_buffers;
@@ -150,6 +153,8 @@ static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
     return res;
 }
 
+static const unsigned char fake_decrypt_key[VP8_DECRYPT_KEY_SIZE] = { 0 };
+
 static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
 {
     int i;
@@ -164,6 +169,8 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
 
     ctx->priv->alg_priv->mmaps[0] = *mmap;
     ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+    memcpy(ctx->priv->alg_priv->decrypt_key, fake_decrypt_key,
+           VP8_DECRYPT_KEY_SIZE);
     ctx->priv->init_flags = ctx->init_flags;
 
     if (ctx->config.dec)
@@ -211,21 +218,19 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
         mmap.flags = vp8_mem_req_segs[0].flags;
 
         res = vp8_mmap_alloc(&mmap);
+        if (res != VPX_CODEC_OK) return res;
 
-        if (!res)
-        {
-            vp8_init_ctx(ctx, &mmap);
+        vp8_init_ctx(ctx, &mmap);
 
-            /* initialize number of fragments to zero */
-            ctx->priv->alg_priv->fragments.count = 0;
-            /* is input fragments enabled? */
-            ctx->priv->alg_priv->fragments.enabled =
-                    (ctx->priv->alg_priv->base.init_flags &
-                        VPX_CODEC_USE_INPUT_FRAGMENTS);
+        /* initialize number of fragments to zero */
+        ctx->priv->alg_priv->fragments.count = 0;
+        /* is input fragments enabled? */
+        ctx->priv->alg_priv->fragments.enabled =
+                (ctx->priv->alg_priv->base.init_flags &
+                    VPX_CODEC_USE_INPUT_FRAGMENTS);
 
-            ctx->priv->alg_priv->defer_alloc = 1;
-            /*post processing level initialized to do nothing */
-        }
+        ctx->priv->alg_priv->defer_alloc = 1;
+        /*post processing level initialized to do nothing */
     }
 
     ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads =
@@ -264,14 +269,17 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx)
     return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
-                                   unsigned int           data_sz,
-                                   vpx_codec_stream_info_t *si)
+static vpx_codec_err_t vp8_peek_si_external(const uint8_t         *data,
+                                            unsigned int           data_sz,
+                                            vpx_codec_stream_info_t *si,
+                                            const unsigned char *decrypt_key)
 {
     vpx_codec_err_t res = VPX_CODEC_OK;
 
     if(data + data_sz <= data)
+    {
         res = VPX_CODEC_INVALID_PARAM;
+    }
     else
     {
         /* Parse uncompresssed part of key frame header.
@@ -280,30 +288,45 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
          * 4 bytes:- including image width and height in the lowest 14 bits
          *           of each 2-byte value.
          */
-        si->is_kf = 0;
 
-        if (data_sz >= 10 && !(data[0] & 0x01))  /* I-Frame */
+        const uint8_t data0 = decrypt_byte(data, data, decrypt_key);
+        si->is_kf = 0;
+        if (data_sz >= 10 && !(data0 & 0x01))  /* I-Frame */
         {
-            const uint8_t *c = data + 3;
+            const uint8_t data3 = decrypt_byte(data + 3, data, decrypt_key);
+            const uint8_t data4 = decrypt_byte(data + 4, data, decrypt_key);
+            const uint8_t data5 = decrypt_byte(data + 5, data, decrypt_key);
+            const uint8_t data6 = decrypt_byte(data + 6, data, decrypt_key);
+            const uint8_t data7 = decrypt_byte(data + 7, data, decrypt_key);
+            const uint8_t data8 = decrypt_byte(data + 8, data, decrypt_key);
+            const uint8_t data9 = decrypt_byte(data + 9, data, decrypt_key);
+
             si->is_kf = 1;
 
             /* vet via sync code */
-            if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
+            if (data3 != 0x9d || data4 != 0x01 || data5 != 0x2a)
                 res = VPX_CODEC_UNSUP_BITSTREAM;
 
-            si->w = (c[3] | (c[4] << 8)) & 0x3fff;
-            si->h = (c[5] | (c[6] << 8)) & 0x3fff;
+            si->w = (data6 | (data7 << 8)) & 0x3fff;
+            si->h = (data8 | (data9 << 8)) & 0x3fff;
 
             /*printf("w=%d, h=%d\n", si->w, si->h);*/
             if (!(si->h | si->w))
                 res = VPX_CODEC_UNSUP_BITSTREAM;
         }
         else
+        {
             res = VPX_CODEC_UNSUP_BITSTREAM;
+        }
     }
 
     return res;
+}
 
+static vpx_codec_err_t vp8_peek_si(const uint8_t *data, unsigned int data_sz,
+                                   vpx_codec_stream_info_t *si)
+{   /* Keyless entry point: forwards with the all-zero fake_decrypt_key. */
+    return vp8_peek_si_external(data, data_sz, si, fake_decrypt_key);
 }
 
 static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
@@ -432,8 +455,10 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
     w = ctx->si.w;
     h = ctx->si.h;
 
-    res = ctx->base.iface->dec.peek_si(ctx->fragments.ptrs[0],
-                                       ctx->fragments.sizes[0], &ctx->si);
+    res = vp8_peek_si_external(ctx->fragments.ptrs[0],
+                               ctx->fragments.sizes[0],
+                               &ctx->si,
+                               ctx->decrypt_key);
 
     if((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf)
     {
@@ -507,6 +532,9 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
             }
 
             res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf);
+            /* Creation may have failed; don't dereference pbi[0] in that case. */
+            if (res == VPX_CODEC_OK)
+                ctx->yv12_frame_buffers.pbi[0]->decrypt_key = ctx->decrypt_key;
         }
 
         ctx->decoder_init = 1;
@@ -928,6 +954,20 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
 
 }
 
+/* VP8_SET_DECRYPT_KEY control: copies VP8_DECRYPT_KEY_SIZE bytes from the
+ * caller-supplied pointer into ctx->decrypt_key; NULL is rejected. */
+static vpx_codec_err_t vp8_set_decrypt_key(vpx_codec_alg_priv_t *ctx,
+                                           int ctr_id,
+                                           va_list args)
+{
+    const unsigned char *key = va_arg(args, const unsigned char *);
+    if (key == NULL)
+        return VPX_CODEC_INVALID_PARAM;
+
+    memcpy(ctx->decrypt_key, key, VP8_DECRYPT_KEY_SIZE);
+    return VPX_CODEC_OK;
+}
+
 vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
 {
     {VP8_SET_REFERENCE,             vp8_set_reference},
@@ -940,6 +980,7 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
     {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
     {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
     {VP8D_GET_LAST_REF_USED,        vp8_get_last_ref_frame},
+    {VP8_SET_DECRYPT_KEY,           vp8_set_decrypt_key},
     { -1, NULL},
 };
 
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 2a0e7c526..ca9f6a62e 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -24,7 +24,6 @@ VP8_CX_SRCS-yes += vp8cx.mk
 
 VP8_CX_SRCS-yes += vp8_cx_iface.c
 
-VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
 VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
 VP8_CX_SRCS-yes += encoder/bitstream.c
 VP8_CX_SRCS-yes += encoder/boolhuff.c
@@ -78,6 +77,7 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
 VP8_CX_SRCS-yes += encoder/temporal_filter.c
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
+VP8_CX_SRCS-yes += encoder/vp8_asm_enc_offsets.c
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
@@ -90,7 +90,6 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2_intrinsics.c
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 
 # TODO(johann) make this generic
 ifeq ($(HAVE_SSE2),yes)
@@ -122,4 +121,4 @@ endif
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
 
 $(eval $(call asm_offsets_template,\
-         vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/asm_enc_offsets.c))
+         vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/vp8_asm_enc_offsets.c))
diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk
index 8be4c7ba5..c26f42d58 100644
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -20,7 +20,6 @@ VP8_DX_SRCS-yes += vp8dx.mk
 
 VP8_DX_SRCS-yes += vp8_dx_iface.c
 
-VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
 VP8_DX_SRCS-yes += decoder/dboolhuff.c
 VP8_DX_SRCS-yes += decoder/decodemv.c
 VP8_DX_SRCS-yes += decoder/decodframe.c
@@ -36,8 +35,9 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h
 VP8_DX_SRCS-yes += decoder/treereader.h
 VP8_DX_SRCS-yes += decoder/onyxd_if.c
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
+VP8_DX_SRCS-yes += decoder/vp8_asm_dec_offsets.c
 
 VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
 
 $(eval $(call asm_offsets_template,\
-         vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/asm_dec_offsets.c))
+         vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/vp8_asm_dec_offsets.c))
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 8de68505a..8b6efc384 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -361,7 +361,7 @@ specialize vp9_short_idct1_16x16
 
 
 prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct32x32
+specialize vp9_short_idct32x32 sse2
 
 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 859e211bd..44d317293 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -51,7 +51,7 @@ int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
 }
 
 int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
-  return (segfeaturedata_signed[feature_id]);
+  return segfeaturedata_signed[feature_id];
 }
 
 void vp9_clear_segdata(MACROBLOCKD *xd,
diff --git a/vp9/common/x86/vp9_idct_x86.c b/vp9/common/x86/vp9_idct_x86.c
index 1a2c84a40..811ed9899 100644
--- a/vp9/common/x86/vp9_idct_x86.c
+++ b/vp9/common/x86/vp9_idct_x86.c
@@ -298,129 +298,110 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
     in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \
   }
 
-#define IDCT8x8_1D                                             \
-  /* Stage1 */                                                 \
-  {                                                            \
-    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);        \
-    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);        \
-    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);        \
-    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);        \
-                                                               \
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);                      \
-    tmp1 = _mm_madd_epi16(hi_17, stg1_0);                      \
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);                      \
-    tmp3 = _mm_madd_epi16(hi_17, stg1_1);                      \
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);                      \
-    tmp5 = _mm_madd_epi16(hi_35, stg1_2);                      \
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);                      \
-    tmp7 = _mm_madd_epi16(hi_35, stg1_3);                      \
-                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                      \
-                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);               \
-                                                               \
-    stp1_4 = _mm_packs_epi32(tmp0, tmp1);                      \
-    stp1_7 = _mm_packs_epi32(tmp2, tmp3);                      \
-    stp1_5 = _mm_packs_epi32(tmp4, tmp5);                      \
-    stp1_6 = _mm_packs_epi32(tmp6, tmp7);                      \
-  }                                                            \
-                                                               \
-  /* Stage2 */                                                 \
-  {                                                            \
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);        \
-    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);        \
-    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);        \
-    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);        \
-                                                               \
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);                      \
-    tmp1 = _mm_madd_epi16(hi_04, stg2_0);                      \
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);                      \
-    tmp3 = _mm_madd_epi16(hi_04, stg2_1);                      \
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);                      \
-    tmp5 = _mm_madd_epi16(hi_26, stg2_2);                      \
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);                      \
-    tmp7 = _mm_madd_epi16(hi_26, stg2_3);                      \
-                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                      \
-                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);               \
-                                                               \
-    stp2_0 = _mm_packs_epi32(tmp0, tmp1);                      \
-    stp2_1 = _mm_packs_epi32(tmp2, tmp3);                      \
-    stp2_2 = _mm_packs_epi32(tmp4, tmp5);                      \
-    stp2_3 = _mm_packs_epi32(tmp6, tmp7);                      \
-                                                               \
-    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);                   \
-    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);                   \
-    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);                   \
-    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);                   \
-  }                                                            \
-                                                               \
-  /* Stage3 */                                                 \
-  {                                                            \
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);  \
-    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);  \
-                                                               \
-    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);                   \
-    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);                   \
-    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);                   \
-    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);                   \
-                                                               \
-    tmp0 = _mm_madd_epi16(lo_56, stg2_1);                      \
-    tmp1 = _mm_madd_epi16(hi_56, stg2_1);                      \
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);                      \
-    tmp3 = _mm_madd_epi16(hi_56, stg2_0);                      \
-                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                      \
-                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);               \
-                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                      \
-  }                                                            \
-                                                               \
-  /* Stage4  */                                                \
-  in0 = _mm_adds_epi16(stp1_0, stp2_7);                        \
-  in1 = _mm_adds_epi16(stp1_1, stp1_6);                        \
-  in2 = _mm_adds_epi16(stp1_2, stp1_5);                        \
-  in3 = _mm_adds_epi16(stp1_3, stp2_4);                        \
-  in4 = _mm_subs_epi16(stp1_3, stp2_4);                        \
-  in5 = _mm_subs_epi16(stp1_2, stp1_5);                        \
-  in6 = _mm_subs_epi16(stp1_1, stp1_6);                        \
+// Multiply-add input pairs by constants, then round, shift and pack to res0-3.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+  {   /* needs tmp0..tmp7 and rounding in the enclosing scope */ \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      tmp4 = _mm_madd_epi16(lo_1, cst2); \
+      tmp5 = _mm_madd_epi16(hi_1, cst2); \
+      tmp6 = _mm_madd_epi16(lo_1, cst3); \
+      tmp7 = _mm_madd_epi16(hi_1, cst3); \
+      /* add the rounding term before shifting */ \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      tmp4 = _mm_add_epi32(tmp4, rounding); \
+      tmp5 = _mm_add_epi32(tmp5, rounding); \
+      tmp6 = _mm_add_epi32(tmp6, rounding); \
+      tmp7 = _mm_add_epi32(tmp7, rounding); \
+      /* arithmetic right shift by DCT_CONST_BITS */ \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+      /* pack 32-bit lanes to 16 bits with signed saturation */ \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+      res2 = _mm_packs_epi32(tmp4, tmp5); \
+      res3 = _mm_packs_epi32(tmp6, tmp7); \
+  }
+
+#define IDCT8x8_1D  \
+  /* Stage1 (in*, stg*, stp*, tmp*, rounding: from the caller's scope) */ \
+  { \
+    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+    /* (in1,in7) and (in3,in5) -> stp1_4, stp1_7, stp1_5, stp1_6 */ \
+    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+                          stg1_1, stg1_2, stg1_3, stp1_4,      \
+                          stp1_7, stp1_5, stp1_6)              \
+  } \
+    \
+  /* Stage2 */ \
+  { \
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+    /* (in0,in4) and (in2,in6) -> stp2_0, stp2_1, stp2_2, stp2_3 */ \
+    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+                           stg2_1, stg2_2, stg2_3, stp2_0,     \
+                           stp2_1, stp2_2, stp2_3)             \
+    /* Saturating 16-bit add/sub of the Stage1 outputs. */ \
+    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+  } \
+    \
+  /* Stage3 */ \
+  { \
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+    /* stp1_5/stp1_6 are rebuilt from (stp2_6, stp2_5) with stg2_1/stg2_0. */ \
+    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  } \
+  \
+  /* Stage4: saturating add/sub; results overwrite in0..in7 */ \
+  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
+  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
+  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
+  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
+  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
+  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
+  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
   in7 = _mm_subs_epi16(stp1_0, stp2_7);
 
 void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
@@ -643,9 +624,9 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
   _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
 }
 
-#define IDCT16x16_1D                                       \
-  /* Stage2 */                                             \
-  {                                                        \
+#define IDCT16x16_1D \
+  /* Stage2 */ \
+  { \
     const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
     const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
     const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \
@@ -654,250 +635,110 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
     const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
     const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
     const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
-                                            \
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); \
-    tmp1 = _mm_madd_epi16(hi_1_15, stg2_0); \
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); \
-    tmp3 = _mm_madd_epi16(hi_1_15, stg2_1); \
-    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);  \
-    tmp5 = _mm_madd_epi16(hi_9_7, stg2_2);  \
-    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);  \
-    tmp7 = _mm_madd_epi16(hi_9_7, stg2_3);  \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                           \
-    stp2_8 = _mm_packs_epi32(tmp0, tmp1);  \
-    stp2_15 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_9 = _mm_packs_epi32(tmp4, tmp5);  \
-    stp2_14 = _mm_packs_epi32(tmp6, tmp7); \
-                                           \
-    tmp0 = _mm_madd_epi16(lo_5_11, stg2_4); \
-    tmp1 = _mm_madd_epi16(hi_5_11, stg2_4); \
-    tmp2 = _mm_madd_epi16(lo_5_11, stg2_5); \
-    tmp3 = _mm_madd_epi16(hi_5_11, stg2_5); \
-    tmp4 = _mm_madd_epi16(lo_13_3, stg2_6); \
-    tmp5 = _mm_madd_epi16(hi_13_3, stg2_6); \
-    tmp6 = _mm_madd_epi16(lo_13_3, stg2_7); \
-    tmp7 = _mm_madd_epi16(hi_13_3, stg2_7); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                           \
-    stp2_10 = _mm_packs_epi32(tmp0, tmp1); \
-    stp2_13 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_11 = _mm_packs_epi32(tmp4, tmp5); \
-    stp2_12 = _mm_packs_epi32(tmp6, tmp7); \
-  }                                        \
-                                           \
-  /* Stage3 */                             \
-  {                                        \
+    \
+    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+                           stg2_0, stg2_1, stg2_2, stg2_3, \
+                           stp2_8, stp2_15, stp2_9, stp2_14) \
+    \
+    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+                           stg2_4, stg2_5, stg2_6, stg2_7, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  } \
+    \
+  /* Stage3 */ \
+  { \
     const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
     const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
     const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
     const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
-                                            \
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); \
-    tmp1 = _mm_madd_epi16(hi_2_14, stg3_0); \
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); \
-    tmp3 = _mm_madd_epi16(hi_2_14, stg3_1); \
-    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); \
-    tmp5 = _mm_madd_epi16(hi_10_6, stg3_2); \
-    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); \
-    tmp7 = _mm_madd_epi16(hi_10_6, stg3_3); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                          \
-    stp1_4 = _mm_packs_epi32(tmp0, tmp1); \
-    stp1_7 = _mm_packs_epi32(tmp2, tmp3); \
-    stp1_5 = _mm_packs_epi32(tmp4, tmp5); \
-    stp1_6 = _mm_packs_epi32(tmp6, tmp7); \
-                                          \
+    \
+    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+                           stg3_0, stg3_1, stg3_2, stg3_3, \
+                           stp1_4, stp1_7, stp1_5, stp1_6) \
+    \
     stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
     stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
-                                               \
+    \
     stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
     stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
-  }                                            \
-                                               \
-  /* Stage4 */                                 \
-  {                                            \
+  } \
+  \
+  /* Stage4 */ \
+  { \
     const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
     const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
     const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
     const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
-                                                           \
+    \
     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-                                           \
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); \
-    tmp1 = _mm_madd_epi16(hi_0_8, stg4_0); \
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); \
-    tmp3 = _mm_madd_epi16(hi_0_8, stg4_1); \
-    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); \
-    tmp5 = _mm_madd_epi16(hi_4_12, stg4_2); \
-    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); \
-    tmp7 = _mm_madd_epi16(hi_4_12, stg4_3); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                          \
-    stp2_0 = _mm_packs_epi32(tmp0, tmp1); \
-    stp2_1 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_2 = _mm_packs_epi32(tmp4, tmp5); \
-    stp2_3 = _mm_packs_epi32(tmp6, tmp7); \
-                                          \
+    \
+    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+                           stg4_0, stg4_1, stg4_2, stg4_3, \
+                           stp2_0, stp2_1, stp2_2, stp2_3) \
+    \
     stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
     stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
-                                            \
-    tmp0 = _mm_madd_epi16(lo_9_14, stg4_4); \
-    tmp1 = _mm_madd_epi16(hi_9_14, stg4_4); \
-    tmp2 = _mm_madd_epi16(lo_9_14, stg4_5); \
-    tmp3 = _mm_madd_epi16(hi_9_14, stg4_5); \
-    tmp4 = _mm_madd_epi16(lo_10_13, stg4_6); \
-    tmp5 = _mm_madd_epi16(hi_10_13, stg4_6); \
-    tmp6 = _mm_madd_epi16(lo_10_13, stg4_7); \
-    tmp7 = _mm_madd_epi16(hi_10_13, stg4_7); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                           \
-    stp2_9 = _mm_packs_epi32(tmp0, tmp1);  \
-    stp2_14 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_10 = _mm_packs_epi32(tmp4, tmp5); \
-    stp2_13 = _mm_packs_epi32(tmp6, tmp7); \
-  }                                        \
-                                           \
-  /* Stage5 */                             \
-  {                                        \
+    \
+    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                           stg4_4, stg4_5, stg4_6, stg4_7, \
+                           stp2_9, stp2_14, stp2_10, stp2_13) \
+  } \
+    \
+  /* Stage5 */ \
+  { \
     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-                                            \
+    \
     stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
     stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
-                                           \
+    \
     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-                                          \
+    \
     tmp0 = _mm_add_epi32(tmp0, rounding); \
     tmp1 = _mm_add_epi32(tmp1, rounding); \
     tmp2 = _mm_add_epi32(tmp2, rounding); \
     tmp3 = _mm_add_epi32(tmp3, rounding); \
-                                          \
+    \
     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-                                          \
+    \
     stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
     stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-                                          \
+    \
     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
-                                                 \
+    \
     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
-  }                                              \
-                                                 \
-  /* Stage6 */                                   \
-  {                                              \
+  } \
+    \
+  /* Stage6 */ \
+  { \
     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-                                            \
+    \
     stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
     stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
     stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
@@ -906,38 +747,10 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
-                                             \
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); \
-    tmp1 = _mm_madd_epi16(hi_10_13, stg6_0); \
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); \
-    tmp3 = _mm_madd_epi16(hi_10_13, stg4_0); \
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); \
-    tmp5 = _mm_madd_epi16(hi_11_12, stg6_0); \
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); \
-    tmp7 = _mm_madd_epi16(hi_11_12, stg4_0); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                                 \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                           \
-    stp2_10 = _mm_packs_epi32(tmp0, tmp1); \
-    stp2_13 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_11 = _mm_packs_epi32(tmp4, tmp5); \
-    stp2_12 = _mm_packs_epi32(tmp6, tmp7); \
+    \
+    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                           stg6_0, stg4_0, stg6_0, stg4_0, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
 void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
@@ -1506,4 +1319,657 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
     output += 8;
   }
 }
+
+void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
+  const int half_pitch = pitch >> 1;  // output row stride in int16_t units
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);  // 32: half of 1 << 6
+
+  // idct constants, named stgN_* after the first stage that uses them
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
+          in24, in25, in26, in27, in28, in29, in30, in31;
+  __m128i col[128];  // first-pass results: 4 passes x 32 vectors
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i, j;
+
+  // We work on an 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
+  for (i = 0; i < 8; i++) {
+    if (i < 4) {
+      // First 1-D idct (passes 0-3): each pass reads 8 rows of 32 inputs.
+      // Load input data; aligned loads need a 16-byte aligned input pointer.
+      in0 = _mm_load_si128((__m128i *)input);
+      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
+      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
+      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
+      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
+      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
+      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
+      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
+      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
+      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
+      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
+      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
+      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
+      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
+      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
+      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
+
+      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
+      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
+      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
+      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
+      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
+      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
+      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
+      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
+      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
+      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
+      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
+      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
+      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
+      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
+      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
+      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
+
+      input += 256;  // 32 vectors x 8 coefficients consumed by this pass
+
+      // Transpose 32x8 block to 8x32 block
+      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                    in4, in5, in6, in7);
+      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                    in10, in11, in12, in13, in14, in15);
+      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
+                    in18, in19, in20, in21, in22, in23);
+      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
+                    in26, in27, in28, in29, in30, in31);
+    } else {
+      // Second 1-D idct (passes 4-7): reads the first-pass results in col[].
+      j = i - 4;
+
+      // Transpose 32x8 block to 8x32 block; each j += 4 moves 32 col[] entries
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
+                    in5, in6, in7);
+      j += 4;
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
+                    in11, in12, in13, in14, in15);
+      j += 4;
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
+                    in19, in20, in21, in22, in23);
+      j += 4;
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
+                    in28, in29, in30, in31);
+    }
+
+    // Stage1: odd-indexed inputs -> stp1_16..stp1_31
+    {
+      const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
+      const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
+      const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
+      const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
+
+      const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
+      const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
+      const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7);
+      const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
+
+      const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
+      const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
+      const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
+      const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
+
+      const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
+      const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
+      const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
+      const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
+
+      MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
+                             stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
+                             stp1_17, stp1_30)
+      MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
+                             stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
+                             stp1_19, stp1_28)
+      MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
+                             stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
+                             stp1_21, stp1_26)
+      MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
+                             stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
+                             stp1_23, stp1_24)
+    }
+
+    // Stage2: in2,6,...,30 -> stp2_8..15; add/sub stp1_16..31 -> stp2_16..31
+    {
+      const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
+      const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
+      const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
+      const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
+
+      const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
+      const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
+      const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
+      const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
+
+      MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
+                             stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
+                             stp2_14)
+      MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
+                             stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
+                             stp2_11, stp2_12)
+
+      stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
+      stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
+      stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
+      stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
+
+      stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
+      stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
+      stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
+      stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
+
+      stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
+      stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
+      stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
+      stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
+
+      stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
+      stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
+      stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
+      stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
+    }
+
+    // Stage3: in4,12,20,28 -> stp1_4..7; updates stp1_8..15 and stp1_16..31
+    {
+      const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
+      const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
+      const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
+      const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
+
+      const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
+      const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
+      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+
+      MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
+                             stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
+                             stp1_6)
+
+      stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
+      stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
+      stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
+      stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
+      stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
+      stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
+      stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
+      stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
+
+      MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
+                             stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
+                             stp1_18, stp1_29)
+      MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
+                             stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
+                             stp1_22, stp1_25)
+
+      stp1_16 = stp2_16;
+      stp1_31 = stp2_31;
+      stp1_19 = stp2_19;
+      stp1_20 = stp2_20;
+      stp1_23 = stp2_23;
+      stp1_24 = stp2_24;
+      stp1_27 = stp2_27;
+      stp1_28 = stp2_28;
+    }
+
+    // Stage4: in0,8,16,24 -> stp2_0..3; updates stp2_4..7, 8..15 and 16..31
+    {
+      const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
+      const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
+      const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
+      const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);
+
+      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
+      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+
+      MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
+                             stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,
+                             stp2_2, stp2_3)
+
+      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+
+      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
+                             stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
+                             stp2_10, stp2_13)
+
+      stp2_8 = stp1_8;
+      stp2_15 = stp1_15;
+      stp2_11 = stp1_11;
+      stp2_12 = stp1_12;
+
+      stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
+      stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
+      stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
+      stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
+      stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
+      stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
+      stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
+      stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
+
+      stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
+      stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
+      stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
+      stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
+      stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
+      stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
+      stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
+      stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
+    }
+
+    // Stage5: produces all of stp1_0..stp1_31 from the stp2_* values
+    {
+      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+      const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
+      const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
+      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+
+      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
+      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
+      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
+      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
+
+      tmp0 = _mm_add_epi32(tmp0, rounding);
+      tmp1 = _mm_add_epi32(tmp1, rounding);
+      tmp2 = _mm_add_epi32(tmp2, rounding);
+      tmp3 = _mm_add_epi32(tmp3, rounding);
+
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+
+      stp1_5 = _mm_packs_epi32(tmp0, tmp1);
+      stp1_6 = _mm_packs_epi32(tmp2, tmp3);
+
+      stp1_4 = stp2_4;
+      stp1_7 = stp2_7;
+
+      stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
+      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
+      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
+      stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
+      stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
+      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
+      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
+      stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
+
+      stp1_16 = stp2_16;
+      stp1_17 = stp2_17;
+
+      MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
+                             stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
+                             stp1_19, stp1_28)
+      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
+                             stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
+                             stp1_21, stp1_26)
+
+      stp1_22 = stp2_22;
+      stp1_23 = stp2_23;
+      stp1_24 = stp2_24;
+      stp1_25 = stp2_25;
+      stp1_30 = stp2_30;
+      stp1_31 = stp2_31;
+    }
+
+    // Stage6: produces all of stp2_0..stp2_31 from the stp1_* values
+    {
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
+
+      stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
+      stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
+      stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
+      stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
+      stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
+      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
+      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
+      stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
+
+      stp2_8 = stp1_8;
+      stp2_9 = stp1_9;
+      stp2_14 = stp1_14;
+      stp2_15 = stp1_15;
+
+      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,
+                             stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,
+                             stp2_13, stp2_11, stp2_12)
+
+      stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
+      stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
+      stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
+      stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
+      stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
+      stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
+      stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
+      stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
+
+      stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
+      stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
+      stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
+      stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
+      stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
+      stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
+      stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
+      stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
+    }
+
+    // Stage7: produces all of stp1_0..stp1_31 from the stp2_* values
+    {
+      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+      const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
+      const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
+
+      stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
+      stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
+      stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
+      stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
+      stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
+      stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
+      stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
+      stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
+      stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
+      stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
+      stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
+      stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
+      stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
+      stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
+      stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
+      stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
+
+      stp1_16 = stp2_16;
+      stp1_17 = stp2_17;
+      stp1_18 = stp2_18;
+      stp1_19 = stp2_19;
+
+      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
+                             stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
+                             stp1_21, stp1_26)
+      MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
+                             stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
+                             stp1_23, stp1_24)
+
+      stp1_28 = stp2_28;
+      stp1_29 = stp2_29;
+      stp1_30 = stp2_30;
+      stp1_31 = stp2_31;
+    }
+
+    // final stage: results k and 31-k are stp1_k +/- stp1_(31-k), k = 0..15
+    if (i < 4) {
+      // 1-D: Store 32 intermediate results for each 8x32 block.
+      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+    } else {
+      // 2-D: Calculate the results and store them to destination.
+      in0 = _mm_add_epi16(stp1_0, stp1_31);
+      in1 = _mm_add_epi16(stp1_1, stp1_30);
+      in2 = _mm_add_epi16(stp1_2, stp1_29);
+      in3 = _mm_add_epi16(stp1_3, stp1_28);
+      in4 = _mm_add_epi16(stp1_4, stp1_27);
+      in5 = _mm_add_epi16(stp1_5, stp1_26);
+      in6 = _mm_add_epi16(stp1_6, stp1_25);
+      in7 = _mm_add_epi16(stp1_7, stp1_24);
+      in8 = _mm_add_epi16(stp1_8, stp1_23);
+      in9 = _mm_add_epi16(stp1_9, stp1_22);
+      in10 = _mm_add_epi16(stp1_10, stp1_21);
+      in11 = _mm_add_epi16(stp1_11, stp1_20);
+      in12 = _mm_add_epi16(stp1_12, stp1_19);
+      in13 = _mm_add_epi16(stp1_13, stp1_18);
+      in14 = _mm_add_epi16(stp1_14, stp1_17);
+      in15 = _mm_add_epi16(stp1_15, stp1_16);
+      in16 = _mm_sub_epi16(stp1_15, stp1_16);
+      in17 = _mm_sub_epi16(stp1_14, stp1_17);
+      in18 = _mm_sub_epi16(stp1_13, stp1_18);
+      in19 = _mm_sub_epi16(stp1_12, stp1_19);
+      in20 = _mm_sub_epi16(stp1_11, stp1_20);
+      in21 = _mm_sub_epi16(stp1_10, stp1_21);
+      in22 = _mm_sub_epi16(stp1_9, stp1_22);
+      in23 = _mm_sub_epi16(stp1_8, stp1_23);
+      in24 = _mm_sub_epi16(stp1_7, stp1_24);
+      in25 = _mm_sub_epi16(stp1_6, stp1_25);
+      in26 = _mm_sub_epi16(stp1_5, stp1_26);
+      in27 = _mm_sub_epi16(stp1_4, stp1_27);
+      in28 = _mm_sub_epi16(stp1_3, stp1_28);
+      in29 = _mm_sub_epi16(stp1_2, stp1_29);
+      in30 = _mm_sub_epi16(stp1_1, stp1_30);
+      in31 = _mm_sub_epi16(stp1_0, stp1_31);
+
+      // Final rounding and shift: (x + 32) >> 6, using a saturating add
+      in0 = _mm_adds_epi16(in0, final_rounding);
+      in1 = _mm_adds_epi16(in1, final_rounding);
+      in2 = _mm_adds_epi16(in2, final_rounding);
+      in3 = _mm_adds_epi16(in3, final_rounding);
+      in4 = _mm_adds_epi16(in4, final_rounding);
+      in5 = _mm_adds_epi16(in5, final_rounding);
+      in6 = _mm_adds_epi16(in6, final_rounding);
+      in7 = _mm_adds_epi16(in7, final_rounding);
+      in8 = _mm_adds_epi16(in8, final_rounding);
+      in9 = _mm_adds_epi16(in9, final_rounding);
+      in10 = _mm_adds_epi16(in10, final_rounding);
+      in11 = _mm_adds_epi16(in11, final_rounding);
+      in12 = _mm_adds_epi16(in12, final_rounding);
+      in13 = _mm_adds_epi16(in13, final_rounding);
+      in14 = _mm_adds_epi16(in14, final_rounding);
+      in15 = _mm_adds_epi16(in15, final_rounding);
+      in16 = _mm_adds_epi16(in16, final_rounding);
+      in17 = _mm_adds_epi16(in17, final_rounding);
+      in18 = _mm_adds_epi16(in18, final_rounding);
+      in19 = _mm_adds_epi16(in19, final_rounding);
+      in20 = _mm_adds_epi16(in20, final_rounding);
+      in21 = _mm_adds_epi16(in21, final_rounding);
+      in22 = _mm_adds_epi16(in22, final_rounding);
+      in23 = _mm_adds_epi16(in23, final_rounding);
+      in24 = _mm_adds_epi16(in24, final_rounding);
+      in25 = _mm_adds_epi16(in25, final_rounding);
+      in26 = _mm_adds_epi16(in26, final_rounding);
+      in27 = _mm_adds_epi16(in27, final_rounding);
+      in28 = _mm_adds_epi16(in28, final_rounding);
+      in29 = _mm_adds_epi16(in29, final_rounding);
+      in30 = _mm_adds_epi16(in30, final_rounding);
+      in31 = _mm_adds_epi16(in31, final_rounding);
+
+      in0 = _mm_srai_epi16(in0, 6);
+      in1 = _mm_srai_epi16(in1, 6);
+      in2 = _mm_srai_epi16(in2, 6);
+      in3 = _mm_srai_epi16(in3, 6);
+      in4 = _mm_srai_epi16(in4, 6);
+      in5 = _mm_srai_epi16(in5, 6);
+      in6 = _mm_srai_epi16(in6, 6);
+      in7 = _mm_srai_epi16(in7, 6);
+      in8 = _mm_srai_epi16(in8, 6);
+      in9 = _mm_srai_epi16(in9, 6);
+      in10 = _mm_srai_epi16(in10, 6);
+      in11 = _mm_srai_epi16(in11, 6);
+      in12 = _mm_srai_epi16(in12, 6);
+      in13 = _mm_srai_epi16(in13, 6);
+      in14 = _mm_srai_epi16(in14, 6);
+      in15 = _mm_srai_epi16(in15, 6);
+      in16 = _mm_srai_epi16(in16, 6);
+      in17 = _mm_srai_epi16(in17, 6);
+      in18 = _mm_srai_epi16(in18, 6);
+      in19 = _mm_srai_epi16(in19, 6);
+      in20 = _mm_srai_epi16(in20, 6);
+      in21 = _mm_srai_epi16(in21, 6);
+      in22 = _mm_srai_epi16(in22, 6);
+      in23 = _mm_srai_epi16(in23, 6);
+      in24 = _mm_srai_epi16(in24, 6);
+      in25 = _mm_srai_epi16(in25, 6);
+      in26 = _mm_srai_epi16(in26, 6);
+      in27 = _mm_srai_epi16(in27, 6);
+      in28 = _mm_srai_epi16(in28, 6);
+      in29 = _mm_srai_epi16(in29, 6);
+      in30 = _mm_srai_epi16(in30, 6);
+      in31 = _mm_srai_epi16(in31, 6);
+
+      // Store results; aligned stores need 16-byte aligned output rows.
+      _mm_store_si128((__m128i *)output, in0);
+      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
+      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
+      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
+      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
+      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
+      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
+      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
+      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
+      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
+      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
+      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
+      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
+      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
+      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
+      _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
+      _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
+      _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
+      _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
+      _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
+      _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
+      _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
+      _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
+      _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
+      _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
+      _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
+      _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
+      _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
+      _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
+      _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
+      _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+
+      output += 8;  // advance 8 int16_t within each output row
+    }
+  }
+}
 #endif
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 9cb18143f..353e94fa5 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -490,7 +490,7 @@ static void read_switchable_interp_probs(VP9D_COMP* const pbi,
   int i, j;
   for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
     for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8);
+      cm->fc.switchable_interp_prob[j][i] = vp9_read_prob(bc);
     }
   }
   //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],
@@ -511,13 +511,13 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
 #if CONFIG_COMP_INTERINTRA_PRED
     if (cm->use_interintra) {
       if (vp9_read(bc, VP9_UPD_INTERINTRA_PROB))
-        cm->fc.interintra_prob  = (vp9_prob)vp9_read_literal(bc, 8);
+        cm->fc.interintra_prob = vp9_read_prob(bc);
     }
 #endif
     // Decode the baseline probabilities for decoding reference frame
-    cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8);
-    cm->prob_last_coded  = (vp9_prob)vp9_read_literal(bc, 8);
-    cm->prob_gf_coded    = (vp9_prob)vp9_read_literal(bc, 8);
+    cm->prob_intra_coded = vp9_read_prob(bc);
+    cm->prob_last_coded  = vp9_read_prob(bc);
+    cm->prob_gf_coded    = vp9_read_prob(bc);
 
     // Computes a modified set of probabilities for use when reference
     // frame prediction fails.
@@ -529,14 +529,14 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
     if (cm->comp_pred_mode == HYBRID_PREDICTION) {
       int i;
       for (i = 0; i < COMP_PRED_CONTEXTS; i++)
-        cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8);
+        cm->prob_comppred[i] = vp9_read_prob(bc);
     }
 
     if (vp9_read_bit(bc)) {
       int i = 0;
 
       do {
-        cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
+        cm->fc.ymode_prob[i] = vp9_read_prob(bc);
       } while (++i < VP9_YMODES - 1);
     }
 
@@ -544,7 +544,7 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
       int i = 0;
 
       do {
-        cm->fc.sb_ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
+        cm->fc.sb_ymode_prob[i] = vp9_read_prob(bc);
       } while (++i < VP9_I32X32_MODES - 1);
     }
 
@@ -1141,7 +1141,7 @@ void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc) {
   if (pbi->common.mb_no_coeff_skip) {
     int k;
     for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-      cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);
+      cm->mbskip_pred_probs[k] = vp9_read_prob(bc);
     }
   }
 
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index ba7570347..40e5b1451 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -44,7 +44,6 @@
 int dec_debug = 0;
 #endif
 
-
 static int read_le16(const uint8_t *p) {
   return (p[1] << 8) | p[0];
 }
@@ -1278,61 +1277,51 @@ static void update_frame_size(VP9D_COMP *pbi) {
   vp9_update_mode_info_in_image(cm, cm->mi);
 }
 
-static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd,
-                               BOOL_DECODER *header_bc) {
+static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {
   int i, j;
 
-  // Is segmentation enabled
-  xd->segmentation_enabled = vp9_read_bit(header_bc);
-
+  xd->segmentation_enabled = vp9_read_bit(r);
   if (xd->segmentation_enabled) {
     // Read whether or not the segmentation map is being explicitly updated
     // this frame.
-    xd->update_mb_segmentation_map = vp9_read_bit(header_bc);
+    xd->update_mb_segmentation_map = vp9_read_bit(r);
 
     // If so what method will be used.
     if (xd->update_mb_segmentation_map) {
       // Which macro block level features are enabled. Read the probs used to
       // decode the segment id for each macro block.
       for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-          xd->mb_segment_tree_probs[i] = vp9_read_bit(header_bc) ?
-              (vp9_prob)vp9_read_literal(header_bc, 8) : 255;
+        xd->mb_segment_tree_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r) : 255;
       }
 
       // Read the prediction probs needed to decode the segment id
-      pc->temporal_update = vp9_read_bit(header_bc);
+      pc->temporal_update = vp9_read_bit(r);
       for (i = 0; i < PREDICTION_PROBS; i++) {
-        if (pc->temporal_update) {
-          pc->segment_pred_probs[i] = vp9_read_bit(header_bc) ?
-              (vp9_prob)vp9_read_literal(header_bc, 8) : 255;
-        } else {
-          pc->segment_pred_probs[i] = 255;
-        }
+        pc->segment_pred_probs[i] = pc->temporal_update
+            ? (vp9_read_bit(r) ? vp9_read_prob(r) : 255)
+            : 255;
       }
 
       if (pc->temporal_update) {
-        int count[4];
         const vp9_prob *p = xd->mb_segment_tree_probs;
         vp9_prob *p_mod = xd->mb_segment_mispred_tree_probs;
-
-        count[0] =        p[0]  *        p[1];
-        count[1] =        p[0]  * (256 - p[1]);
-        count[2] = (256 - p[0]) *        p[2];
-        count[3] = (256 - p[0]) * (256 - p[2]);
-
-        p_mod[0] = get_binary_prob(count[1], count[2] + count[3]);
-        p_mod[1] = get_binary_prob(count[0], count[2] + count[3]);
-        p_mod[2] = get_binary_prob(count[0] + count[1], count[3]);
-        p_mod[3] = get_binary_prob(count[0] + count[1], count[2]);
+        const int c0 =        p[0]  *        p[1];
+        const int c1 =        p[0]  * (256 - p[1]);
+        const int c2 = (256 - p[0]) *        p[2];
+        const int c3 = (256 - p[0]) * (256 - p[2]);
+
+        p_mod[0] = get_binary_prob(c1, c2 + c3);
+        p_mod[1] = get_binary_prob(c0, c2 + c3);
+        p_mod[2] = get_binary_prob(c0 + c1, c3);
+        p_mod[3] = get_binary_prob(c0 + c1, c2);
       }
     }
-    // Is the segment data being updated
-    xd->update_mb_segmentation_data = vp9_read_bit(header_bc);
 
+    xd->update_mb_segmentation_data = vp9_read_bit(r);
     if (xd->update_mb_segmentation_data) {
       int data;
 
-      xd->mb_segment_abs_delta = vp9_read_bit(header_bc);
+      xd->mb_segment_abs_delta = vp9_read_bit(r);
 
       vp9_clearall_segfeatures(xd);
 
@@ -1341,16 +1330,15 @@ static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd,
         // For each of the segments features...
         for (j = 0; j < SEG_LVL_MAX; j++) {
           // Is the feature enabled
-          if (vp9_read_bit(header_bc)) {
+          if (vp9_read_bit(r)) {
             // Update the feature data and mask
             vp9_enable_segfeature(xd, i, j);
 
-            data = vp9_decode_unsigned_max(header_bc,
-                                           vp9_seg_feature_data_max(j));
+            data = vp9_decode_unsigned_max(r, vp9_seg_feature_data_max(j));
 
             // Is the segment data signed..
             if (vp9_is_segfeature_signed(j)) {
-              if (vp9_read_bit(header_bc))
+              if (vp9_read_bit(r))
                 data = -data;
             }
           } else {
@@ -1364,17 +1352,16 @@ static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd,
   }
 }
 
-static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd,
-                             BOOL_DECODER *header_bc) {
+static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {
   int i;
 
-  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(header_bc);
-  pc->filter_level = vp9_read_literal(header_bc, 6);
-  pc->sharpness_level = vp9_read_literal(header_bc, 3);
+  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(r);
+  pc->filter_level = vp9_read_literal(r, 6);
+  pc->sharpness_level = vp9_read_literal(r, 3);
 
 #if CONFIG_LOOP_DERING
-  if (vp9_read_bit(header_bc))
-    pc->dering_enabled = 1 + vp9_read_literal(header_bc, 4);
+  if (vp9_read_bit(r))
+    pc->dering_enabled = 1 + vp9_read_literal(r, 4);
   else
     pc->dering_enabled = 0;
 #endif
@@ -1382,31 +1369,31 @@ static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd,
   // Read in loop filter deltas applied at the MB level based on mode or ref
   // frame.
   xd->mode_ref_lf_delta_update = 0;
-  xd->mode_ref_lf_delta_enabled = vp9_read_bit(header_bc);
+  xd->mode_ref_lf_delta_enabled = vp9_read_bit(r);
 
   if (xd->mode_ref_lf_delta_enabled) {
     // Do the deltas need to be updated
-    xd->mode_ref_lf_delta_update = vp9_read_bit(header_bc);
+    xd->mode_ref_lf_delta_update = vp9_read_bit(r);
 
     if (xd->mode_ref_lf_delta_update) {
       // Send update
       for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        if (vp9_read_bit(header_bc)) {
-          // sign = vp9_read_bit( &header_bc );
-          xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(header_bc, 6);
+        if (vp9_read_bit(r)) {
+          // sign = vp9_read_bit(r);
+          xd->ref_lf_deltas[i] = vp9_read_literal(r, 6);
 
-          if (vp9_read_bit(header_bc))
+          if (vp9_read_bit(r))
             xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i];  // Apply sign
         }
       }
 
       // Send update
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        if (vp9_read_bit(header_bc)) {
-          // sign = vp9_read_bit( &header_bc );
-          xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(header_bc, 6);
+        if (vp9_read_bit(r)) {
+          // sign = vp9_read_bit(r);
+          xd->mode_lf_deltas[i] = vp9_read_literal(r, 6);
 
-          if (vp9_read_bit(header_bc))
+          if (vp9_read_bit(r))
             xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i];  // Apply sign
         }
       }
@@ -1414,6 +1401,124 @@ static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd,
   }
 }
 
+static const uint8_t *setup_frame_size(VP9D_COMP *pbi, int scaling_active,
+                                      const uint8_t *data,
+                                      const uint8_t *data_end) {
+  VP9_COMMON *const pc = &pbi->common;
+  const int width = pc->width;    // dimensions on entry, to detect a change
+  const int height = pc->height;
+  // Reads the optional display size, then the frame size, from |data|.
+  // If error concealment is enabled we should only parse the new size
+  // if we have enough data. Otherwise we will end up with the wrong size.
+  if (scaling_active && data + 4 < data_end) {
+    pc->display_width = read_le16(data + 0);
+    pc->display_height = read_le16(data + 2);
+    data += 4;
+  }
+
+  if (data + 4 < data_end) {
+    pc->width = read_le16(data + 0);
+    pc->height = read_le16(data + 2);
+    data += 4;
+  }
+
+  if (!scaling_active) {
+    pc->display_width = pc->width;
+    pc->display_height = pc->height;
+  }
+  // Validation and buffer setup only run when the dimensions changed.
+  if (width != pc->width || height != pc->height) {
+    if (pc->width <= 0) {
+      pc->width = width;  // restore the entry value before reporting
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Invalid frame width");
+    }
+
+    if (pc->height <= 0) {
+      pc->height = height;  // restore the entry value before reporting
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Invalid frame height");
+    }
+    // No initial size recorded yet: allocate buffers and remember it.
+    if (!pbi->initial_width || !pbi->initial_height) {
+      if (vp9_alloc_frame_buffers(pc, pc->width, pc->height))
+        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate frame buffers");
+      pbi->initial_width = pc->width;
+      pbi->initial_height = pc->height;
+    }
+    // Sizes larger than the recorded initial size are reported as corrupt.
+    if (pc->width > pbi->initial_width) {
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Frame width too large");
+    }
+
+    if (pc->height > pbi->initial_height) {
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Frame height too large");
+    }
+
+    update_frame_size(pbi);
+  }
+  // Hand back the read position, advanced past any fields consumed.
+  return data;
+}
+
+static void update_frame_context(VP9D_COMP *pbi, vp9_reader *r) {
+  FRAME_CONTEXT *const fc = &pbi->common.fc;
+  // Snapshot current probabilities into pre_* (assumes vp9_copy(dst, src)).
+  vp9_copy(fc->pre_coef_probs_4x4, fc->coef_probs_4x4);
+  vp9_copy(fc->pre_coef_probs_8x8, fc->coef_probs_8x8);
+  vp9_copy(fc->pre_coef_probs_16x16, fc->coef_probs_16x16);
+  vp9_copy(fc->pre_coef_probs_32x32, fc->coef_probs_32x32);
+  vp9_copy(fc->pre_ymode_prob, fc->ymode_prob);
+  vp9_copy(fc->pre_sb_ymode_prob, fc->sb_ymode_prob);
+  vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);
+  vp9_copy(fc->pre_bmode_prob, fc->bmode_prob);
+  vp9_copy(fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob);
+  vp9_copy(fc->pre_sub_mv_ref_prob, fc->sub_mv_ref_prob);
+  vp9_copy(fc->pre_mbsplit_prob, fc->mbsplit_prob);
+  fc->pre_nmvc = fc->nmvc;
+  // Clear the counts (assumes vp9_zero zero-fills its argument).
+  vp9_zero(fc->coef_counts_4x4);
+  vp9_zero(fc->coef_counts_8x8);
+  vp9_zero(fc->coef_counts_16x16);
+  vp9_zero(fc->coef_counts_32x32);
+  vp9_zero(fc->eob_branch_counts);
+  vp9_zero(fc->ymode_counts);
+  vp9_zero(fc->sb_ymode_counts);
+  vp9_zero(fc->uv_mode_counts);
+  vp9_zero(fc->bmode_counts);
+  vp9_zero(fc->i8x8_mode_counts);
+  vp9_zero(fc->sub_mv_ref_counts);
+  vp9_zero(fc->mbsplit_counts);
+  vp9_zero(fc->NMVcount);
+  vp9_zero(fc->mv_ref_ct);
+
+#if CONFIG_COMP_INTERINTRA_PRED
+  fc->pre_interintra_prob = fc->interintra_prob;
+  vp9_zero(fc->interintra_counts);
+#endif
+
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_copy(fc->pre_nzc_probs_4x4, fc->nzc_probs_4x4);
+  vp9_copy(fc->pre_nzc_probs_8x8, fc->nzc_probs_8x8);
+  vp9_copy(fc->pre_nzc_probs_16x16, fc->nzc_probs_16x16);
+  vp9_copy(fc->pre_nzc_probs_32x32, fc->nzc_probs_32x32);
+  vp9_copy(fc->pre_nzc_pcat_probs, fc->nzc_pcat_probs);
+
+  vp9_zero(fc->nzc_counts_4x4);
+  vp9_zero(fc->nzc_counts_8x8);
+  vp9_zero(fc->nzc_counts_16x16);
+  vp9_zero(fc->nzc_counts_32x32);
+  vp9_zero(fc->nzc_pcat_counts);
+#endif
+  // Then presumably read probability updates from |r| (helpers not shown).
+  read_coef_probs(pbi, r);
+#if CONFIG_CODE_NONZEROCOUNT
+  read_nzc_probs(&pbi->common, r);
+#endif
+}
 
 int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   BOOL_DECODER header_bc, residual_bc;
@@ -1425,8 +1530,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   int mb_row, i, corrupt_tokens = 0;
 
   // printf("Decoding frame %d\n", pc->current_video_frame);
-  /* start with no corruption of current frame */
-  xd->corrupted = 0;
+
+  xd->corrupted = 0;  // start with no corruption of current frame
   pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
 
   if (data_end - data < 3) {
@@ -1449,10 +1554,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
     vp9_setup_version(pc);
 
     if (pc->frame_type == KEY_FRAME) {
-      /* vet via sync code */
-      /* When error concealment is enabled we should only check the sync
-       * code if we have enough bits available
-       */
+      // When error concealment is enabled we should only check the sync
+      // code if we have enough bits available
       if (data + 3 < data_end) {
         if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
           vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
@@ -1460,63 +1563,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
       }
       data += 3;
     }
-    {
-      const int width = pc->width;
-      const int height = pc->height;
-
-      /* If error concealment is enabled we should only parse the new size
-       * if we have enough data. Otherwise we will end up with the wrong
-       * size.
-       */
-      if (scaling_active && data + 4 < data_end) {
-        pc->display_width = read_le16(data + 0);
-        pc->display_height = read_le16(data + 2);
-        data += 4;
-      }
-      if (data + 4 < data_end) {
-        pc->width = read_le16(data + 0);
-        pc->height = read_le16(data + 2);
-        data += 4;
-      }
-      if (!scaling_active) {
-        pc->display_width = pc->width;
-        pc->display_height = pc->height;
-      }
-
-      if (width != pc->width || height != pc->height) {
-        if (pc->width <= 0) {
-          pc->width = width;
-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                             "Invalid frame width");
-        }
-
-        if (pc->height <= 0) {
-          pc->height = height;
-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                             "Invalid frame height");
-        }
-
-        if (!pbi->initial_width || !pbi->initial_height) {
-          if (vp9_alloc_frame_buffers(pc, pc->width, pc->height))
-            vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                               "Failed to allocate frame buffers");
-          pbi->initial_width = pc->width;
-          pbi->initial_height = pc->height;
-        }
-
-        if (pc->width > pbi->initial_width) {
-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                             "Frame width too large");
-        }
 
-        if (pc->height > pbi->initial_height) {
-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                             "Frame height too large");
-        }
-
-        update_frame_size(pbi);
-      }
-    }
+    data = setup_frame_size(pbi, scaling_active, data, data_end);
   }
 
   if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
@@ -1526,7 +1574,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   init_frame(pbi);
 
-  /* Reset the frame pointers to the current frame size */
+  // Reset the frame pointers to the current frame size
   vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx],
                                 pc->width, pc->height,
                                 VP9BORDERINPIXELS);
@@ -1535,9 +1583,9 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
                        (unsigned int)first_partition_length_in_bytes))
     vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
-  pc->clr_type    = (YUV_TYPE)vp9_read_bit(&header_bc);
-  pc->clamp_type  = (CLAMP_TYPE)vp9_read_bit(&header_bc);
 
+  pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc);
+  pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc);
   pc->error_resilient_mode = vp9_read_bit(&header_bc);
 
   setup_segmentation(pc, xd, &header_bc);
@@ -1552,25 +1600,25 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   } else {
     for (i = 0; i < PREDICTION_PROBS; i++) {
       if (vp9_read_bit(&header_bc))
-        pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8);
+        pc->ref_pred_probs[i] = vp9_read_prob(&header_bc);
     }
   }
 
-  pc->sb64_coded = vp9_read_literal(&header_bc, 8);
-  pc->sb32_coded = vp9_read_literal(&header_bc, 8);
+  pc->sb64_coded = vp9_read_prob(&header_bc);
+  pc->sb32_coded = vp9_read_prob(&header_bc);
   xd->lossless = vp9_read_bit(&header_bc);
   if (xd->lossless) {
     pc->txfm_mode = ONLY_4X4;
   } else {
     // Read the loop filter level and type
     pc->txfm_mode = vp9_read_literal(&header_bc, 2);
-    if (pc->txfm_mode == 3)
+    if (pc->txfm_mode == ALLOW_32X32)
       pc->txfm_mode += vp9_read_bit(&header_bc);
 
     if (pc->txfm_mode == TX_MODE_SELECT) {
-      pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
-      pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
-      pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);
+      pc->prob_tx[0] = vp9_read_prob(&header_bc);
+      pc->prob_tx[1] = vp9_read_prob(&header_bc);
+      pc->prob_tx[2] = vp9_read_prob(&header_bc);
     }
   }
 
@@ -1596,22 +1644,20 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
     mb_init_dequantizer(pbi, &pbi->mb);
   }
 
-  /* Determine if the golden frame or ARF buffer should be updated and how.
-   * For all non key frames the GF and ARF refresh flags and sign bias
-   * flags must be set explicitly.
-   */
+  // Determine if the golden frame or ARF buffer should be updated and how.
+  // For all non key frames the GF and ARF refresh flags and sign bias
+  // flags must be set explicitly.
   if (pc->frame_type == KEY_FRAME) {
     pc->active_ref_idx[0] = pc->new_fb_idx;
     pc->active_ref_idx[1] = pc->new_fb_idx;
     pc->active_ref_idx[2] = pc->new_fb_idx;
   } else {
-    /* Should the GF or ARF be updated from the current frame */
+    // Should the GF or ARF be updated from the current frame
     pbi->refresh_frame_flags = vp9_read_literal(&header_bc, NUM_REF_FRAMES);
 
-    /* Select active reference frames */
+    // Select active reference frames
     for (i = 0; i < 3; i++) {
       int ref_frame_num = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2);
-
       pc->active_ref_idx[i] = pc->ref_frame_map[ref_frame_num];
     }
 
@@ -1619,16 +1665,17 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
     pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
 
     // Is high precision mv allowed
-    xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);
+    xd->allow_high_precision_mv = vp9_read_bit(&header_bc);
 
     // Read the type of subpel filter to use
-    pc->mcomp_filter_type = vp9_read_bit(&header_bc) ? SWITCHABLE :
-                            vp9_read_literal(&header_bc, 2);
+    pc->mcomp_filter_type = vp9_read_bit(&header_bc)
+                                ? SWITCHABLE
+                                : vp9_read_literal(&header_bc, 2);
 
 #if CONFIG_COMP_INTERINTRA_PRED
     pc->use_interintra = vp9_read_bit(&header_bc);
 #endif
-    /* To enable choice of different interploation filters */
+    // To enable choice of different interpolation filters
     vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
   }
 
@@ -1649,8 +1696,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
     for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
       for (j = 0; j < 4; j++) {
         if (vp9_read(&header_bc, 252)) {
-          pc->fc.vp9_mode_contexts[i][j] =
-            (vp9_prob)vp9_read_literal(&header_bc, 8);
+          pc->fc.vp9_mode_contexts[i][j] = vp9_read_prob(&header_bc);
         }
       }
     }
@@ -1675,8 +1721,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
       // Read any updates to probabilities
       for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {
         if (vp9_read(&header_bc, VP9_MVREF_UPDATE_PROB)) {
-          xd->mb_mv_ref_probs[i][j] =
-            (vp9_prob)vp9_read_literal(&header_bc, 8);
+          xd->mb_mv_ref_probs[i][j] = vp9_read_prob(&header_bc);
         }
       }
     }
@@ -1693,69 +1738,9 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
     fclose(z);
   }
 
-  vp9_copy(pbi->common.fc.pre_coef_probs_4x4,
-           pbi->common.fc.coef_probs_4x4);
-  vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
-           pbi->common.fc.coef_probs_8x8);
-  vp9_copy(pbi->common.fc.pre_coef_probs_16x16,
-           pbi->common.fc.coef_probs_16x16);
-  vp9_copy(pbi->common.fc.pre_coef_probs_32x32,
-           pbi->common.fc.coef_probs_32x32);
-  vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
-  vp9_copy(pbi->common.fc.pre_sb_ymode_prob, pbi->common.fc.sb_ymode_prob);
-  vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
-  vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);
-  vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);
-  vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob);
-  vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob);
-#if CONFIG_COMP_INTERINTRA_PRED
-  pbi->common.fc.pre_interintra_prob = pbi->common.fc.interintra_prob;
-#endif
-  pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(pbi->common.fc.pre_nzc_probs_4x4,
-           pbi->common.fc.nzc_probs_4x4);
-  vp9_copy(pbi->common.fc.pre_nzc_probs_8x8,
-           pbi->common.fc.nzc_probs_8x8);
-  vp9_copy(pbi->common.fc.pre_nzc_probs_16x16,
-           pbi->common.fc.nzc_probs_16x16);
-  vp9_copy(pbi->common.fc.pre_nzc_probs_32x32,
-           pbi->common.fc.nzc_probs_32x32);
-  vp9_copy(pbi->common.fc.pre_nzc_pcat_probs,
-           pbi->common.fc.nzc_pcat_probs);
-#endif
-
-  vp9_zero(pbi->common.fc.coef_counts_4x4);
-  vp9_zero(pbi->common.fc.coef_counts_8x8);
-  vp9_zero(pbi->common.fc.coef_counts_16x16);
-  vp9_zero(pbi->common.fc.coef_counts_32x32);
-  vp9_zero(pbi->common.fc.eob_branch_counts);
-  vp9_zero(pbi->common.fc.ymode_counts);
-  vp9_zero(pbi->common.fc.sb_ymode_counts);
-  vp9_zero(pbi->common.fc.uv_mode_counts);
-  vp9_zero(pbi->common.fc.bmode_counts);
-  vp9_zero(pbi->common.fc.i8x8_mode_counts);
-  vp9_zero(pbi->common.fc.sub_mv_ref_counts);
-  vp9_zero(pbi->common.fc.mbsplit_counts);
-  vp9_zero(pbi->common.fc.NMVcount);
-  vp9_zero(pbi->common.fc.mv_ref_ct);
-#if CONFIG_COMP_INTERINTRA_PRED
-  vp9_zero(pbi->common.fc.interintra_counts);
-#endif
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_zero(pbi->common.fc.nzc_counts_4x4);
-  vp9_zero(pbi->common.fc.nzc_counts_8x8);
-  vp9_zero(pbi->common.fc.nzc_counts_16x16);
-  vp9_zero(pbi->common.fc.nzc_counts_32x32);
-  vp9_zero(pbi->common.fc.nzc_pcat_counts);
-#endif
-
-  read_coef_probs(pbi, &header_bc);
-#if CONFIG_CODE_NONZEROCOUNT
-  read_nzc_probs(&pbi->common, &header_bc);
-#endif
+  update_frame_context(pbi, &header_bc);
 
-  /* Initialize xd pointers. Any reference should do for xd->pre, so use 0. */
+  // Initialize xd pointers. Any reference should do for xd->pre, so use 0.
   vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]],
              sizeof(YV12_BUFFER_CONFIG));
   vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx],
diff --git a/vp9/decoder/vp9_treereader.h b/vp9/decoder/vp9_treereader.h
index 305dfe51f..4ec6de99d 100644
--- a/vp9/decoder/vp9_treereader.h
+++ b/vp9/decoder/vp9_treereader.h
@@ -19,10 +19,10 @@ typedef BOOL_DECODER vp9_reader;
 
 #define vp9_read decode_bool
 #define vp9_read_literal decode_value
-#define vp9_read_bit(R) vp9_read(R, vp9_prob_half)
-
-/* Intent of tree data structure is to make decoding trivial. */
+#define vp9_read_bit(r) vp9_read(r, vp9_prob_half)
+#define vp9_read_prob(r) ((vp9_prob)vp9_read_literal(r, 8))
 
+// Intent of tree data structure is to make decoding trivial.
 static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
                       vp9_tree t,
                       const vp9_prob *const p) {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 82bd70bf8..d26f5ec46 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -89,45 +89,31 @@ static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3,
 // tables if and when things settle down in the experimental bitstream
 double vp9_convert_qindex_to_q(int qindex) {
   // Convert the index to a real Q value (scaled down to match old Q values)
-  return (double)vp9_ac_yquant(qindex) / 4.0;
+  return vp9_ac_yquant(qindex) / 4.0;
 }
 
 int vp9_gfboost_qadjust(int qindex) {
-  int retval;
-  double q;
-
-  q = vp9_convert_qindex_to_q(qindex);
-  retval = (int)((0.00000828 * q * q * q) +
-                 (-0.0055 * q * q) +
-                 (1.32 * q) + 79.3);
-  return retval;
+  const double q = vp9_convert_qindex_to_q(qindex);
+  return (int)((0.00000828 * q * q * q) +
+               (-0.0055 * q * q) +
+               (1.32 * q) + 79.3);
 }
 
 static int kfboost_qadjust(int qindex) {
-  int retval;
-  double q;
-
-  q = vp9_convert_qindex_to_q(qindex);
-  retval = (int)((0.00000973 * q * q * q) +
-                 (-0.00613 * q * q) +
-                 (1.316 * q) + 121.2);
-  return retval;
+  const double q = vp9_convert_qindex_to_q(qindex);
+  return (int)((0.00000973 * q * q * q) +
+               (-0.00613 * q * q) +
+               (1.316 * q) + 121.2);
 }
 
 int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
                     double correction_factor) {
-  int enumerator;
-  double q = vp9_convert_qindex_to_q(qindex);
 
-  if (frame_type == KEY_FRAME) {
-    enumerator = 4000000;
-  } else {
-    enumerator = 2500000;
-  }
+  const double q = vp9_convert_qindex_to_q(qindex);
+  int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000;
 
-  // Q based adjustment to baseline enumberator
+  // q based adjustment to baseline enumerator
   enumerator += (int)(enumerator * q) >> 12;
-
   return (int)(0.5 + (enumerator * correction_factor / q));
 }
 
@@ -265,33 +251,30 @@ void vp9_setup_key_frame(VP9_COMP *cpi) {
   // interval before next GF
   cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
   /* All buffers are implicitly updated on key frames. */
-  cpi->refresh_golden_frame = TRUE;
-  cpi->refresh_alt_ref_frame = TRUE;
+  cpi->refresh_golden_frame = 1;
+  cpi->refresh_alt_ref_frame = 1;
 }
 
 void vp9_setup_inter_frame(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  if (cm->error_resilient_mode) {
+  if (cm->error_resilient_mode)
     vp9_setup_past_independence(cm, xd);
-  }
+
   assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS);
   vpx_memcpy(&cm->fc, &cm->frame_contexts[cm->frame_context_idx],
              sizeof(cm->fc));
 }
 
-static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
+static int estimate_bits_at_q(int frame_kind, int q, int mbs,
                               double correction_factor) {
-  int Bpm = (int)(vp9_bits_per_mb(frame_kind, Q, correction_factor));
+  const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor));
 
-  /* Attempt to retain reasonable accuracy without overflow. The cutoff is
-   * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
-   * largest Bpm takes 20 bits.
-   */
-  if (MBs > (1 << 11))
-    return (Bpm >> BPER_MB_NORMBITS) * MBs;
-  else
-    return (Bpm * MBs) >> BPER_MB_NORMBITS;
+  // Attempt to retain reasonable accuracy without overflow. The cutoff is
+  // chosen such that the maximum product of bpm and mbs fits 31 bits. The
+  // largest bpm takes 20 bits.
+  return (mbs > (1 << 11)) ? (bpm >> BPER_MB_NORMBITS) * mbs
+                           : (bpm * mbs) >> BPER_MB_NORMBITS;
 }
 
 
@@ -314,7 +297,6 @@ static void calc_iframe_target_size(VP9_COMP *cpi) {
   }
 
   cpi->this_frame_target = target;
-
 }
 
 
@@ -330,25 +312,15 @@ static void calc_gf_params(VP9_COMP *cpi) {
 
 
 static void calc_pframe_target_size(VP9_COMP *cpi) {
-  int min_frame_target;
-
-  min_frame_target = 0;
-
-  min_frame_target = cpi->min_frame_bandwidth;
-
-  if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
-    min_frame_target = cpi->av_per_frame_bandwidth >> 5;
-
-
-  // Special alt reference frame case
+  const int min_frame_target = MAX(cpi->min_frame_bandwidth,
+                                   cpi->av_per_frame_bandwidth >> 5);
   if (cpi->refresh_alt_ref_frame) {
+    // Special alt reference frame case
     // Per frame bit target for the alt ref frame
     cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
     cpi->this_frame_target = cpi->per_frame_bandwidth;
-  }
-
-  // Normal frames (gf,and inter)
-  else {
+  } else {
+    // Normal frames (gf and inter)
     cpi->this_frame_target = cpi->per_frame_bandwidth;
   }
 
@@ -366,10 +338,10 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
 
   // Adjust target frame size for Golden Frames:
   if (cpi->frames_till_gf_update_due == 0) {
-    // int Boost = 0;
-    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+    const int q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME]
+                                          : cpi->oxcf.fixed_q;
 
-    cpi->refresh_golden_frame = TRUE;
+    cpi->refresh_golden_frame = 1;
 
     calc_gf_params(cpi);
 
@@ -381,17 +353,17 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
         // The spend on the GF is defined in the two pass code
         // for two pass encodes
         cpi->this_frame_target = cpi->per_frame_bandwidth;
-      } else
+      } else {
         cpi->this_frame_target =
-          (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
+          (estimate_bits_at_q(1, q, cpi->common.MBs, 1.0)
            * cpi->last_boost) / 100;
+      }
 
-    }
-    // If there is an active ARF at this location use the minimum
-    // bits on this frame even if it is a contructed arf.
-    // The active maximum quantizer insures that an appropriate
-    // number of bits will be spent if needed for contstructed ARFs.
-    else {
+    } else {
+      // If there is an active ARF at this location use the minimum
+      // bits on this frame even if it is a constructed ARF.
+      // The active maximum quantizer ensures that an appropriate
+      // number of bits will be spent if needed for constructed ARFs.
       cpi->this_frame_target = 0;
     }
 
@@ -401,12 +373,12 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
 
 
 void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
-  int    Q = cpi->common.base_qindex;
-  int    correction_factor = 100;
+  const int q = cpi->common.base_qindex;
+  int correction_factor = 100;
   double rate_correction_factor;
   double adjustment_limit;
 
-  int    projected_size_based_on_q = 0;
+  int projected_size_based_on_q = 0;
 
   // Clear down mmx registers to allow floating point in what follows
   vp9_clear_system_state();  // __asm emms;
@@ -423,9 +395,9 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
   // Stay in double to avoid int overflow when values are large
-  projected_size_based_on_q =
-    estimate_bits_at_q(cpi->common.frame_type, Q,
-                       cpi->common.MBs, rate_correction_factor);
+  projected_size_based_on_q = estimate_bits_at_q(cpi->common.frame_type, q,
+                                                 cpi->common.MBs,
+                                                 rate_correction_factor);
 
   // Work out a size correction factor.
   // if ( cpi->this_frame_target > 0 )
@@ -480,7 +452,7 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
 
 
 int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
-  int Q = cpi->active_worst_quality;
+  int q = cpi->active_worst_quality;
 
   int i;
   int last_error = INT_MAX;
@@ -507,21 +479,22 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
   i = cpi->active_best_quality;
 
   do {
-    bits_per_mb_at_this_q =
-      (int)(vp9_bits_per_mb(cpi->common.frame_type, i, correction_factor));
+    bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i,
+                                                 correction_factor);
 
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
       if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
-        Q = i;
+        q = i;
       else
-        Q = i - 1;
+        q = i - 1;
 
       break;
-    } else
+    } else {
       last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+    }
   } while (++i <= cpi->active_worst_quality);
 
-  return Q;
+  return q;
 }
 
 
@@ -566,7 +539,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
       total_weight += prior_key_frame_weight[i];
     }
 
-    av_key_frame_frequency  /= total_weight;
+    av_key_frame_frequency /= total_weight;
 
   }
   return av_key_frame_frequency;
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 19255bbf9..56453e249 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -37,6 +37,7 @@ struct vp8_extracfg {
   unsigned int                cq_level;         /* constrained quality level */
   unsigned int                rc_max_intra_bitrate_pct;
   unsigned int                lossless;
+  unsigned int                frame_parallel_decoding_mode;
 };
 
 struct extraconfig_map {
@@ -64,6 +65,7 @@ static const struct extraconfig_map extracfg_map[] = {
       10,                         /* cq_level */
       0,                          /* rc_max_intra_bitrate_pct */
       0,                          /* lossless */
+      0,                          /* frame_parallel_decoding_mode */
     }
   }
 };
@@ -313,7 +315,7 @@ static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
   oxcf->lossless = vp8_cfg.lossless;
 
   oxcf->error_resilient_mode = cfg.g_error_resilient;
-  oxcf->frame_parallel_decoding_mode = cfg.g_frame_parallel_decoding;
+  oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;
   /*
   printf("Current VP9 Settings: \n");
   printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
@@ -423,6 +425,7 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
       MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);
       MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
       MAP(VP9E_SET_LOSSLESS,              xcfg.lossless);
+      MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode);
   }
 
   res = validate_config(ctx, &ctx->cfg, &xcfg);
@@ -1096,7 +1099,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
       {1, 30},            /* g_timebase */
 
       0,                  /* g_error_resilient */
-      0,                  /* g_frame_parallel_decoding */
 
       VPX_RC_ONE_PASS,    /* g_pass */
 
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 66c89b5a9..d0c23f07a 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -64,6 +64,7 @@ struct vpx_codec_alg_priv {
   vpx_image_t             img;
   int                     img_setup;
   int                     img_avail;
+  int                     invert_tile_order;
 };
 
 static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,
@@ -333,7 +334,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,
       oxcf.Version = 9;
       oxcf.postprocess = 0;
       oxcf.max_threads = ctx->cfg.threads;
-      oxcf.inv_tile_order = ctx->cfg.inv_tile_order;
+      oxcf.inv_tile_order = ctx->invert_tile_order;
       optr = vp9_create_decompressor(&oxcf);
 
       /* If postprocessing was enabled by the application and a
@@ -726,6 +727,13 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
 
 }
 
+/* VP9_INVERT_TILE_DECODE_ORDER handler: stores the int control argument. */
+static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
+                                             int ctr_id, va_list args) {
+  ctx->invert_tile_order = va_arg(args, int);
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
   {VP8_SET_REFERENCE,             vp9_set_reference},
   {VP8_COPY_REFERENCE,            vp9_copy_reference},
@@ -737,6 +745,7 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
   {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
   {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
   {VP9_GET_REFERENCE,             get_reference},
+  {VP9_INVERT_TILE_DECODE_ORDER,  set_invert_tile_order},
   { -1, NULL},
 };
 
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 0b910b99d..7f19dd033 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -190,6 +190,7 @@ enum vp8e_enc_control_id {
   VP9E_SET_LOSSLESS,
   VP9E_SET_TILE_COLUMNS,
   VP9E_SET_TILE_ROWS,
+  VP9E_SET_FRAME_PARALLEL_DECODING
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -310,6 +311,7 @@ VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
 
 VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
 
+VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
 /*! @} - end defgroup vp8_encoder */
 #include "vpx_codec_impl_bottom.h"
 #endif
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index e2ec8b213..201df88fe 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -63,6 +63,15 @@ enum vp8_dec_control_id {
    */
   VP8D_GET_LAST_REF_USED,
 
+  /** Decryption key used to protect the encoded data buffer before decoding.
+   *  Takes a pointer to a 32-byte array; the array is copied, so the caller
+   *  does not need to preserve it after the call.
+   */
+  VP8_SET_DECRYPT_KEY,
+
+  /** Invert the tile decoding order. For testing. */
+  VP9_INVERT_TILE_DECODE_ORDER,
+
   VP8_DECODER_CTRL_ID_MAX
 };
 
@@ -78,6 +87,8 @@ enum vp8_dec_control_id {
 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES,   int *)
 VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED,    int *)
 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED,      int *)
+VPX_CTRL_USE_TYPE(VP8_SET_DECRYPT_KEY,         const unsigned char *)
+VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 
 /*! @} - end defgroup vp8_decoder */
 
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index ddbc0a6d6..e7701e512 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -106,7 +106,6 @@ extern "C" {
     unsigned int threads; /**< Maximum number of threads to use, default 1 */
     unsigned int w;      /**< Width */
     unsigned int h;      /**< Height */
-    int inv_tile_order;  /**< Invert tile decoding order, default 0 */
   } vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */
 
 
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 2ec09bdd4..ffdbc0644 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -334,12 +334,6 @@ extern "C" {
      */
     vpx_codec_er_flags_t   g_error_resilient;
 
-    /*!\brief Enable frame parallel decoding mode
-     * This value should be 1 to encode in a way that enables frame parallel
-     * decoding. Otherwise make it 0.
-     */
-    unsigned int           g_frame_parallel_decoding;
-
 
     /*!\brief Multi-pass Encoding Mode
      *
diff --git a/vpxdec.c b/vpxdec.c
index 287e796ae..41c654fae 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -1024,7 +1024,7 @@ int main(int argc, const char **argv_) {
 
     if (!noblit) {
       if (do_scale) {
-        if (frame_out == 1) {
+        if (img && frame_out == 1) {
           stream_w = img->d_w;
           stream_h = img->d_h;
           scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420,
diff --git a/vpxenc.c b/vpxenc.c
index 87205e636..3295fd9a4 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -89,8 +89,8 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
 
 static const char *exec_name;
 
-#define VP8_FOURCC (0x00385056)
-#define VP9_FOURCC (0x00395056)
+#define VP8_FOURCC (0x30385056)
+#define VP9_FOURCC (0x30395056)
 static const struct codec_item {
   char const              *name;
   const vpx_codec_iface_t *(*iface)(void);
@@ -1023,10 +1023,6 @@ static const arg_def_t timebase         = ARG_DEF(NULL, "timebase", 1,
                                                   "Output timestamp precision (fractional seconds)");
 static const arg_def_t error_resilient  = ARG_DEF(NULL, "error-resilient", 1,
                                                   "Enable error resiliency features");
-#if CONFIG_VP9_ENCODER
-static const arg_def_t frame_parallel_decoding  = ARG_DEF(
-    NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
-#endif
 static const arg_def_t lag_in_frames    = ARG_DEF(NULL, "lag-in-frames", 1,
                                                   "Max number of frames to lag");
 
@@ -1034,9 +1030,6 @@ static const arg_def_t *global_args[] = {
   &use_yv12, &use_i420, &usage, &threads, &profile,
   &width, &height, &stereo_mode, &timebase, &framerate,
   &error_resilient,
-#if CONFIG_VP9_ENCODER
-  &frame_parallel_decoding,
-#endif
   &lag_in_frames, NULL
 };
 
@@ -1136,6 +1129,10 @@ static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1,
 static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1,
                                                     "Max I-frame bitrate (pct)");
 static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
+#if CONFIG_VP9_ENCODER
+static const arg_def_t frame_parallel_decoding  = ARG_DEF(
+    NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
+#endif
 
 #if CONFIG_VP8_ENCODER
 static const arg_def_t *vp8_args[] = {
@@ -1159,6 +1156,7 @@ static const arg_def_t *vp9_args[] = {
   &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
   &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
   &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,
+  &frame_parallel_decoding,
   NULL
 };
 static const int vp9_arg_ctrl_map[] = {
@@ -1167,7 +1165,7 @@ static const int vp9_arg_ctrl_map[] = {
   VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
   VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
   VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
-  VP9E_SET_LOSSLESS,
+  VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING,
   0
 };
 #endif
@@ -1936,10 +1934,6 @@ static int parse_stream_params(struct global_config *global,
       validate_positive_rational(arg.name, &config->cfg.g_timebase);
     } else if (arg_match(&arg, &error_resilient, argi))
       config->cfg.g_error_resilient = arg_parse_uint(&arg);
-#if CONFIG_VP9_ENCODER
-    else if (arg_match(&arg, &frame_parallel_decoding, argi))
-      config->cfg.g_frame_parallel_decoding = arg_parse_uint(&arg);
-#endif
     else if (arg_match(&arg, &lag_in_frames, argi))
       config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
     else if (arg_match(&arg, &dropframe_thresh, argi))
@@ -2124,9 +2118,6 @@ static void show_stream_config(struct stream_state  *stream,
   SHOW(g_timebase.num);
   SHOW(g_timebase.den);
   SHOW(g_error_resilient);
-#if CONFIG_VP9_ENCODER
-  SHOW(g_frame_parallel_decoding);
-#endif
   SHOW(g_pass);
   SHOW(g_lag_in_frames);
   SHOW(rc_dropframe_thresh);
@@ -2560,7 +2551,7 @@ int main(int argc, const char **argv_) {
     usage_exit();
 
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
-    int frames_in = 0;
+    int frames_in = 0, seen_frames = 0;
     int64_t estimated_time_left = -1;
     int64_t average_rate = -1;
     off_t lagged_count = 0;
@@ -2640,9 +2631,11 @@ int main(int argc, const char **argv_) {
 
         if (frame_avail)
           frames_in++;
+        seen_frames = frames_in > global.skip_frames ?
+                          frames_in - global.skip_frames : 0;
 
         if (!global.quiet) {
-          float fps = usec_to_fps(cx_time, frames_in);
+          float fps = usec_to_fps(cx_time, seen_frames);
           fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes);
 
           if (stream_cnt == 1)
@@ -2678,16 +2671,17 @@ int main(int argc, const char **argv_) {
         FOREACH_STREAM(get_cx_data(stream, &global, &got_data));
 
         if (!got_data && input.length && !streams->frames_out) {
-          lagged_count = global.limit ? frames_in : ftello(input.file);
+          lagged_count = global.limit ? seen_frames : ftello(input.file);
         } else if (input.length) {
           int64_t remaining;
           int64_t rate;
 
           if (global.limit) {
-            int frame_in_lagged = (frames_in - lagged_count) * 1000;
+            int frame_in_lagged = (seen_frames - lagged_count) * 1000;
 
             rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0;
-            remaining = 1000 * (global.limit - frames_in + lagged_count);
+            remaining = 1000 * (global.limit - global.skip_frames
+                                - seen_frames + lagged_count);
           } else {
             off_t input_pos = ftello(input.file);
             off_t input_pos_lagged = input_pos - lagged_count;
@@ -2719,14 +2713,14 @@ int main(int argc, const char **argv_) {
                        "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7lub/f %7"PRId64"b/s"
                        " %7"PRId64" %s (%.2f fps)\033[K\n", pass + 1,
                        global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes,
-                       frames_in ? (unsigned long)(stream->nbytes * 8 / frames_in) : 0,
-                       frames_in ? (int64_t)stream->nbytes * 8
+                       seen_frames ? (unsigned long)(stream->nbytes * 8 / seen_frames) : 0,
+                       seen_frames ? (int64_t)stream->nbytes * 8
                        * (int64_t)global.framerate.num / global.framerate.den
-                       / frames_in
+                       / seen_frames
                        : 0,
                        stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time,
                        stream->cx_time > 9999999 ? "ms" : "us",
-                       usec_to_fps(stream->cx_time, frames_in));
+                       usec_to_fps(stream->cx_time, seen_frames));
                     );
 
     if (global.show_psnr)